git.saurik.com Git - apple/icu.git/blob - icuSources/common/normlzr.cpp

2 // License & terms of use: http://www.unicode.org/copyright.html

3 /*

4 *************************************************************************

5 * COPYRIGHT:

8 *************************************************************************

9 */

11 #include "unicode/utypes.h"

13 #if !UCONFIG_NO_NORMALIZATION

15 #include "unicode/uniset.h"

16 #include "unicode/unistr.h"

17 #include "unicode/chariter.h"

18 #include "unicode/schriter.h"

19 #include "unicode/uchriter.h"

20 #include "unicode/normlzr.h"

21 #include "unicode/utf16.h"

22 #include "cmemory.h"

23 #include "normalizer2impl.h"

24 #include "uprops.h" // for uniset_getUnicode32Instance()

26 #if defined(move32)

27 // System can define move32 intrinsics, but the char iters define move32 method

28 // using same undef trick in headers, so undef here to re-enable the method.

29 #undef move32

30 #endif

32 U_NAMESPACE_BEGIN

34 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)

36 //-------------------------------------------------------------------------

37 // Constructors and other boilerplate

38 //-------------------------------------------------------------------------

 /**
  * Creates a Normalizer that iterates over the contents of a UnicodeString.
  *
  * The input is wrapped in a newly allocated StringCharacterIterator that
  * this object stores in "text" (and deletes in its destructor). Options
  * start at 0, both input indexes start at 0, and the output buffer is empty.
  *
  * @param str  The string to be normalized.
  * @param mode The normalization mode; init() selects the matching
  *             Normalizer2 instance.
  */
 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode)
     : UObject(),
       fFilteredNorm2(NULL),
       fNorm2(NULL),
       fUMode(mode),
       fOptions(0),
       text(new StringCharacterIterator(str)),
       currentIndex(0),
       nextIndex(0),
       buffer(),
       bufferPos(0)
 {
     // Resolve fNorm2 from fUMode/fOptions.
     init();
 }

 /**
  * Creates a Normalizer that iterates over a UTF-16 array.
  *
  * The array is wrapped in a newly allocated UCharCharacterIterator that
  * this object stores in "text" (and deletes in its destructor). Options
  * start at 0, both input indexes start at 0, and the output buffer is empty.
  *
  * @param str    The UTF-16 text to be normalized.
  * @param length The length value handed to the UCharCharacterIterator.
  * @param mode   The normalization mode; init() selects the matching
  *               Normalizer2 instance.
  */
 Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode)
     : UObject(),
       fFilteredNorm2(NULL),
       fNorm2(NULL),
       fUMode(mode),
       fOptions(0),
       text(new UCharCharacterIterator(str, length)),
       currentIndex(0),
       nextIndex(0),
       buffer(),
       bufferPos(0)
 {
     // Resolve fNorm2 from fUMode/fOptions.
     init();
 }

 /**
  * Creates a Normalizer that iterates over the text of a CharacterIterator.
  *
  * The iterator is cloned; the clone is stored in "text" (and deleted in the
  * destructor), so the caller's iterator is not retained. Options start at
  * 0, both input indexes start at 0, and the output buffer is empty.
  *
  * @param iter The iterator whose clone supplies the input text.
  * @param mode The normalization mode; init() selects the matching
  *             Normalizer2 instance.
  */
 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode)
     : UObject(),
       fFilteredNorm2(NULL),
       fNorm2(NULL),
       fUMode(mode),
       fOptions(0),
       text(iter.clone()),
       currentIndex(0),
       nextIndex(0),
       buffer(),
       bufferPos(0)
 {
     // Resolve fNorm2 from fUMode/fOptions.
     init();
 }

 /**
  * Copy constructor.
  *
  * Copies the mode, options, both input indexes, the output buffer and the
  * buffer position, and clones the source's text iterator. The normalizer
  * pointers are not copied: they start as NULL and init() rebuilds them
  * from the copied mode and options, so a filtered normalizer is never
  * shared between two objects.
  *
  * @param copy The Normalizer to duplicate.
  */
 Normalizer::Normalizer(const Normalizer &copy)
     : UObject(copy),
       fFilteredNorm2(NULL),
       fNorm2(NULL),
       fUMode(copy.fUMode),
       fOptions(copy.fOptions),
       text(copy.text->clone()),
       currentIndex(copy.currentIndex),
       nextIndex(copy.nextIndex),
       buffer(copy.buffer),
       bufferPos(copy.bufferPos)
 {
     // Resolve fNorm2 (and fFilteredNorm2 if needed) for this object.
     init();
 }

76 void

77 Normalizer::init() {

78 UErrorCode errorCode=U_ZERO_ERROR;

     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);

80 if(fOptions&UNORM_UNICODE_3_2) {

81 delete fFilteredNorm2;

82 fNorm2=fFilteredNorm2=

             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));

84 }

     if(U_FAILURE(errorCode)) {

86 errorCode=U_ZERO_ERROR;

         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);

88 }

89 }

91 Normalizer::~Normalizer()

92 {

93 delete fFilteredNorm2;

94 delete text;

95 }

97 Normalizer*

 Normalizer::clone() const

99 {

     return new Normalizer(*this);

101 }

102

103 /**

104 * Generates a hash code for this iterator.

105 */

 int32_t Normalizer::hashCode() const

107 {

     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;

109 }

110

 /**
  * Compares two Normalizers for equality.
  *
  * An object always equals itself. Otherwise two objects are equal when
  * their mode, options, text iterators, output buffers, buffer positions
  * and nextIndex values all match. currentIndex is not part of the
  * comparison.
  *
  * @param that The Normalizer to compare with.
  * @return TRUE if the two objects are considered equal.
  */
 UBool Normalizer::operator==(const Normalizer& that) const {
     if(this==&that) {
         return TRUE;
     }
     return
         fUMode==that.fUMode &&
         fOptions==that.fOptions &&
         *text==*that.text &&
         buffer==that.buffer &&
         bufferPos==that.bufferPos &&
         nextIndex==that.nextIndex;
 }

122

123 //-------------------------------------------------------------------------

124 // Static utility methods

125 //-------------------------------------------------------------------------

126

127 void U_EXPORT2

 Normalizer::normalize(const UnicodeString& source, 

129 UNormalizationMode mode, int32_t options,

130 UnicodeString& result,

131 UErrorCode &status) {

     if(source.isBogus() || U_FAILURE(status)) {

133 result.setToBogus();

         if(U_SUCCESS(status)) {

135 status=U_ILLEGAL_ARGUMENT_ERROR;

136 }

137 } else {

138 UnicodeString localDest;

139 UnicodeString *dest;

140

141 if(&source!=&result) {

142 dest=&result;

143 } else {

144 // the source and result strings are the same object, use a temporary one

145 dest=&localDest;

146 }

         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

         if(U_SUCCESS(status)) {

149 if(options&UNORM_UNICODE_3_2) {

                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).

                     normalize(source, *dest, status);

152 } else {

                 n2->normalize(source, *dest, status);

154 }

155 }

         if(dest==&localDest && U_SUCCESS(status)) {

157 result=*dest;

158 }

159 }

160 }

161

162 void U_EXPORT2

 Normalizer::compose(const UnicodeString& source, 

164 UBool compat, int32_t options,

165 UnicodeString& result,

166 UErrorCode &status) {

     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);

168 }

169

170 void U_EXPORT2

 Normalizer::decompose(const UnicodeString& source, 

172 UBool compat, int32_t options,

173 UnicodeString& result,

174 UErrorCode &status) {

     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);

176 }

177

178 UNormalizationCheckResult

 Normalizer::quickCheck(const UnicodeString& source,

180 UNormalizationMode mode, int32_t options,

181 UErrorCode &status) {

     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

     if(U_SUCCESS(status)) {

184 if(options&UNORM_UNICODE_3_2) {

             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).

186 quickCheck(source, status);

187 } else {

             return n2->quickCheck(source, status);

189 }

190 } else {

191 return UNORM_MAYBE;

192 }

193 }

194

195 UBool

 Normalizer::isNormalized(const UnicodeString& source,

197 UNormalizationMode mode, int32_t options,

198 UErrorCode &status) {

     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

     if(U_SUCCESS(status)) {

201 if(options&UNORM_UNICODE_3_2) {

             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).

203 isNormalized(source, status);

204 } else {

             return n2->isNormalized(source, status);

206 }

207 } else {

208 return FALSE;

209 }

210 }

211

212 UnicodeString & U_EXPORT2

 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,

214 UnicodeString &result,

215 UNormalizationMode mode, int32_t options,

216 UErrorCode &errorCode) {

     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {

218 result.setToBogus();

         if(U_SUCCESS(errorCode)) {

220 errorCode=U_ILLEGAL_ARGUMENT_ERROR;

221 }

222 } else {

223 UnicodeString localDest;

224 UnicodeString *dest;

225

226 if(&right!=&result) {

227 dest=&result;

228 } else {

229 // the right and result strings are the same object, use a temporary one

230 dest=&localDest;

231 }

232 *dest=left;

         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);

         if(U_SUCCESS(errorCode)) {

235 if(options&UNORM_UNICODE_3_2) {

                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).

                     append(*dest, right, errorCode);

238 } else {

                 n2->append(*dest, right, errorCode);

240 }

241 }

         if(dest==&localDest && U_SUCCESS(errorCode)) {

243 result=*dest;

244 }

245 }

246 return result;

247 }

248

249 //-------------------------------------------------------------------------

250 // Iteration API

251 //-------------------------------------------------------------------------

252

253 /**

254 * Return the current character in the normalized text.

255 */

256 UChar32 Normalizer::current() {

     if(bufferPos<buffer.length() || nextNormalize()) {

         return buffer.char32At(bufferPos);

259 } else {

260 return DONE;

261 }

262 }

263

264 /**

265 * Return the next character in the normalized text and advance

266 * the iteration position by one. If the end

267 * of the text has already been reached, {@link #DONE} is returned.

268 */

269 UChar32 Normalizer::next() {

     if(bufferPos<buffer.length() ||  nextNormalize()) {

         UChar32 c=buffer.char32At(bufferPos);

272 bufferPos+=U16_LENGTH(c);

273 return c;

274 } else {

275 return DONE;

276 }

277 }

278

279 /**

280 * Return the previous character in the normalized text and decrement

281 * the iteration position by one. If the beginning

282 * of the text has already been reached, {@link #DONE} is returned.

283 */

284 UChar32 Normalizer::previous() {

     if(bufferPos>0 || previousNormalize()) {

         UChar32 c=buffer.char32At(bufferPos-1);

287 bufferPos-=U16_LENGTH(c);

288 return c;

289 } else {

290 return DONE;

291 }

292 }

293

 void Normalizer::reset() {

     currentIndex=nextIndex=text->setToStart();

296 clearBuffer();

297 }

298

299 void

 Normalizer::setIndexOnly(int32_t index) {

     text->setIndex(index);  // pins index

     currentIndex=nextIndex=text->getIndex();

303 clearBuffer();

304 }

305

306 /**

307 * Return the first character in the normalized text. This resets

308 * the <tt>Normalizer's</tt> position to the beginning of the text.

309 */

310 UChar32 Normalizer::first() {

311 reset();

312 return next();

313 }

314

315 /**

316 * Return the last character in the normalized text. This resets

317 * the <tt>Normalizer's</tt> position to be just before the

318 * the input text corresponding to that normalized character.

319 */

320 UChar32 Normalizer::last() {

     currentIndex=nextIndex=text->setToEnd();

322 clearBuffer();

323 return previous();

324 }

325

326 /**

327 * Retrieve the current iteration position in the input text that is

328 * being normalized. This method is useful in applications such as

329 * searching, where you need to be able to determine the position in

330 * the input text that corresponds to a given normalized output character.

331 *

332 * Note: This method sets the position in the input, while

333 * {@link #next} and {@link #previous} iterate through characters in the

334 * output. This means that there is not necessarily a one-to-one

335 * correspondence between characters returned by <tt>next</tt> and

336 * <tt>previous</tt> and the indices passed to and returned from

337 * <tt>setIndex</tt> and {@link #getIndex}.

338 *

339 */

 int32_t Normalizer::getIndex() const {

     if(bufferPos<buffer.length()) {

342 return currentIndex;

343 } else {

344 return nextIndex;

345 }

346 }

347

348 /**

349 * Retrieve the index of the start of the input text. This is the begin index

350 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>

351 * over which this <tt>Normalizer</tt> is iterating

352 */

 int32_t Normalizer::startIndex() const {

354 return text->startIndex();

355 }

356

357 /**

358 * Retrieve the index of the end of the input text. This is the end index

359 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>

360 * over which this <tt>Normalizer</tt> is iterating

361 */

 int32_t Normalizer::endIndex() const {

363 return text->endIndex();

364 }

365

366 //-------------------------------------------------------------------------

367 // Property access methods

368 //-------------------------------------------------------------------------

369

370 void

 Normalizer::setMode(UNormalizationMode newMode) 

372 {

373 fUMode = newMode;

374 init();

375 }

376

377 UNormalizationMode

 Normalizer::getUMode() const

379 {

380 return fUMode;

381 }

382

383 void

 Normalizer::setOption(int32_t option, 

385 UBool value)

386 {

387 if (value) {

388 fOptions |= option;

389 } else {

390 fOptions &= (~option);

391 }

392 init();

393 }

394

395 UBool

 Normalizer::getOption(int32_t option) const

397 {

     return (fOptions & option) != 0;

399 }

400

401 /**

402 * Set the input text over which this <tt>Normalizer</tt> will iterate.

403 * The iteration position is set to the beginning of the input text.

404 */

405 void

 Normalizer::setText(const UnicodeString& newText, 

407 UErrorCode &status)

408 {

     if (U_FAILURE(status)) {

410 return;

411 }

     CharacterIterator *newIter = new StringCharacterIterator(newText);

413 if (newIter == NULL) {

414 status = U_MEMORY_ALLOCATION_ERROR;

415 return;

416 }

417 delete text;

418 text = newIter;

419 reset();

420 }

421

422 /**

423 * Set the input text over which this <tt>Normalizer</tt> will iterate.

424 * The iteration position is set to the beginning of the string.

425 */

426 void

 Normalizer::setText(const CharacterIterator& newText, 

428 UErrorCode &status)

429 {

     if (U_FAILURE(status)) {

431 return;

432 }

     CharacterIterator *newIter = newText.clone();

434 if (newIter == NULL) {

435 status = U_MEMORY_ALLOCATION_ERROR;

436 return;

437 }

438 delete text;

439 text = newIter;

440 reset();

441 }

442

443 void

 Normalizer::setText(ConstChar16Ptr newText,

445 int32_t length,

446 UErrorCode &status)

447 {

     if (U_FAILURE(status)) {

449 return;

450 }

     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);

452 if (newIter == NULL) {

453 status = U_MEMORY_ALLOCATION_ERROR;

454 return;

455 }

456 delete text;

457 text = newIter;

458 reset();

459 }

460

461 /**

462 * Copies the text under iteration into the UnicodeString referred to by "result".

463 * @param result Receives a copy of the text under iteration.

464 */

465 void

 Normalizer::getText(UnicodeString&  result) 

467 {

468 text->getText(result);

469 }

470

471 //-------------------------------------------------------------------------

472 // Private utility methods

473 //-------------------------------------------------------------------------

474

 void Normalizer::clearBuffer() {

476 buffer.remove();

477 bufferPos=0;

478 }

479

480 UBool

481 Normalizer::nextNormalize() {

482 clearBuffer();

483 currentIndex=nextIndex;

484 text->setIndex(nextIndex);

     if(!text->hasNext()) {

486 return FALSE;

487 }

488 // Skip at least one character so we make progress.

     UnicodeString segment(text->next32PostInc());

     while(text->hasNext()) {

491 UChar32 c;

         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {

             text->move32(-1, CharacterIterator::kCurrent);

494 break;

495 }

496 segment.append(c);

497 }

498 nextIndex=text->getIndex();

499 UErrorCode errorCode=U_ZERO_ERROR;

     fNorm2->normalize(segment, buffer, errorCode);

     return U_SUCCESS(errorCode) && !buffer.isEmpty();

502 }

503

504 UBool

505 Normalizer::previousNormalize() {

506 clearBuffer();

507 nextIndex=currentIndex;

508 text->setIndex(currentIndex);

     if(!text->hasPrevious()) {

510 return FALSE;

511 }

512 UnicodeString segment;

     while(text->hasPrevious()) {

514 UChar32 c=text->previous32();

         segment.insert(0, c);

         if(fNorm2->hasBoundaryBefore(c)) {

517 break;

518 }

519 }

520 currentIndex=text->getIndex();

521 UErrorCode errorCode=U_ZERO_ERROR;

     fNorm2->normalize(segment, buffer, errorCode);

523 bufferPos=buffer.length();

     return U_SUCCESS(errorCode) && !buffer.isEmpty();

525 }

526

527 U_NAMESPACE_END

528

529 #endif /* #if !UCONFIG_NO_NORMALIZATION */