git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/normalizer2.h

2 // License & terms of use: http://www.unicode.org/copyright.html

3 /*

4 *******************************************************************************

5 *

8 *

9 *******************************************************************************

10 * file name: normalizer2.h

11 * encoding: UTF-8

12 * tab size: 8 (not used)

13 * indentation:4

14 *

15 * created on: 2009nov22

16 * created by: Markus W. Scherer

17 */

19 #ifndef __NORMALIZER2_H__

20 #define __NORMALIZER2_H__

22 /**

23 * \file

24 * \brief C++ API: New API for Unicode Normalization.

25 */

27 #include "unicode/utypes.h"

29 #if U_SHOW_CPLUSPLUS_API

31 #if !UCONFIG_NO_NORMALIZATION

33 #include "unicode/stringpiece.h"

34 #include "unicode/uniset.h"

35 #include "unicode/unistr.h"

36 #include "unicode/unorm2.h"

38 U_NAMESPACE_BEGIN

40 class ByteSink;

42 /**

43 * Unicode normalization functionality for standard Unicode normalization or

44 * for using custom mapping tables.

45 * All instances of this class are unmodifiable/immutable.

46 * Instances returned by getInstance() are singletons that must not be deleted by the caller.

47 * The Normalizer2 class is not intended for public subclassing.

48 *

49 * The primary functions are to produce a normalized string and to detect whether

50 * a string is already normalized.

51 * The most commonly used normalization forms are those defined in

52 * UAX #15, Unicode Normalization Forms (https://www.unicode.org/reports/tr15/).

53 * However, this API supports additional normalization forms for specialized purposes.

54 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)

55 * and can be used in implementations of UTS #46.

56 *

57 * Not only are the standard compose and decompose modes supplied,

58 * but additional modes are provided as documented in the UNormalization2Mode enum.

59 *

60 * Some of the functions in this class identify normalization boundaries.

61 * At a normalization boundary, the portions of the string

62 * before it and starting from it do not interact and can be handled independently.

63 *

64 * The spanQuickCheckYes() stops at a normalization boundary.

65 * When the goal is a normalized string, then the text before the boundary

66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().

67 *

68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether

69 * a character is guaranteed to be at a normalization boundary,

70 * regardless of context.

71 * This is used for moving from one normalization boundary to the next

72 * or preceding boundary, and for performing iterative normalization.

73 *

74 * Iterative normalization is useful when only a small portion of a

75 * longer string needs to be processed.

76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator

77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()

78 * (to process only the substring for which sort key bytes are computed).

79 *

80 * The set of normalization boundaries returned by these functions may not be

81 * complete: There may be more boundaries that could be returned.

82 * Different functions may return different boundaries.

83 * @stable ICU 4.4

84 */

85 class U_COMMON_API Normalizer2 : public UObject {

86 public:

87 /**

88 * Destructor.

89 * @stable ICU 4.4

90 */

91 ~Normalizer2();

93 /**

94 * Returns a Normalizer2 instance for Unicode NFC normalization.

95 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).

96 * Returns an unmodifiable singleton instance. Do not delete it.

97 * @param errorCode Standard ICU error code. Its input value must

98 * pass the U_SUCCESS() test, or else the function returns

99 * immediately. Check for U_FAILURE() on output or use with

100 * function chaining. (See User Guide for details.)

101 * @return the requested Normalizer2, if successful

102 * @stable ICU 49

103 */

104 static const Normalizer2 *

105 getNFCInstance(UErrorCode &errorCode);

106

107 /**

108 * Returns a Normalizer2 instance for Unicode NFD normalization.

109 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).

110 * Returns an unmodifiable singleton instance. Do not delete it.

111 * @param errorCode Standard ICU error code. Its input value must

112 * pass the U_SUCCESS() test, or else the function returns

113 * immediately. Check for U_FAILURE() on output or use with

114 * function chaining. (See User Guide for details.)

115 * @return the requested Normalizer2, if successful

116 * @stable ICU 49

117 */

118 static const Normalizer2 *

119 getNFDInstance(UErrorCode &errorCode);

120

121 /**

122 * Returns a Normalizer2 instance for Unicode NFKC normalization.

123 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).

124 * Returns an unmodifiable singleton instance. Do not delete it.

125 * @param errorCode Standard ICU error code. Its input value must

126 * pass the U_SUCCESS() test, or else the function returns

127 * immediately. Check for U_FAILURE() on output or use with

128 * function chaining. (See User Guide for details.)

129 * @return the requested Normalizer2, if successful

130 * @stable ICU 49

131 */

132 static const Normalizer2 *

133 getNFKCInstance(UErrorCode &errorCode);

134

135 /**

136 * Returns a Normalizer2 instance for Unicode NFKD normalization.

137 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).

138 * Returns an unmodifiable singleton instance. Do not delete it.

139 * @param errorCode Standard ICU error code. Its input value must

140 * pass the U_SUCCESS() test, or else the function returns

141 * immediately. Check for U_FAILURE() on output or use with

142 * function chaining. (See User Guide for details.)

143 * @return the requested Normalizer2, if successful

144 * @stable ICU 49

145 */

146 static const Normalizer2 *

147 getNFKDInstance(UErrorCode &errorCode);

148

149 /**

150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.

151 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).

152 * Returns an unmodifiable singleton instance. Do not delete it.

153 * @param errorCode Standard ICU error code. Its input value must

154 * pass the U_SUCCESS() test, or else the function returns

155 * immediately. Check for U_FAILURE() on output or use with

156 * function chaining. (See User Guide for details.)

157 * @return the requested Normalizer2, if successful

158 * @stable ICU 49

159 */

160 static const Normalizer2 *

161 getNFKCCasefoldInstance(UErrorCode &errorCode);

162

163 /**

164 * Returns a Normalizer2 instance which uses the specified data file

165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)

166 * and which composes or decomposes text according to the specified mode.

167 * Returns an unmodifiable singleton instance. Do not delete it.

168 *

169 * Use packageName=NULL for data files that are part of ICU's own data.

170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.

171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.

172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.

173 *

174 * @param packageName NULL for ICU built-in data, otherwise application data package name

175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file

176 * @param mode normalization mode (compose or decompose etc.)

177 * @param errorCode Standard ICU error code. Its input value must

178 * pass the U_SUCCESS() test, or else the function returns

179 * immediately. Check for U_FAILURE() on output or use with

180 * function chaining. (See User Guide for details.)

181 * @return the requested Normalizer2, if successful

182 * @stable ICU 4.4

183 */

184 static const Normalizer2 *

     getInstance(const char *packageName,

186 const char *name,

187 UNormalization2Mode mode,

188 UErrorCode &errorCode);

189

190 /**

191 * Returns the normalized form of the source string.

192 * @param src source string

193 * @param errorCode Standard ICU error code. Its input value must

194 * pass the U_SUCCESS() test, or else the function returns

195 * immediately. Check for U_FAILURE() on output or use with

196 * function chaining. (See User Guide for details.)

197 * @return normalized src

198 * @stable ICU 4.4

199 */

200 UnicodeString

     normalize(const UnicodeString &src, UErrorCode &errorCode) const {

202 UnicodeString result;

         normalize(src, result, errorCode);

204 return result;

205 }

206 /**

207 * Writes the normalized form of the source string to the destination string

208 * (replacing its contents) and returns the destination string.

209 * The source and destination strings must be different objects.

210 * @param src source string

211 * @param dest destination string; its contents are replaced with normalized src

212 * @param errorCode Standard ICU error code. Its input value must

213 * pass the U_SUCCESS() test, or else the function returns

214 * immediately. Check for U_FAILURE() on output or use with

215 * function chaining. (See User Guide for details.)

216 * @return dest

217 * @stable ICU 4.4

218 */

219 virtual UnicodeString &

     normalize(const UnicodeString &src,

221 UnicodeString &dest,

               UErrorCode &errorCode) const = 0;

223

224 /**

225 * Normalizes a UTF-8 string and optionally records how source substrings

226 * relate to changed and unchanged result substrings.

227 *

228 * Currently implemented completely only for "compose" modes,

229 * such as for NFC, NFKC, and NFKC_Casefold

230 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).

231 * Otherwise currently converts to & from UTF-16 and does not support edits.

232 *

233 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.

234 * @param src Source UTF-8 string.

235 * @param sink A ByteSink to which the normalized UTF-8 result string is written.

236 * sink.Flush() is called at the end.

237 * @param edits Records edits for index mapping, working with styled text,

238 * and getting only changes (if any).

239 * The contents of the Edits object are undefined if any error occurs.

240 * This function calls edits->reset() first unless

241 * options includes U_EDITS_NO_RESET. edits can be nullptr.

242 * @param errorCode Standard ICU error code. Its input value must

243 * pass the U_SUCCESS() test, or else the function returns

244 * immediately. Check for U_FAILURE() on output or use with

245 * function chaining. (See User Guide for details.)

246 * @stable ICU 60

247 */

248 virtual void

     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,

                   Edits *edits, UErrorCode &errorCode) const;

251

252 /**

253 * Appends the normalized form of the second string to the first string

254 * (merging them at the boundary) and returns the first string.

255 * The result is normalized if the first string was normalized.

256 * The first and second strings must be different objects.

257 * @param first string, should be normalized

258 * @param second string, will be normalized

259 * @param errorCode Standard ICU error code. Its input value must

260 * pass the U_SUCCESS() test, or else the function returns

261 * immediately. Check for U_FAILURE() on output or use with

262 * function chaining. (See User Guide for details.)

263 * @return first

264 * @stable ICU 4.4

265 */

266 virtual UnicodeString &

267 normalizeSecondAndAppend(UnicodeString &first,

268 const UnicodeString &second,

                              UErrorCode &errorCode) const = 0;

270 /**

271 * Appends the second string to the first string

272 * (merging them at the boundary) and returns the first string.

273 * The result is normalized if both the strings were normalized.

274 * The first and second strings must be different objects.

275 * @param first string, should be normalized

276 * @param second string, should be normalized

277 * @param errorCode Standard ICU error code. Its input value must

278 * pass the U_SUCCESS() test, or else the function returns

279 * immediately. Check for U_FAILURE() on output or use with

280 * function chaining. (See User Guide for details.)

281 * @return first

282 * @stable ICU 4.4

283 */

284 virtual UnicodeString &

285 append(UnicodeString &first,

286 const UnicodeString &second,

            UErrorCode &errorCode) const = 0;

288

289 /**

290 * Gets the decomposition mapping of c.

291 * Roughly equivalent to normalizing the String form of c

292 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. Unlike normalization,

293 * this function returns FALSE and does not write a string

294 * if c does not have a decomposition mapping in this instance's data.

295 * This function is independent of the mode of the Normalizer2.

296 * @param c code point

297 * @param decomposition String object which will be set to c's

298 * decomposition mapping, if there is one.

299 * @return TRUE if c has a decomposition, otherwise FALSE

300 * @stable ICU 4.6

301 */

302 virtual UBool

     getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;

304

305 /**

306 * Gets the raw decomposition mapping of c.

307 *

308 * This is similar to the getDecomposition() method but returns the

309 * raw decomposition mapping as specified in UnicodeData.txt or

310 * (for custom data) in the mapping files processed by the gennorm2 tool.

311 * By contrast, getDecomposition() returns the processed,

312 * recursively-decomposed version of this mapping.

313 *

314 * When used on a standard NFKC Normalizer2 instance,

315 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.

316 *

317 * When used on a standard NFC Normalizer2 instance,

318 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);

319 * in this case, the result contains either one or two code points (=1..4 char16_ts).

320 *

321 * This function is independent of the mode of the Normalizer2.

322 * The default implementation returns FALSE.

323 * @param c code point

324 * @param decomposition String object which will be set to c's

325 * raw decomposition mapping, if there is one.

326 * @return TRUE if c has a decomposition, otherwise FALSE

327 * @stable ICU 49

328 */

329 virtual UBool

     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;

331

332 /**

333 * Performs pairwise composition of a & b and returns the composite if there is one.

334 *

335 * Returns a composite code point c only if c has a two-way mapping to a+b.

336 * In standard Unicode normalization, this means that

337 * c has a canonical decomposition to a+b

338 * and c does not have the Full_Composition_Exclusion property.

339 *

340 * This function is independent of the mode of the Normalizer2.

341 * The default implementation returns a negative value.

342 * @param a A (normalization starter) code point.

343 * @param b Another code point.

344 * @return The non-negative composite code point if there is one; otherwise a negative value.

345 * @stable ICU 49

346 */

347 virtual UChar32

     composePair(UChar32 a, UChar32 b) const;

349

350 /**

351 * Gets the combining class of c.

352 * The default implementation returns 0

353 * but all standard implementations return the Unicode Canonical_Combining_Class value.

354 * @param c code point

355 * @return c's combining class

356 * @stable ICU 49

357 */

358 virtual uint8_t

     getCombiningClass(UChar32 c) const;

360

361 /**

362 * Tests if the string is normalized.

363 * Internally, in cases where the quickCheck() method would return "maybe"

364 * (which is only possible for the two COMPOSE modes) this method

365 * resolves to "yes" or "no" to provide a definitive result,

366 * at the cost of doing more work in those cases.

367 * @param s input string

368 * @param errorCode Standard ICU error code. Its input value must

369 * pass the U_SUCCESS() test, or else the function returns

370 * immediately. Check for U_FAILURE() on output or use with

371 * function chaining. (See User Guide for details.)

372 * @return TRUE if s is normalized

373 * @stable ICU 4.4

374 */

375 virtual UBool

     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;

377 /**

378 * Tests if the UTF-8 string is normalized.

379 * Internally, in cases where the quickCheck() method would return "maybe"

380 * (which is only possible for the two COMPOSE modes) this method

381 * resolves to "yes" or "no" to provide a definitive result,

382 * at the cost of doing more work in those cases.

383 *

384 * This works for all normalization modes,

385 * but it is currently optimized for UTF-8 only for "compose" modes,

386 * such as for NFC, NFKC, and NFKC_Casefold

387 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).

388 * For other modes it currently converts to UTF-16 and calls isNormalized().

389 *

390 * @param s UTF-8 input string

391 * @param errorCode Standard ICU error code. Its input value must

392 * pass the U_SUCCESS() test, or else the function returns

393 * immediately. Check for U_FAILURE() on output or use with

394 * function chaining. (See User Guide for details.)

395 * @return TRUE if s is normalized

396 * @stable ICU 60

397 */

398 virtual UBool

     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;

400

401

402 /**

403 * Tests if the string is normalized.

404 * For the two COMPOSE modes, the result could be "maybe" in cases that

405 * would take a little more work to resolve definitively.

406 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster

407 * combination of quick check + normalization, to avoid

408 * re-checking the "yes" prefix.

409 * @param s input string

410 * @param errorCode Standard ICU error code. Its input value must

411 * pass the U_SUCCESS() test, or else the function returns

412 * immediately. Check for U_FAILURE() on output or use with

413 * function chaining. (See User Guide for details.)

414 * @return UNormalizationCheckResult

415 * @stable ICU 4.4

416 */

417 virtual UNormalizationCheckResult

     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;

419

420 /**

421 * Returns the end of the normalized substring of the input string.

422 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>

423 * the substring <code>UnicodeString(s, 0, end)</code>

424 * will pass the quick check with a "yes" result.

425 *

426 * The returned end index is usually one or more characters before the

427 * "no" or "maybe" character: The end index is at a normalization boundary.

428 * (See the class documentation for more about normalization boundaries.)

429 *

430 * When the goal is a normalized string and most input strings are expected

431 * to be normalized already, then call this method,

432 * and if it returns a prefix shorter than the input string,

433 * copy that prefix and use normalizeSecondAndAppend() for the remainder.

434 * @param s input string

435 * @param errorCode Standard ICU error code. Its input value must

436 * pass the U_SUCCESS() test, or else the function returns

437 * immediately. Check for U_FAILURE() on output or use with

438 * function chaining. (See User Guide for details.)

439 * @return "yes" span end index

440 * @stable ICU 4.4

441 */

442 virtual int32_t

     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;

444

445 /**

446 * Tests if the character always has a normalization boundary before it,

447 * regardless of context.

448 * If true, then the character does not normalization-interact with

449 * preceding characters.

450 * In other words, a string containing this character can be normalized

451 * by processing portions before this character and starting from this

452 * character independently.

453 * This is used for iterative normalization. See the class documentation for details.

454 * @param c character to test

455 * @return TRUE if c has a normalization boundary before it

456 * @stable ICU 4.4

457 */

     virtual UBool hasBoundaryBefore(UChar32 c) const = 0;

459

460 /**

461 * Tests if the character always has a normalization boundary after it,

462 * regardless of context.

463 * If true, then the character does not normalization-interact with

464 * following characters.

465 * In other words, a string containing this character can be normalized

466 * by processing portions up to this character and after this

467 * character independently.

468 * This is used for iterative normalization. See the class documentation for details.

469 * Note that this operation may be significantly slower than hasBoundaryBefore().

470 * @param c character to test

471 * @return TRUE if c has a normalization boundary after it

472 * @stable ICU 4.4

473 */

     virtual UBool hasBoundaryAfter(UChar32 c) const = 0;

475

476 /**

477 * Tests if the character is normalization-inert.

478 * If true, then the character does not change, nor normalization-interact with

479 * preceding or following characters.

480 * In other words, a string containing this character can be normalized

481 * by processing portions before this character and after this

482 * character independently.

483 * This is used for iterative normalization. See the class documentation for details.

484 * Note that this operation may be significantly slower than hasBoundaryBefore().

485 * @param c character to test

486 * @return TRUE if c is normalization-inert

487 * @stable ICU 4.4

488 */

     virtual UBool isInert(UChar32 c) const = 0;

490 };

491

492 /**

493 * Normalization filtered by a UnicodeSet.

494 * Normalizes portions of the text contained in the filter set and leaves

495 * portions not contained in the filter set unchanged.

496 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).

497 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".

498 * This class implements all of (and only) the Normalizer2 API.

499 * An instance of this class is unmodifiable/immutable but is constructed and

500 * must be destructed by the owner.

501 * @stable ICU 4.4

502 */

503 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {

504 public:

505 /**

506 * Constructs a filtered normalizer wrapping any Normalizer2 instance

507 * and a filter set.

508 * Both are aliased and must not be modified or deleted while this object

509 * is used.

510 * The filter set should be frozen; otherwise the performance will suffer greatly.

511 * @param n2 wrapped Normalizer2 instance

512 * @param filterSet UnicodeSet which determines the characters to be normalized

513 * @stable ICU 4.4

514 */

     // Only binds the two reference members; neither argument is copied.
     FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
             : norm2(n2),
               set(filterSet) {
     }

517

518 /**

519 * Destructor.

520 * @stable ICU 4.4

521 */

522 ~FilteredNormalizer2();

523

524 /**

525 * Writes the normalized form of the source string to the destination string

526 * (replacing its contents) and returns the destination string.

527 * The source and destination strings must be different objects.

528 * @param src source string

529 * @param dest destination string; its contents are replaced with normalized src

530 * @param errorCode Standard ICU error code. Its input value must

531 * pass the U_SUCCESS() test, or else the function returns

532 * immediately. Check for U_FAILURE() on output or use with

533 * function chaining. (See User Guide for details.)

534 * @return dest

535 * @stable ICU 4.4

536 */

537 virtual UnicodeString &

     normalize(const UnicodeString &src,

539 UnicodeString &dest,

540 UErrorCode &errorCode) const U_OVERRIDE;

541

542 /**

543 * Normalizes a UTF-8 string and optionally records how source substrings

544 * relate to changed and unchanged result substrings.

545 *

546 * Currently implemented completely only for "compose" modes,

547 * such as for NFC, NFKC, and NFKC_Casefold

548 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).

549 * Otherwise currently converts to & from UTF-16 and does not support edits.

550 *

551 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.

552 * @param src Source UTF-8 string.

553 * @param sink A ByteSink to which the normalized UTF-8 result string is written.

554 * sink.Flush() is called at the end.

555 * @param edits Records edits for index mapping, working with styled text,

556 * and getting only changes (if any).

557 * The contents of the Edits object are undefined if any error occurs.

558 * This function calls edits->reset() first unless

559 * options includes U_EDITS_NO_RESET. edits can be nullptr.

560 * @param errorCode Standard ICU error code. Its input value must

561 * pass the U_SUCCESS() test, or else the function returns

562 * immediately. Check for U_FAILURE() on output or use with

563 * function chaining. (See User Guide for details.)

564 * @stable ICU 60

565 */

566 virtual void

     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,

                   Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;

569

570 /**

571 * Appends the normalized form of the second string to the first string

572 * (merging them at the boundary) and returns the first string.

573 * The result is normalized if the first string was normalized.

574 * The first and second strings must be different objects.

575 * @param first string, should be normalized

576 * @param second string, will be normalized

577 * @param errorCode Standard ICU error code. Its input value must

578 * pass the U_SUCCESS() test, or else the function returns

579 * immediately. Check for U_FAILURE() on output or use with

580 * function chaining. (See User Guide for details.)

581 * @return first

582 * @stable ICU 4.4

583 */

584 virtual UnicodeString &

585 normalizeSecondAndAppend(UnicodeString &first,

586 const UnicodeString &second,

587 UErrorCode &errorCode) const U_OVERRIDE;

588 /**

589 * Appends the second string to the first string

590 * (merging them at the boundary) and returns the first string.

591 * The result is normalized if both the strings were normalized.

592 * The first and second strings must be different objects.

593 * @param first string, should be normalized

594 * @param second string, should be normalized

595 * @param errorCode Standard ICU error code. Its input value must

596 * pass the U_SUCCESS() test, or else the function returns

597 * immediately. Check for U_FAILURE() on output or use with

598 * function chaining. (See User Guide for details.)

599 * @return first

600 * @stable ICU 4.4

601 */

602 virtual UnicodeString &

603 append(UnicodeString &first,

604 const UnicodeString &second,

605 UErrorCode &errorCode) const U_OVERRIDE;

606

607 /**

608 * Gets the decomposition mapping of c.

609 * For details see the base class documentation.

610 *

611 * This function is independent of the mode of the Normalizer2.

612 * @param c code point

613 * @param decomposition String object which will be set to c's

614 * decomposition mapping, if there is one.

615 * @return TRUE if c has a decomposition, otherwise FALSE

616 * @stable ICU 4.6

617 */

618 virtual UBool

     getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;

620

621 /**

622 * Gets the raw decomposition mapping of c.

623 * For details see the base class documentation.

624 *

625 * This function is independent of the mode of the Normalizer2.

626 * @param c code point

627 * @param decomposition String object which will be set to c's

628 * raw decomposition mapping, if there is one.

629 * @return TRUE if c has a decomposition, otherwise FALSE

630 * @stable ICU 49

631 */

632 virtual UBool

     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;

634

635 /**

636 * Performs pairwise composition of a & b and returns the composite if there is one.

637 * For details see the base class documentation.

638 *

639 * This function is independent of the mode of the Normalizer2.

640 * @param a A (normalization starter) code point.

641 * @param b Another code point.

642 * @return The non-negative composite code point if there is one; otherwise a negative value.

643 * @stable ICU 49

644 */

645 virtual UChar32

     composePair(UChar32 a, UChar32 b) const U_OVERRIDE;

647

648 /**

649 * Gets the combining class of c.

650 * The default implementation returns 0

651 * but all standard implementations return the Unicode Canonical_Combining_Class value.

652 * @param c code point

653 * @return c's combining class

654 * @stable ICU 49

655 */

656 virtual uint8_t

     getCombiningClass(UChar32 c) const U_OVERRIDE;

658

659 /**

660 * Tests if the string is normalized.

661 * For details see the Normalizer2 base class documentation.

662 * @param s input string

663 * @param errorCode Standard ICU error code. Its input value must

664 * pass the U_SUCCESS() test, or else the function returns

665 * immediately. Check for U_FAILURE() on output or use with

666 * function chaining. (See User Guide for details.)

667 * @return TRUE if s is normalized

668 * @stable ICU 4.4

669 */

670 virtual UBool

     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;

672 /**

673 * Tests if the UTF-8 string is normalized.

674 * Internally, in cases where the quickCheck() method would return "maybe"

675 * (which is only possible for the two COMPOSE modes) this method

676 * resolves to "yes" or "no" to provide a definitive result,

677 * at the cost of doing more work in those cases.

678 *

679 * This works for all normalization modes,

680 * but it is currently optimized for UTF-8 only for "compose" modes,

681 * such as for NFC, NFKC, and NFKC_Casefold

682 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).

683 * For other modes it currently converts to UTF-16 and calls isNormalized().

684 *

685 * @param s UTF-8 input string

686 * @param errorCode Standard ICU error code. Its input value must

687 * pass the U_SUCCESS() test, or else the function returns

688 * immediately. Check for U_FAILURE() on output or use with

689 * function chaining. (See User Guide for details.)

690 * @return TRUE if s is normalized

691 * @stable ICU 60

692 */

693 virtual UBool

     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;

695 /**

696 * Tests if the string is normalized.

697 * For details see the Normalizer2 base class documentation.

698 * @param s input string

699 * @param errorCode Standard ICU error code. Its input value must

700 * pass the U_SUCCESS() test, or else the function returns

701 * immediately. Check for U_FAILURE() on output or use with

702 * function chaining. (See User Guide for details.)

703 * @return UNormalizationCheckResult

704 * @stable ICU 4.4

705 */

706 virtual UNormalizationCheckResult

     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;

708 /**

709 * Returns the end of the normalized substring of the input string.

710 * For details see the Normalizer2 base class documentation.

711 * @param s input string

712 * @param errorCode Standard ICU error code. Its input value must

713 * pass the U_SUCCESS() test, or else the function returns

714 * immediately. Check for U_FAILURE() on output or use with

715 * function chaining. (See User Guide for details.)

716 * @return "yes" span end index

717 * @stable ICU 4.4

718 */

719 virtual int32_t

     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;

721

722 /**

723 * Tests if the character always has a normalization boundary before it,

724 * regardless of context.

725 * For details see the Normalizer2 base class documentation.

726 * @param c character to test

727 * @return TRUE if c has a normalization boundary before it

728 * @stable ICU 4.4

729 */

     virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;

731

732 /**

733 * Tests if the character always has a normalization boundary after it,

734 * regardless of context.

735 * For details see the Normalizer2 base class documentation.

736 * @param c character to test

737 * @return TRUE if c has a normalization boundary after it

738 * @stable ICU 4.4

739 */

     virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;

741

742 /**

743 * Tests if the character is normalization-inert.

744 * For details see the Normalizer2 base class documentation.

745 * @param c character to test

746 * @return TRUE if c is normalization-inert

747 * @stable ICU 4.4

748 */

     virtual UBool isInert(UChar32 c) const U_OVERRIDE;

750 private:

/**
 * Private overload of normalize() that takes an additional USetSpanCondition.
 * Not part of the public API.
 * NOTE(review): the implementation is not in this header; presumably
 * spanCondition says whether the first span of src is taken as inside or
 * outside the filter set -- confirm against the implementation file.
 */
751 UnicodeString &

     normalize(const UnicodeString &src,

753 UnicodeString &dest,

754 USetSpanCondition spanCondition,

755 UErrorCode &errorCode) const;

756

/**
 * Private overload of normalizeUTF8() that takes the UTF-8 source as a
 * pointer plus length and an additional USetSpanCondition.
 * Not part of the public API.
 * NOTE(review): presumably the worker behind the public normalizeUTF8()
 * override; the meaning of spanCondition is defined by the implementation
 * file -- confirm there.
 */
757 void

     normalizeUTF8(uint32_t options, const char *src, int32_t length,

759 ByteSink &sink, Edits *edits,

760 USetSpanCondition spanCondition,

761 UErrorCode &errorCode) const;

762

/**
 * Private overload of normalizeSecondAndAppend() with an extra doNormalize flag.
 * Not part of the public API.
 * NOTE(review): presumably the shared implementation of the public
 * normalizeSecondAndAppend() (doNormalize=TRUE) and append()
 * (doNormalize=FALSE) -- confirm against the implementation file.
 */
763 UnicodeString &

764 normalizeSecondAndAppend(UnicodeString &first,

765 const UnicodeString &second,

766 UBool doNormalize,

767 UErrorCode &errorCode) const;

768

/** Wrapped normalizer passed to the constructor. Held by reference (aliased, not copied). */
769 const Normalizer2 &norm2;

/** Filter set passed to the constructor. Held by reference (aliased, not copied). */
770 const UnicodeSet &set;

771 };

772

773 U_NAMESPACE_END

774

775 #endif // !UCONFIG_NO_NORMALIZATION

776

777 #endif /* U_SHOW_CPLUSPLUS_API */

778

779 #endif // __NORMALIZER2_H__