git.saurik.com Git - apple/javascriptcore.git/blob

1 /*

2 *******************************************************************************

3 *

6 *

7 *******************************************************************************

8 * file name: unorm2.h

9 * encoding: US-ASCII

10 * tab size: 8 (not used)

11 * indentation:4

12 *

13 * created on: 2009dec15

14 * created by: Markus W. Scherer

15 */

17 #ifndef __UNORM2_H__

18 #define __UNORM2_H__

20 /**

21 * \file

22 * \brief C API: New API for Unicode Normalization.

23 *

24 * Unicode normalization functionality for standard Unicode normalization or

25 * for using custom mapping tables.

26 * All instances of UNormalizer2 are unmodifiable/immutable.

27 * Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.

28 * For more details see the Normalizer2 C++ class.

29 */

31 #include "unicode/utypes.h"

32 #include "unicode/localpointer.h"

33 #include "unicode/uset.h"

35 /**

36 * Constants for normalization modes.

37 * For details about standard Unicode normalization forms

38 * and about the algorithms which are also used with custom mapping tables

39 * see http://www.unicode.org/unicode/reports/tr15/

40 * @stable ICU 4.4

41 */

42 typedef enum {

43 /**

44 * Decomposition followed by composition.

45 * Same as standard NFC when using an "nfc" instance.

46 * Same as standard NFKC when using an "nfkc" instance.

47 * For details about standard Unicode normalization forms

48 * see http://www.unicode.org/unicode/reports/tr15/

49 * @stable ICU 4.4

50 */

51 UNORM2_COMPOSE,

52 /**

53 * Map, and reorder canonically.

54 * Same as standard NFD when using an "nfc" instance.

55 * Same as standard NFKD when using an "nfkc" instance.

56 * For details about standard Unicode normalization forms

57 * see http://www.unicode.org/unicode/reports/tr15/

58 * @stable ICU 4.4

59 */

60 UNORM2_DECOMPOSE,

61 /**

62 * "Fast C or D" form.

63 * If a string is in this form, then further decomposition <i>without reordering</i>

64 * would yield the same form as DECOMPOSE.

65 * Text in "Fast C or D" form can be processed efficiently with data tables

66 * that are "canonically closed", that is, that provide equivalent data for

67 * equivalent text, without having to be fully normalized.

68 * Not a standard Unicode normalization form.

69 * Not a unique form: Different FCD strings can be canonically equivalent.

70 * For details see http://www.unicode.org/notes/tn5/#FCD

71 * @stable ICU 4.4

72 */

73 UNORM2_FCD,

74 /**

75 * Compose only contiguously.

76 * Also known as "FCC" or "Fast C Contiguous".

77 * The result will often but not always be in NFC.

78 * The result will conform to FCD which is useful for processing.

79 * Not a standard Unicode normalization form.

80 * For details see http://www.unicode.org/notes/tn5/#FCC

81 * @stable ICU 4.4

82 */

83 UNORM2_COMPOSE_CONTIGUOUS

84 } UNormalization2Mode;

86 /**

87 * Result values for normalization quick check functions such as unorm2_quickCheck().

88 * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms

89 * @stable ICU 2.0

90 */

91 typedef enum UNormalizationCheckResult {

92 /**

93 * The input string is not in the normalization form.

94 * @stable ICU 2.0

95 */

96 UNORM_NO,

97 /**

98 * The input string is in the normalization form.

99 * @stable ICU 2.0

100 */

101 UNORM_YES,

102 /**

103 * The input string may or may not be in the normalization form.

104 * This value is only returned for composition forms like NFC and FCC,

105 * when a backward-combining character is found for which the surrounding text

106 * would have to be analyzed further.

 * unorm2_isNormalized() does that extra work and returns a definitive result.

107 * @stable ICU 2.0

108 */

109 UNORM_MAYBE

110 } UNormalizationCheckResult;

111

112 /**

113 * Opaque C service object type for the new normalization API.

 * Instances come from unorm2_getInstance() (singletons that must not be closed)

 * or from unorm2_openFiltered() (closed with unorm2_close()).

114 * @stable ICU 4.4

115 */

116 struct UNormalizer2;

117 typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. @stable ICU 4.4 */

118

119 #if !UCONFIG_NO_NORMALIZATION

120

121 /**

122 * Returns a UNormalizer2 instance which uses the specified data file

123 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)

124 * and which composes or decomposes text according to the specified mode.

125 * Returns an unmodifiable singleton instance. Do not delete it, and do not pass it to unorm2_close().

126 *

127 * Use packageName=NULL for data files that are part of ICU's own data.

128 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.

129 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.

130 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.

131 *

132 * @param packageName NULL for ICU built-in data, otherwise application data package name

133 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file

134 * @param mode normalization mode (compose or decompose etc.)

135 * @param pErrorCode Standard ICU error code. Its input value must

136 * pass the U_SUCCESS() test, or else the function returns

137 * immediately. Check for U_FAILURE() on output or use with

138 * function chaining. (See User Guide for details.)

139 * @return the requested UNormalizer2 singleton, if successful

140 * @stable ICU 4.4

141 */

142 U_STABLE const UNormalizer2 * U_EXPORT2

 unorm2_getInstance(const char *packageName,

144 const char *name,

145 UNormalization2Mode mode,

146 UErrorCode *pErrorCode);

147

148 /**

149 * Constructs a filtered normalizer wrapping any UNormalizer2 instance

150 * and a filter set.

151 * Both are aliased and must not be modified or deleted while this object

152 * is used.

153 * The filter set should be frozen; otherwise the performance will suffer greatly.

154 * @param norm2 wrapped UNormalizer2 instance

155 * @param filterSet USet which determines the characters to be normalized

156 * @param pErrorCode Standard ICU error code. Its input value must

157 * pass the U_SUCCESS() test, or else the function returns

158 * immediately. Check for U_FAILURE() on output or use with

159 * function chaining. (See User Guide for details.)

160 * @return the requested UNormalizer2, if successful; close it with unorm2_close()

161 * @stable ICU 4.4

162 */

163 U_STABLE UNormalizer2 * U_EXPORT2

 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode);

165

166 /**

167 * Closes a UNormalizer2 instance from unorm2_openFiltered().

168 * Do not close instances from unorm2_getInstance()!

169 * @param norm2 UNormalizer2 instance to be closed

170 * @stable ICU 4.4

171 */

172 U_STABLE void U_EXPORT2

173 unorm2_close(UNormalizer2 *norm2);

174

175 #if U_SHOW_CPLUSPLUS_API

176

177 U_NAMESPACE_BEGIN

178

179 /**

180 * \class LocalUNormalizer2Pointer

181 * "Smart pointer" class, closes a UNormalizer2 via unorm2_close().

182 * For most methods see the LocalPointerBase base class.

183 *

184 * @see LocalPointerBase

185 * @see LocalPointer

186 * @stable ICU 4.4

187 */

 U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close);

189

190 U_NAMESPACE_END

191

192 #endif

193

194 /**

195 * Writes the normalized form of the source string to the destination string

196 * (replacing its contents) and returns the length of the destination string.

197 * The source and destination strings must be different buffers.

198 * @param norm2 UNormalizer2 instance

199 * @param src source string

200 * @param length length of the source string, or -1 if NUL-terminated

201 * @param dest destination string; its contents are replaced with normalized src

202 * @param capacity number of UChars that can be written to dest

203 * @param pErrorCode Standard ICU error code. Its input value must

204 * pass the U_SUCCESS() test, or else the function returns

205 * immediately. Check for U_FAILURE() on output or use with

206 * function chaining. (See User Guide for details.)

207 * @return length of the destination string

208 * @stable ICU 4.4

209 */

210 U_STABLE int32_t U_EXPORT2

 unorm2_normalize(const UNormalizer2 *norm2,

                  const UChar *src, int32_t length,

213 UChar *dest, int32_t capacity,

214 UErrorCode *pErrorCode);

215 /**

216 * Appends the normalized form of the second string to the first string

217 * (merging them at the boundary) and returns the length of the first string.

218 * The result is normalized if the first string was normalized.

219 * The first and second strings must be different buffers.

220 * @param norm2 UNormalizer2 instance

221 * @param first string, should be normalized

222 * @param firstLength length of the first string, or -1 if NUL-terminated

223 * @param firstCapacity number of UChars that can be written to first

224 * @param second string, will be normalized

225 * @param secondLength length of the second string, or -1 if NUL-terminated

226 * @param pErrorCode Standard ICU error code. Its input value must

227 * pass the U_SUCCESS() test, or else the function returns

228 * immediately. Check for U_FAILURE() on output or use with

229 * function chaining. (See User Guide for details.)

230 * @return length of the first string

231 * @stable ICU 4.4

232 */

233 U_STABLE int32_t U_EXPORT2

 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,

                                 UChar *first, int32_t firstLength, int32_t firstCapacity,

                                 const UChar *second, int32_t secondLength,

237 UErrorCode *pErrorCode);

238 /**

239 * Appends the second string to the first string

240 * (merging them at the boundary) and returns the length of the first string.

241 * The result is normalized if both the strings were normalized.

242 * The first and second strings must be different buffers.

243 * @param norm2 UNormalizer2 instance

244 * @param first string, should be normalized

245 * @param firstLength length of the first string, or -1 if NUL-terminated

246 * @param firstCapacity number of UChars that can be written to first

247 * @param second string, should be normalized

248 * @param secondLength length of the second string, or -1 if NUL-terminated

249 * @param pErrorCode Standard ICU error code. Its input value must

250 * pass the U_SUCCESS() test, or else the function returns

251 * immediately. Check for U_FAILURE() on output or use with

252 * function chaining. (See User Guide for details.)

253 * @return length of the first string

254 * @stable ICU 4.4

255 */

256 U_STABLE int32_t U_EXPORT2

 unorm2_append(const UNormalizer2 *norm2,

               UChar *first, int32_t firstLength, int32_t firstCapacity,

               const UChar *second, int32_t secondLength,

260 UErrorCode *pErrorCode);

261

262 /**

263 * Gets the decomposition mapping of c. Equivalent to unorm2_normalize(string(c))

264 * on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster.

265 * This function is independent of the mode of the UNormalizer2.

266 * @param norm2 UNormalizer2 instance

267 * @param c code point

268 * @param decomposition String buffer which will be set to c's

269 * decomposition mapping, if there is one.

270 * @param capacity number of UChars that can be written to decomposition

271 * @param pErrorCode Standard ICU error code. Its input value must

272 * pass the U_SUCCESS() test, or else the function returns

273 * immediately. Check for U_FAILURE() on output or use with

274 * function chaining. (See User Guide for details.)

275 * @return the non-negative length of c's decomposition, if there is one; otherwise a negative value

276 * @draft ICU 4.6

277 */

278 U_DRAFT int32_t U_EXPORT2

 unorm2_getDecomposition(const UNormalizer2 *norm2,

                         UChar32 c, UChar *decomposition, int32_t capacity,

281 UErrorCode *pErrorCode);

282

283 /**

284 * Tests if the string is normalized.

285 * Internally, in cases where unorm2_quickCheck() would return "maybe" (UNORM_MAYBE)

286 * (which is only possible for the two COMPOSE modes) this function

287 * resolves to "yes" or "no" to provide a definitive result,

288 * at the cost of doing more work in those cases.

289 * @param norm2 UNormalizer2 instance

290 * @param s input string

291 * @param length length of the string, or -1 if NUL-terminated

292 * @param pErrorCode Standard ICU error code. Its input value must

293 * pass the U_SUCCESS() test, or else the function returns

294 * immediately. Check for U_FAILURE() on output or use with

295 * function chaining. (See User Guide for details.)

296 * @return TRUE if s is normalized

297 * @stable ICU 4.4

298 */

299 U_STABLE UBool U_EXPORT2

 unorm2_isNormalized(const UNormalizer2 *norm2,

                     const UChar *s, int32_t length,

302 UErrorCode *pErrorCode);

303

304 /**

305 * Tests if the string is normalized.

306 * For the two COMPOSE modes, the result could be "maybe" (UNORM_MAYBE) in cases that

307 * would take a little more work to resolve definitively.

308 * Use unorm2_spanQuickCheckYes() and unorm2_normalizeSecondAndAppend() for a faster

309 * combination of quick check + normalization, to avoid

310 * re-checking the "yes" prefix.

311 * @param norm2 UNormalizer2 instance

312 * @param s input string

313 * @param length length of the string, or -1 if NUL-terminated

314 * @param pErrorCode Standard ICU error code. Its input value must

315 * pass the U_SUCCESS() test, or else the function returns

316 * immediately. Check for U_FAILURE() on output or use with

317 * function chaining. (See User Guide for details.)

318 * @return UNormalizationCheckResult

319 * @stable ICU 4.4

320 */

321 U_STABLE UNormalizationCheckResult U_EXPORT2

 unorm2_quickCheck(const UNormalizer2 *norm2,

                   const UChar *s, int32_t length,

324 UErrorCode *pErrorCode);

325

326 /**

327 * Returns the end of the normalized substring of the input string.

328 * In other words, with <code>end=unorm2_spanQuickCheckYes(norm2, s, length, pErrorCode);</code>

329 * the substring of <code>s</code> from index 0 up to (not including) <code>end</code>

330 * will pass the quick check with a "yes" result.

331 *

332 * The returned end index is usually one or more characters before the

333 * "no" or "maybe" character: The end index is at a normalization boundary.

334 * (See the Normalizer2 C++ class documentation for more about normalization boundaries.)

335 *

336 * When the goal is a normalized string and most input strings are expected

337 * to be normalized already, then call this function,

338 * and if it returns a prefix shorter than the input string,

339 * copy that prefix and use unorm2_normalizeSecondAndAppend() for the remainder.

340 * @param norm2 UNormalizer2 instance

341 * @param s input string

342 * @param length length of the string, or -1 if NUL-terminated

343 * @param pErrorCode Standard ICU error code. Its input value must

344 * pass the U_SUCCESS() test, or else the function returns

345 * immediately. Check for U_FAILURE() on output or use with

346 * function chaining. (See User Guide for details.)

347 * @return "yes" span end index

348 * @stable ICU 4.4

349 */

350 U_STABLE int32_t U_EXPORT2

 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,

                          const UChar *s, int32_t length,

353 UErrorCode *pErrorCode);

354

355 /**

356 * Tests if the character always has a normalization boundary before it,

357 * regardless of context.

358 * For details see the Normalizer2 base class documentation.

359 * @param norm2 UNormalizer2 instance

360 * @param c character to test

361 * @return TRUE if c has a normalization boundary before it

362 * @stable ICU 4.4

363 */

364 U_STABLE UBool U_EXPORT2

 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);

366

367 /**

368 * Tests if the character always has a normalization boundary after it,

369 * regardless of context.

370 * For details see the Normalizer2 base class documentation.

371 * @param norm2 UNormalizer2 instance

372 * @param c character to test

373 * @return TRUE if c has a normalization boundary after it

374 * @stable ICU 4.4

375 */

376 U_STABLE UBool U_EXPORT2

 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);

378

379 /**

380 * Tests if the character is normalization-inert.

381 * For details see the Normalizer2 base class documentation.

382 * @param norm2 UNormalizer2 instance

383 * @param c character to test

384 * @return TRUE if c is normalization-inert

385 * @stable ICU 4.4

386 */

387 U_STABLE UBool U_EXPORT2

 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c);

389

390 #endif /* !UCONFIG_NO_NORMALIZATION */

391 #endif /* __UNORM2_H__ */