]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uchar.c
ICU-6.2.8.tar.gz
[apple/icu.git] / icuSources / common / uchar.c
1 /*
2 ********************************************************************************
3 * Copyright (C) 1996-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ********************************************************************************
6 *
7 * File UCHAR.C
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 04/02/97 aliu Creation.
13 * 4/15/99 Madhu Updated all the function definitions for C Implementation
14 * 5/20/99 Madhu Added the function u_getVersion()
15 * 8/19/1999 srl Upgraded scripts to Unicode3.0
16 * 11/11/1999 weiv added u_isalnum(), cleaned comments
17 * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion.
18 * 06/20/2000 helena OS/400 port changes; mostly typecast.
19 ******************************************************************************
20 */
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uscript.h"
25 #include "unicode/udata.h"
26 #include "umutex.h"
27 #include "cmemory.h"
28 #include "ucln_cmn.h"
29 #include "utrie.h"
30 #include "udataswp.h"
31 #include "unormimp.h" /* JAMO_L_BASE etc. */
32 #include "uprops.h"
33
/* number of elements of a true array (not a pointer), as an int32_t */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
35
36 /* dynamically loaded Unicode character properties -------------------------- */
37
38 /*
39 * loaded uprops.dat -
40 * for a description of the file format, see icu/source/tools/genprops/store.c
41 */
42 static const char DATA_NAME[] = "uprops";
43 static const char DATA_TYPE[] = "icu";
44
45 static UDataMemory *propsData=NULL;
46 static UErrorCode dataErrorCode=U_ZERO_ERROR;
47
48 static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
49 static UVersionInfo dataVersion={ 0, 0, 0, 0 };
50
51 static UTrie propsTrie={ 0 }, propsVectorsTrie={ 0 };
52 static const uint32_t *pData32=NULL, *props32Table=NULL, *exceptionsTable=NULL, *propsVectors=NULL;
53 static const UChar *ucharsTable=NULL;
54 static int32_t countPropsVectors=0, propsVectorsColumns=0;
55
56 static int8_t havePropsData=0; /* == 0 -> Data has not been loaded.
57 * < 0 -> Error occured attempting to load data.
58 * > 0 -> Data has been successfully loaded.
59 */
60
61 /* index values loaded from uprops.dat */
62 static int32_t indexes[UPROPS_INDEX_COUNT];
63
64 /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
65 static int32_t U_CALLCONV
66 getFoldingPropsOffset(uint32_t data) {
67 if(data&0x8000) {
68 return (int32_t)(data&0x7fff);
69 } else {
70 return 0;
71 }
72 }
73
74 static UBool U_CALLCONV
75 isAcceptable(void *context,
76 const char *type, const char *name,
77 const UDataInfo *pInfo) {
78 if(
79 pInfo->size>=20 &&
80 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
81 pInfo->charsetFamily==U_CHARSET_FAMILY &&
82 pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */
83 pInfo->dataFormat[1]==0x50 &&
84 pInfo->dataFormat[2]==0x72 &&
85 pInfo->dataFormat[3]==0x6f &&
86 pInfo->formatVersion[0]==3 &&
87 pInfo->formatVersion[2]==UTRIE_SHIFT &&
88 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
89 ) {
90 uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
91 uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
92 return TRUE;
93 } else {
94 return FALSE;
95 }
96 }
97
98 static UBool U_CALLCONV uchar_cleanup(void)
99 {
100 if (propsData) {
101 udata_close(propsData);
102 propsData=NULL;
103 }
104 pData32=NULL;
105 props32Table=NULL;
106 exceptionsTable=NULL;
107 ucharsTable=NULL;
108 propsVectors=NULL;
109 countPropsVectors=0;
110 dataErrorCode=U_ZERO_ERROR;
111 havePropsData=0;
112
113 return TRUE;
114 }
115
116 struct UCharProps {
117 UDataMemory *propsData;
118 UTrie propsTrie, propsVectorsTrie;
119 const uint32_t *pData32;
120 };
121 typedef struct UCharProps UCharProps;
122
123 /* open uprops.icu */
124 static void
125 _openProps(UCharProps *ucp, UErrorCode *pErrorCode) {
126 const uint32_t *p;
127 int32_t length;
128
129 ucp->propsData=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
130 if(U_FAILURE(*pErrorCode)) {
131 return;
132 }
133
134 ucp->pData32=p=(const uint32_t *)udata_getMemory(ucp->propsData);
135
136 /* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */
137 length=(int32_t)p[UPROPS_PROPS32_INDEX]*4;
138 length=utrie_unserialize(&ucp->propsTrie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, pErrorCode);
139 if(U_FAILURE(*pErrorCode)) {
140 return;
141 }
142 ucp->propsTrie.getFoldingOffset=getFoldingPropsOffset;
143
144 /* unserialize the properties vectors trie, if any */
145 if( p[UPROPS_ADDITIONAL_TRIE_INDEX]!=0 &&
146 p[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0
147 ) {
148 length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4;
149 length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode);
150 if(U_FAILURE(*pErrorCode)) {
151 uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie));
152 } else {
153 ucp->propsVectorsTrie.getFoldingOffset=getFoldingPropsOffset;
154 }
155 }
156 }
157
158 U_CFUNC int8_t
159 uprv_loadPropsData(UErrorCode *pErrorCode) {
160 /* load Unicode character properties data from file if necessary */
161
162 /*
163 * This lazy intialization with double-checked locking (without mutex protection for
164 * haveNormData==0) is transiently unsafe under certain circumstances.
165 * Check the readme and use u_init() if necessary.
166 */
167 if(havePropsData==0) {
168 UCharProps ucp={ NULL };
169 UCaseProps *csp;
170
171 if(U_FAILURE(*pErrorCode)) {
172 return havePropsData;
173 }
174
175 /* open the data outside the mutex block */
176 _openProps(&ucp, pErrorCode);
177
178 if(U_SUCCESS(*pErrorCode)) {
179 /* in the mutex block, set the data for this process */
180 umtx_lock(NULL);
181 if(propsData==NULL) {
182 propsData=ucp.propsData;
183 ucp.propsData=NULL;
184 pData32=ucp.pData32;
185 ucp.pData32=NULL;
186 uprv_memcpy(&propsTrie, &ucp.propsTrie, sizeof(propsTrie));
187 uprv_memcpy(&propsVectorsTrie, &ucp.propsVectorsTrie, sizeof(propsVectorsTrie));
188 csp=NULL;
189 }
190
191 /* initialize some variables */
192 uprv_memcpy(indexes, pData32, sizeof(indexes));
193 props32Table=pData32+indexes[UPROPS_PROPS32_INDEX];
194 exceptionsTable=pData32+indexes[UPROPS_EXCEPTIONS_INDEX];
195 ucharsTable=(const UChar *)(pData32+indexes[UPROPS_EXCEPTIONS_TOP_INDEX]);
196
197 /* additional properties */
198 if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) {
199 propsVectors=pData32+indexes[UPROPS_ADDITIONAL_VECTORS_INDEX];
200 countPropsVectors=indexes[UPROPS_RESERVED_INDEX]-indexes[UPROPS_ADDITIONAL_VECTORS_INDEX];
201 propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX];
202 }
203
204 havePropsData=1;
205 umtx_unlock(NULL);
206 } else {
207 dataErrorCode=*pErrorCode;
208 havePropsData=-1;
209 }
210 ucln_common_registerCleanup(UCLN_COMMON_UCHAR, uchar_cleanup);
211
212 /* if a different thread set it first, then close the extra data */
213 udata_close(ucp.propsData); /* NULL if it was set correctly */
214 }
215
216 return havePropsData;
217 }
218
219
220 static int8_t
221 loadPropsData(void) {
222 UErrorCode errorCode = U_ZERO_ERROR;
223 int8_t retVal = uprv_loadPropsData(&errorCode);
224 return retVal;
225 }
226
227
228 /* Unicode properties data swapping ----------------------------------------- */
229
230 U_CAPI int32_t U_EXPORT2
231 uprops_swap(const UDataSwapper *ds,
232 const void *inData, int32_t length, void *outData,
233 UErrorCode *pErrorCode) {
234 const UDataInfo *pInfo;
235 int32_t headerSize, i;
236
237 int32_t dataIndexes[UPROPS_INDEX_COUNT];
238 const int32_t *inData32;
239
240 /* udata_swapDataHeader checks the arguments */
241 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
242 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
243 return 0;
244 }
245
246 /* check data format and format version */
247 pInfo=(const UDataInfo *)((const char *)inData+4);
248 if(!(
249 pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */
250 pInfo->dataFormat[1]==0x50 &&
251 pInfo->dataFormat[2]==0x72 &&
252 pInfo->dataFormat[3]==0x6f &&
253 pInfo->formatVersion[0]==3 &&
254 pInfo->formatVersion[2]==UTRIE_SHIFT &&
255 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
256 )) {
257 udata_printError(ds, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n",
258 pInfo->dataFormat[0], pInfo->dataFormat[1],
259 pInfo->dataFormat[2], pInfo->dataFormat[3],
260 pInfo->formatVersion[0]);
261 *pErrorCode=U_UNSUPPORTED_ERROR;
262 return 0;
263 }
264
265 /* the properties file must contain at least the indexes array */
266 if(length>=0 && (length-headerSize)<sizeof(dataIndexes)) {
267 udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
268 length-headerSize);
269 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
270 return 0;
271 }
272
273 /* read the indexes */
274 inData32=(const int32_t *)((const char *)inData+headerSize);
275 for(i=0; i<UPROPS_INDEX_COUNT; ++i) {
276 dataIndexes[i]=udata_readInt32(ds, inData32[i]);
277 }
278
279 /*
280 * comments are copied from the data format description in genprops/store.c
281 * indexes[] constants are in uprops.h
282 */
283 if(length>=0) {
284 int32_t *outData32;
285
286 if((length-headerSize)<(4*dataIndexes[UPROPS_RESERVED_INDEX])) {
287 udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
288 length-headerSize);
289 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
290 return 0;
291 }
292
293 outData32=(int32_t *)((char *)outData+headerSize);
294
295 /* copy everything for inaccessible data (padding) */
296 if(inData32!=outData32) {
297 uprv_memcpy(outData32, inData32, 4*dataIndexes[UPROPS_RESERVED_INDEX]);
298 }
299
300 /* swap the indexes[16] */
301 ds->swapArray32(ds, inData32, 4*UPROPS_INDEX_COUNT, outData32, pErrorCode);
302
303 /*
304 * swap the main properties UTrie
305 * PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
306 */
307 utrie_swap(ds,
308 inData32+UPROPS_INDEX_COUNT,
309 4*(dataIndexes[UPROPS_PROPS32_INDEX]-UPROPS_INDEX_COUNT),
310 outData32+UPROPS_INDEX_COUNT,
311 pErrorCode);
312
313 /*
314 * swap the properties and exceptions words
315 * P const uint32_t props32[i1-i0];
316 * E const uint32_t exceptions[i2-i1];
317 */
318 ds->swapArray32(ds,
319 inData32+dataIndexes[UPROPS_PROPS32_INDEX],
320 4*(dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]-dataIndexes[UPROPS_PROPS32_INDEX]),
321 outData32+dataIndexes[UPROPS_PROPS32_INDEX],
322 pErrorCode);
323
324 /*
325 * swap the UChars
326 * U const UChar uchars[2*(i3-i2)];
327 */
328 ds->swapArray16(ds,
329 inData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX],
330 4*(dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]-dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]),
331 outData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX],
332 pErrorCode);
333
334 /*
335 * swap the additional UTrie
336 * i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
337 */
338 utrie_swap(ds,
339 inData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
340 4*(dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]),
341 outData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
342 pErrorCode);
343
344 /*
345 * swap the properties vectors
346 * PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
347 */
348 ds->swapArray32(ds,
349 inData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
350 4*(dataIndexes[UPROPS_RESERVED_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]),
351 outData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
352 pErrorCode);
353 }
354
355 /* i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table */
356 return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX];
357 }
358
359 /* constants and macros for access to the data ------------------------------ */
360
/* getting a uint32_t properties word from the data */

/* true if the properties data is loaded; attempts to load it if necessary */
#define HAVE_DATA (havePropsData>0 || loadPropsData()>0)

/* true if c is a code point (<=0x10ffff) and the properties data is available */
#define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)

/*
 * Look up the properties word for c without checking that the data is loaded.
 * Wrapped in do { } while(0) so that the two steps act as a single statement.
 */
#define GET_PROPS_UNSAFE(c, result) do { \
    UTRIE_GET16(&propsTrie, c, result); \
    (result)=props32Table[(result)]; \
} while(0)

/* Look up the properties word for c; the result is 0 if the data is not available. */
#define GET_PROPS(c, result) do { \
    if(HAVE_DATA) { \
        GET_PROPS_UNSAFE(c, result); \
    } else { \
        (result)=0; \
    } \
} while(0)
373
374 /* finding an exception value */
375 #define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1UL<<(index)))
376
377 /* number of bits in an 8-bit integer value */
378 #define EXC_GROUP 8
379 static const uint8_t flagsOffset[256]={
380 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
381 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
382 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
383 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
384 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
385 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
386 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
387 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
388 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
389 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
390 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
391 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
392 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
393 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
394 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
395 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
396 };
397
398 #define ADD_EXCEPTION_OFFSET(flags, index, offset) { \
399 if((index)>=EXC_GROUP) { \
400 (offset)+=flagsOffset[(flags)&((1<<EXC_GROUP)-1)]; \
401 (flags)>>=EXC_GROUP; \
402 (index)-=EXC_GROUP; \
403 } \
404 (offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \
405 }
406
407 U_CFUNC UBool
408 uprv_haveProperties(UErrorCode *pErrorCode) {
409 if(U_FAILURE(*pErrorCode)) {
410 return FALSE;
411 }
412 if(havePropsData==0) {
413 uprv_loadPropsData(pErrorCode);
414 }
415 if(havePropsData<0) {
416 *pErrorCode=dataErrorCode;
417 return FALSE;
418 }
419 return TRUE;
420 }
421
422 /* API functions ------------------------------------------------------------ */
423
424 /* Gets the Unicode character's general category.*/
425 U_CAPI int8_t U_EXPORT2
426 u_charType(UChar32 c) {
427 uint32_t props;
428 GET_PROPS(c, props);
429 return (int8_t)GET_CATEGORY(props);
430 }
431
432 /* Enumerate all code points with their general categories. */
433 struct _EnumTypeCallback {
434 UCharEnumTypeRange *enumRange;
435 const void *context;
436 };
437
438 static uint32_t U_CALLCONV
439 _enumTypeValue(const void *context, uint32_t value) {
440 /* access the general category from the 32-bit properties, and those from the 16-bit trie value */
441 return GET_CATEGORY(props32Table[value]);
442 }
443
444 static UBool U_CALLCONV
445 _enumTypeRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
446 /* just cast the value to UCharCategory */
447 return ((struct _EnumTypeCallback *)context)->
448 enumRange(((struct _EnumTypeCallback *)context)->context,
449 start, limit, (UCharCategory)value);
450 }
451
452 U_CAPI void U_EXPORT2
453 u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) {
454 struct _EnumTypeCallback callback;
455
456 if(enumRange==NULL || !HAVE_DATA) {
457 return;
458 }
459
460 callback.enumRange=enumRange;
461 callback.context=context;
462 utrie_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback);
463 }
464
465 /* Checks if ch is a lower case letter.*/
466 U_CAPI UBool U_EXPORT2
467 u_islower(UChar32 c) {
468 uint32_t props;
469 GET_PROPS(c, props);
470 return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER);
471 }
472
473 /* Checks if ch is an upper case letter.*/
474 U_CAPI UBool U_EXPORT2
475 u_isupper(UChar32 c) {
476 uint32_t props;
477 GET_PROPS(c, props);
478 return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER);
479 }
480
481 /* Checks if ch is a title case letter; usually upper case letters.*/
482 U_CAPI UBool U_EXPORT2
483 u_istitle(UChar32 c) {
484 uint32_t props;
485 GET_PROPS(c, props);
486 return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER);
487 }
488
489 /* Checks if ch is a decimal digit. */
490 U_CAPI UBool U_EXPORT2
491 u_isdigit(UChar32 c) {
492 uint32_t props;
493 GET_PROPS(c, props);
494 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
495 }
496
497 U_CAPI UBool U_EXPORT2
498 u_isxdigit(UChar32 c) {
499 uint32_t props;
500
501 /* check ASCII and Fullwidth ASCII a-fA-F */
502 if(
503 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
504 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
505 ) {
506 return TRUE;
507 }
508
509 GET_PROPS(c, props);
510 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
511 }
512
513 /* Checks if the Unicode character is a letter.*/
514 U_CAPI UBool U_EXPORT2
515 u_isalpha(UChar32 c) {
516 uint32_t props;
517 GET_PROPS(c, props);
518 return (UBool)((CAT_MASK(props)&U_GC_L_MASK)!=0);
519 }
520
521 U_CAPI UBool U_EXPORT2
522 u_isUAlphabetic(UChar32 c) {
523 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
524 }
525
526 /* Checks if ch is a letter or a decimal digit */
527 U_CAPI UBool U_EXPORT2
528 u_isalnum(UChar32 c) {
529 uint32_t props;
530 GET_PROPS(c, props);
531 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0);
532 }
533
534 /* Checks if ch is a unicode character with assigned character type.*/
535 U_CAPI UBool U_EXPORT2
536 u_isdefined(UChar32 c) {
537 uint32_t props;
538 GET_PROPS(c, props);
539 return (UBool)(GET_CATEGORY(props)!=0);
540 }
541
542 /* Checks if the Unicode character is a base form character that can take a diacritic.*/
543 U_CAPI UBool U_EXPORT2
544 u_isbase(UChar32 c) {
545 uint32_t props;
546 GET_PROPS(c, props);
547 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0);
548 }
549
550 /* Checks if the Unicode character is a control character.*/
551 U_CAPI UBool U_EXPORT2
552 u_iscntrl(UChar32 c) {
553 uint32_t props;
554 GET_PROPS(c, props);
555 return (UBool)((CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0);
556 }
557
558 U_CAPI UBool U_EXPORT2
559 u_isISOControl(UChar32 c) {
560 return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f);
561 }
562
/*
 * Some control characters that are used as space:
 * TAB..CR, 0x1c..0x1f and NL.
 * The argument is evaluated several times; it must not have side effects.
 */
#define IS_THAT_CONTROL_SPACE(c) \
    ((c)<=0x9f && (((c)>=TAB && (c)<=CR) || ((c)>=0x1c && (c)<=0x1f) || (c)==NL))
566
567 /* Checks if the Unicode character is a space character.*/
568 U_CAPI UBool U_EXPORT2
569 u_isspace(UChar32 c) {
570 uint32_t props;
571 GET_PROPS(c, props);
572 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c));
573 }
574
575 U_CAPI UBool U_EXPORT2
576 u_isJavaSpaceChar(UChar32 c) {
577 uint32_t props;
578 GET_PROPS(c, props);
579 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0);
580 }
581
582 /* Checks if the Unicode character is a whitespace character.*/
583 U_CAPI UBool U_EXPORT2
584 u_isWhitespace(UChar32 c) {
585 uint32_t props;
586 GET_PROPS(c, props);
587 return (UBool)(
588 ((CAT_MASK(props)&U_GC_Z_MASK)!=0 &&
589 c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */
590 IS_THAT_CONTROL_SPACE(c)
591 );
592 }
593
594 U_CAPI UBool U_EXPORT2
595 u_isblank(UChar32 c) {
596 if((uint32_t)c<=0x9f) {
597 return c==9 || c==0x20; /* TAB or SPACE */
598 } else {
599 /* White_Space but not LS (Zl) or PS (Zp) */
600 return u_isUWhiteSpace(c) && ((c&0xfffffffe)!=0x2028);
601 }
602 }
603
604 U_CAPI UBool U_EXPORT2
605 u_isUWhiteSpace(UChar32 c) {
606 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0;
607 }
608
609 /* Checks if the Unicode character is printable.*/
610 U_CAPI UBool U_EXPORT2
611 u_isprint(UChar32 c) {
612 uint32_t props;
613 GET_PROPS(c, props);
614 /* comparing ==0 returns FALSE for the categories mentioned */
615 return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
616 }
617
618 U_CAPI UBool U_EXPORT2
619 u_isgraph(UChar32 c) {
620 uint32_t props;
621 GET_PROPS(c, props);
622 /* comparing ==0 returns FALSE for the categories mentioned */
623 return (UBool)((CAT_MASK(props)&
624 (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
625 ==0);
626 }
627
628 U_CAPI UBool U_EXPORT2
629 u_ispunct(UChar32 c) {
630 uint32_t props;
631 GET_PROPS(c, props);
632 return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0);
633 }
634
635 /* Checks if the Unicode character can start a Unicode identifier.*/
636 U_CAPI UBool U_EXPORT2
637 u_isIDStart(UChar32 c) {
638 /* same as u_isalpha() */
639 uint32_t props;
640 GET_PROPS(c, props);
641 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_NL_MASK))!=0);
642 }
643
644 /* Checks if the Unicode character can be a Unicode identifier part other than starting the
645 identifier.*/
646 U_CAPI UBool U_EXPORT2
647 u_isIDPart(UChar32 c) {
648 uint32_t props;
649 GET_PROPS(c, props);
650 return (UBool)(
651 (CAT_MASK(props)&
652 (U_GC_ND_MASK|U_GC_NL_MASK|
653 U_GC_L_MASK|
654 U_GC_PC_MASK|U_GC_MC_MASK|U_GC_MN_MASK)
655 )!=0 ||
656 u_isIDIgnorable(c));
657 }
658
659 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
660 U_CAPI UBool U_EXPORT2
661 u_isIDIgnorable(UChar32 c) {
662 if(c<=0x9f) {
663 return u_isISOControl(c) && !IS_THAT_CONTROL_SPACE(c);
664 } else {
665 uint32_t props;
666 GET_PROPS(c, props);
667 return (UBool)(GET_CATEGORY(props)==U_FORMAT_CHAR);
668 }
669 }
670
671 /*Checks if the Unicode character can start a Java identifier.*/
672 U_CAPI UBool U_EXPORT2
673 u_isJavaIDStart(UChar32 c) {
674 uint32_t props;
675 GET_PROPS(c, props);
676 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0);
677 }
678
679 /*Checks if the Unicode character can be a Java identifier part other than starting the
680 * identifier.
681 */
682 U_CAPI UBool U_EXPORT2
683 u_isJavaIDPart(UChar32 c) {
684 uint32_t props;
685 GET_PROPS(c, props);
686 return (UBool)(
687 (CAT_MASK(props)&
688 (U_GC_ND_MASK|U_GC_NL_MASK|
689 U_GC_L_MASK|
690 U_GC_SC_MASK|U_GC_PC_MASK|
691 U_GC_MC_MASK|U_GC_MN_MASK)
692 )!=0 ||
693 u_isIDIgnorable(c));
694 }
695
696 U_CAPI int32_t U_EXPORT2
697 u_charDigitValue(UChar32 c) {
698 uint32_t props, numericType;
699 GET_PROPS(c, props);
700 numericType=GET_NUMERIC_TYPE(props);
701
702 if(numericType==1) {
703 if(!PROPS_VALUE_IS_EXCEPTION(props)) {
704 return GET_SIGNED_VALUE(props);
705 } else {
706 const uint32_t *pe=GET_EXCEPTIONS(props);
707 uint32_t firstExceptionValue=*pe;
708 if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_NUMERIC_VALUE)) {
709 int i=EXC_NUMERIC_VALUE;
710 ++pe;
711 ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
712 return (int32_t)*pe;
713 }
714 }
715 }
716
717 return -1;
718 }
719
720 U_CAPI double U_EXPORT2
721 u_getNumericValue(UChar32 c) {
722 uint32_t props, numericType;
723 GET_PROPS(c, props);
724 numericType=GET_NUMERIC_TYPE(props);
725
726 if(numericType==0 || numericType>=(int32_t)U_NT_COUNT) {
727 return U_NO_NUMERIC_VALUE;
728 } else {
729 if(!PROPS_VALUE_IS_EXCEPTION(props)) {
730 return GET_SIGNED_VALUE(props);
731 } else {
732 const uint32_t *pe;
733 uint32_t firstExceptionValue;
734
735 double numValue;
736 uint32_t denominator;
737
738 pe=GET_EXCEPTIONS(props);
739 firstExceptionValue=*pe++;
740
741 if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_NUMERIC_VALUE)) {
742 uint32_t flags=firstExceptionValue;
743 int i=EXC_NUMERIC_VALUE;
744 const uint32_t *p=pe;
745 int32_t numerator;
746
747 ADD_EXCEPTION_OFFSET(flags, i, p);
748 numerator=(int32_t)*p;
749
750 /*
751 * There are special values for huge numbers that are powers of ten.
752 * genprops/store.c documents:
753 * if numericValue=0x7fffff00+x then numericValue=10^x
754 */
755 if(numerator<0x7fffff00) {
756 numValue=(double)numerator;
757 } else {
758 numerator&=0xff;
759
760 /* 10^x without math.h */
761 numValue=1.;
762 while(numerator>=4) {
763 numValue*=10000.;
764 numerator-=4;
765 }
766 switch(numerator) {
767 case 3:
768 numValue*=1000.;
769 break;
770 case 2:
771 numValue*=100.;
772 break;
773 case 1:
774 numValue*=10.;
775 break;
776 case 0:
777 default:
778 break;
779 }
780 }
781 } else {
782 numValue=0.;
783 }
784 if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_DENOMINATOR_VALUE)) {
785 uint32_t flags=firstExceptionValue;
786 int i=EXC_DENOMINATOR_VALUE;
787 const uint32_t *p=pe;
788 ADD_EXCEPTION_OFFSET(flags, i, p);
789 denominator=*p;
790 } else {
791 denominator=0;
792 }
793
794 switch(firstExceptionValue&((1UL<<EXC_NUMERIC_VALUE)|(1UL<<EXC_DENOMINATOR_VALUE))) {
795 case 1UL<<EXC_NUMERIC_VALUE:
796 return numValue;
797 case 1UL<<EXC_DENOMINATOR_VALUE:
798 return (double)1./(double)denominator;
799 case (1UL<<EXC_NUMERIC_VALUE)|(1UL<<EXC_DENOMINATOR_VALUE):
800 return numValue/(double)denominator;
801 case 0: /* none (should not occur with numericType>0) */
802 default:
803 return U_NO_NUMERIC_VALUE;
804 }
805 }
806 }
807 }
808
809 /* Gets the character's linguistic directionality.*/
810 U_CAPI UCharDirection U_EXPORT2
811 u_charDirection(UChar32 c) {
812 uint32_t props;
813 GET_PROPS(c, props);
814 return (UCharDirection)GET_BIDI_CLASS(props);
815 }
816
817 U_CAPI UBool U_EXPORT2
818 u_isMirrored(UChar32 c) {
819 uint32_t props;
820 GET_PROPS(c, props);
821 return (UBool)(props&(1UL<<UPROPS_MIRROR_SHIFT) ? TRUE : FALSE);
822 }
823
824 U_CAPI UChar32 U_EXPORT2
825 u_charMirror(UChar32 c) {
826 uint32_t props;
827 GET_PROPS(c, props);
828 if((props&(1UL<<UPROPS_MIRROR_SHIFT))==0) {
829 /* not mirrored - the value is not a mirror offset */
830 return c;
831 } else if(!PROPS_VALUE_IS_EXCEPTION(props)) {
832 return c+GET_SIGNED_VALUE(props);
833 } else {
834 const uint32_t *pe=GET_EXCEPTIONS(props);
835 uint32_t firstExceptionValue=*pe;
836 if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_MIRROR_MAPPING)) {
837 int i=EXC_MIRROR_MAPPING;
838 ++pe;
839 ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
840 return (UChar32)*pe;
841 } else {
842 return c;
843 }
844 }
845 }
846
847 /* ICU 2.1: u_getCombiningClass() moved to unorm.cpp */
848
849 U_CAPI int32_t U_EXPORT2
850 u_digit(UChar32 ch, int8_t radix) {
851 int8_t value;
852 if((uint8_t)(radix-2)<=(36-2)) {
853 value=(int8_t)u_charDigitValue(ch);
854 if(value<0) {
855 /* ch is not a decimal digit, try latin letters */
856 if(ch>=0x61 && ch<=0x7A) {
857 value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */
858 } else if(ch>=0x41 && ch<=0x5A) {
859 value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */
860 } else if(ch>=0xFF41 && ch<=0xFF5A) {
861 value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */
862 } else if(ch>=0xFF21 && ch<=0xFF3A) {
863 value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */
864 }
865 }
866 } else {
867 value=-1; /* invalid radix */
868 }
869 return (int8_t)((value<radix) ? value : -1);
870 }
871
872 U_CAPI UChar32 U_EXPORT2
873 u_forDigit(int32_t digit, int8_t radix) {
874 if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) {
875 return 0;
876 } else if(digit<10) {
877 return (UChar32)(0x30+digit);
878 } else {
879 return (UChar32)((0x61-10)+digit);
880 }
881 }
882
883 /* miscellaneous, and support for uprops.c ---------------------------------- */
884
885 U_CAPI void U_EXPORT2
886 u_getUnicodeVersion(UVersionInfo versionArray) {
887 if(versionArray!=NULL) {
888 if(HAVE_DATA) {
889 uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH);
890 } else {
891 uprv_memset(versionArray, 0, U_MAX_VERSION_LENGTH);
892 }
893 }
894 }
895
896 U_CFUNC uint32_t
897 u_getUnicodeProperties(UChar32 c, int32_t column) {
898 uint16_t vecIndex;
899
900 if(column==-1) {
901 uint32_t props;
902 GET_PROPS(c, props);
903 return props;
904 } else if( !HAVE_DATA || countPropsVectors==0 ||
905 (uint32_t)c>0x10ffff ||
906 column<0 || column>=propsVectorsColumns
907 ) {
908 return 0;
909 } else {
910 UTRIE_GET16(&propsVectorsTrie, c, vecIndex);
911 return propsVectors[vecIndex+column];
912 }
913 }
914
915 U_CFUNC int32_t
916 uprv_getMaxValues(int32_t column) {
917 if(HAVE_DATA) {
918 switch(column) {
919 case 0:
920 return indexes[UPROPS_MAX_VALUES_INDEX];
921 case 2:
922 return indexes[UPROPS_MAX_VALUES_2_INDEX];
923 default:
924 return 0;
925 }
926 } else {
927 return 0;
928 }
929 }
930
931 /*
932 * get Hangul Syllable Type
933 * implemented here so that uchar.c (uhst_addPropertyStarts())
934 * does not depend on uprops.c (u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE))
935 */
936 U_CFUNC UHangulSyllableType
937 uchar_getHST(UChar32 c) {
938 /* purely algorithmic; hardcode known characters, check for assigned new ones */
939 if(c<JAMO_L_BASE) {
940 /* U_HST_NOT_APPLICABLE */
941 } else if(c<=0x11ff) {
942 /* Jamo range */
943 if(c<=0x115f) {
944 /* Jamo L range, HANGUL CHOSEONG ... */
945 if(c==0x115f || c<=0x1159 || u_charType(c)==U_OTHER_LETTER) {
946 return U_HST_LEADING_JAMO;
947 }
948 } else if(c<=0x11a7) {
949 /* Jamo V range, HANGUL JUNGSEONG ... */
950 if(c<=0x11a2 || u_charType(c)==U_OTHER_LETTER) {
951 return U_HST_VOWEL_JAMO;
952 }
953 } else {
954 /* Jamo T range */
955 if(c<=0x11f9 || u_charType(c)==U_OTHER_LETTER) {
956 return U_HST_TRAILING_JAMO;
957 }
958 }
959 } else if((c-=HANGUL_BASE)<0) {
960 /* U_HST_NOT_APPLICABLE */
961 } else if(c<HANGUL_COUNT) {
962 /* Hangul syllable */
963 return c%JAMO_T_COUNT==0 ? U_HST_LV_SYLLABLE : U_HST_LVT_SYLLABLE;
964 }
965 return U_HST_NOT_APPLICABLE;
966 }
967
968 U_CAPI void U_EXPORT2
969 u_charAge(UChar32 c, UVersionInfo versionArray) {
970 if(versionArray!=NULL) {
971 uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT;
972 versionArray[0]=(uint8_t)(version>>4);
973 versionArray[1]=(uint8_t)(version&0xf);
974 versionArray[2]=versionArray[3]=0;
975 }
976 }
977
978 U_CAPI UScriptCode U_EXPORT2
979 uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
980 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
981 return 0;
982 }
983 if((uint32_t)c>0x10ffff) {
984 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
985 return 0;
986 }
987
988 return (UScriptCode)(u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_MASK);
989 }
990
991 U_CAPI UBlockCode U_EXPORT2
992 ublock_getCode(UChar32 c) {
993 return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT);
994 }
995
996 /* property starts for UnicodeSet ------------------------------------------- */
997
998 /* for Hangul_Syllable_Type */
999 U_CAPI void U_EXPORT2
1000 uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
1001 UChar32 c;
1002 int32_t value, value2;
1003
1004 if(U_FAILURE(*pErrorCode)) {
1005 return;
1006 }
1007
1008 if(!HAVE_DATA) {
1009 *pErrorCode=dataErrorCode;
1010 return;
1011 }
1012
1013 /* add code points with hardcoded properties, plus the ones following them */
1014
1015 /*
1016 * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE.
1017 * First, we add fixed boundaries for the blocks of Jamos.
1018 * Then we check in loops to see where the current Unicode version
1019 * actually stops assigning such Jamos. We start each loop
1020 * at the end of the per-Jamo-block assignments in Unicode 4 or earlier.
1021 * (These have not changed since Unicode 2.)
1022 */
1023 sa->add(sa->set, 0x1100);
1024 value=U_HST_LEADING_JAMO;
1025 for(c=0x115a; c<=0x115f; ++c) {
1026 value2=uchar_getHST(c);
1027 if(value!=value2) {
1028 value=value2;
1029 sa->add(sa->set, c);
1030 }
1031 }
1032
1033 sa->add(sa->set, 0x1160);
1034 value=U_HST_VOWEL_JAMO;
1035 for(c=0x11a3; c<=0x11a7; ++c) {
1036 value2=uchar_getHST(c);
1037 if(value!=value2) {
1038 value=value2;
1039 sa->add(sa->set, c);
1040 }
1041 }
1042
1043 sa->add(sa->set, 0x11a8);
1044 value=U_HST_TRAILING_JAMO;
1045 for(c=0x11fa; c<=0x11ff; ++c) {
1046 value2=uchar_getHST(c);
1047 if(value!=value2) {
1048 value=value2;
1049 sa->add(sa->set, c);
1050 }
1051 }
1052
1053 /* Add Hangul type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. */
1054 for(c=HANGUL_BASE; c<(HANGUL_BASE+HANGUL_COUNT); c+=JAMO_T_COUNT) {
1055 sa->add(sa->set, c);
1056 sa->add(sa->set, c+1);
1057 }
1058 sa->add(sa->set, c);
1059 }
1060
1061 static UBool U_CALLCONV
1062 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
1063 /* add the start code point to the USet */
1064 USetAdder *sa=(USetAdder *)context;
1065 sa->add(sa->set, start);
1066 return TRUE;
1067 }
1068
/*
 * Add cp and cp+1 to the set behind the USetAdder sa.
 * Wrapped in do { } while(0) so that both adds act as a single statement;
 * the arguments are evaluated twice and must not have side effects.
 */
#define USET_ADD_CP_AND_NEXT(sa, cp) do { \
    (sa)->add((sa)->set, (cp)); \
    (sa)->add((sa)->set, (cp)+1); \
} while(0)
1070
1071 U_CAPI void U_EXPORT2
1072 uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
1073 if(U_FAILURE(*pErrorCode)) {
1074 return;
1075 }
1076
1077 if(!HAVE_DATA) {
1078 *pErrorCode=dataErrorCode;
1079 return;
1080 }
1081
1082 /* add the start code point of each same-value range of each trie */
1083 utrie_enum(&propsTrie, NULL, _enumPropertyStartsRange, sa);
1084 utrie_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, sa);
1085
1086 /* add code points with hardcoded properties, plus the ones following them */
1087
1088 /* add for IS_THAT_CONTROL_SPACE() */
1089 sa->add(sa->set, TAB); /* range TAB..CR */
1090 sa->add(sa->set, CR+1);
1091 sa->add(sa->set, 0x1c);
1092 sa->add(sa->set, 0x1f+1);
1093 USET_ADD_CP_AND_NEXT(sa, NL);
1094
1095 /* add for u_isIDIgnorable() what was not added above */
1096 sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */
1097 sa->add(sa->set, HAIRSP);
1098 sa->add(sa->set, RLM+1);
1099 sa->add(sa->set, INHSWAP);
1100 sa->add(sa->set, NOMDIG+1);
1101 USET_ADD_CP_AND_NEXT(sa, ZWNBSP);
1102
1103 /* add no-break spaces for u_isWhitespace() what was not added above */
1104 USET_ADD_CP_AND_NEXT(sa, NBSP);
1105 USET_ADD_CP_AND_NEXT(sa, FIGURESP);
1106 USET_ADD_CP_AND_NEXT(sa, NNBSP);
1107
1108 /* add for u_charDigitValue() */
1109 USET_ADD_CP_AND_NEXT(sa, 0x3007);
1110 USET_ADD_CP_AND_NEXT(sa, 0x4e00);
1111 USET_ADD_CP_AND_NEXT(sa, 0x4e8c);
1112 USET_ADD_CP_AND_NEXT(sa, 0x4e09);
1113 USET_ADD_CP_AND_NEXT(sa, 0x56db);
1114 USET_ADD_CP_AND_NEXT(sa, 0x4e94);
1115 USET_ADD_CP_AND_NEXT(sa, 0x516d);
1116 USET_ADD_CP_AND_NEXT(sa, 0x4e03);
1117 USET_ADD_CP_AND_NEXT(sa, 0x516b);
1118 USET_ADD_CP_AND_NEXT(sa, 0x4e5d);
1119
1120 /* add for u_digit() */
1121 sa->add(sa->set, U_a);
1122 sa->add(sa->set, U_z+1);
1123 sa->add(sa->set, U_A);
1124 sa->add(sa->set, U_Z+1);
1125
1126 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
1127 sa->add(sa->set, WJ); /* range WJ..NOMDIG */
1128 sa->add(sa->set, 0xfff0);
1129 sa->add(sa->set, 0xfffb+1);
1130 sa->add(sa->set, 0xe0000);
1131 sa->add(sa->set, 0xe0fff+1);
1132
1133 /* add for UCHAR_GRAPHEME_BASE and others */
1134 USET_ADD_CP_AND_NEXT(sa, CGJ);
1135
1136 /* add for UCHAR_JOINING_TYPE */
1137 sa->add(sa->set, ZWNJ); /* range ZWNJ..ZWJ */
1138 sa->add(sa->set, ZWJ+1);
1139 }