2 ********************************************************************************
3 * Copyright (C) 1996-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ********************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 04/02/97 aliu Creation.
13 * 4/15/99 Madhu Updated all the function definitions for C Implementation
14 * 5/20/99 Madhu Added the function u_getVersion()
15 * 8/19/1999 srl Upgraded scripts to Unicode3.0
16 * 11/11/1999 weiv added u_isalnum(), cleaned comments
17 * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion.
18 * 06/20/2000 helena OS/400 port changes; mostly typecast.
19 ******************************************************************************
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uscript.h"
25 #include "unicode/udata.h"
31 #include "unormimp.h" /* JAMO_L_BASE etc. */
34 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
36 /* dynamically loaded Unicode character properties -------------------------- */
40 * for a description of the file format, see icu/source/tools/genprops/store.c
42 static const char DATA_NAME
[] = "uprops";
43 static const char DATA_TYPE
[] = "icu";
45 static UDataMemory
*propsData
=NULL
;
46 static UErrorCode dataErrorCode
=U_ZERO_ERROR
;
48 static uint8_t formatVersion
[4]={ 0, 0, 0, 0 };
49 static UVersionInfo dataVersion
={ 0, 0, 0, 0 };
51 static UTrie propsTrie
={ 0 }, propsVectorsTrie
={ 0 };
52 static const uint32_t *pData32
=NULL
, *props32Table
=NULL
, *exceptionsTable
=NULL
, *propsVectors
=NULL
;
53 static const UChar
*ucharsTable
=NULL
;
54 static int32_t countPropsVectors
=0, propsVectorsColumns
=0;
56 static int8_t havePropsData
=0; /* == 0 -> Data has not been loaded.
57 * < 0 -> Error occurred attempting to load data.
58 * > 0 -> Data has been successfully loaded.
61 /* index values loaded from uprops.dat */
62 static int32_t indexes
[UPROPS_INDEX_COUNT
];
64 /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
65 static int32_t U_CALLCONV
66 getFoldingPropsOffset(uint32_t data
) {
68 return (int32_t)(data
&0x7fff);
74 static UBool U_CALLCONV
75 isAcceptable(void *context
,
76 const char *type
, const char *name
,
77 const UDataInfo
*pInfo
) {
80 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
81 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
82 pInfo
->dataFormat
[0]==0x55 && /* dataFormat="UPro" */
83 pInfo
->dataFormat
[1]==0x50 &&
84 pInfo
->dataFormat
[2]==0x72 &&
85 pInfo
->dataFormat
[3]==0x6f &&
86 pInfo
->formatVersion
[0]==3 &&
87 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
88 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
90 uprv_memcpy(formatVersion
, pInfo
->formatVersion
, 4);
91 uprv_memcpy(dataVersion
, pInfo
->dataVersion
, 4);
98 static UBool U_CALLCONV
uchar_cleanup(void)
101 udata_close(propsData
);
106 exceptionsTable
=NULL
;
110 dataErrorCode
=U_ZERO_ERROR
;
117 UDataMemory
*propsData
;
118 UTrie propsTrie
, propsVectorsTrie
;
119 const uint32_t *pData32
;
121 typedef struct UCharProps UCharProps
;
123 /* open uprops.icu */
125 _openProps(UCharProps
*ucp
, UErrorCode
*pErrorCode
) {
129 ucp
->propsData
=udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, pErrorCode
);
130 if(U_FAILURE(*pErrorCode
)) {
134 ucp
->pData32
=p
=(const uint32_t *)udata_getMemory(ucp
->propsData
);
136 /* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */
137 length
=(int32_t)p
[UPROPS_PROPS32_INDEX
]*4;
138 length
=utrie_unserialize(&ucp
->propsTrie
, (const uint8_t *)(p
+UPROPS_INDEX_COUNT
), length
-64, pErrorCode
);
139 if(U_FAILURE(*pErrorCode
)) {
142 ucp
->propsTrie
.getFoldingOffset
=getFoldingPropsOffset
;
144 /* unserialize the properties vectors trie, if any */
145 if( p
[UPROPS_ADDITIONAL_TRIE_INDEX
]!=0 &&
146 p
[UPROPS_ADDITIONAL_VECTORS_INDEX
]!=0
148 length
=(int32_t)(p
[UPROPS_ADDITIONAL_VECTORS_INDEX
]-p
[UPROPS_ADDITIONAL_TRIE_INDEX
])*4;
149 length
=utrie_unserialize(&ucp
->propsVectorsTrie
, (const uint8_t *)(p
+p
[UPROPS_ADDITIONAL_TRIE_INDEX
]), length
, pErrorCode
);
150 if(U_FAILURE(*pErrorCode
)) {
151 uprv_memset(&ucp
->propsVectorsTrie
, 0, sizeof(ucp
->propsVectorsTrie
));
153 ucp
->propsVectorsTrie
.getFoldingOffset
=getFoldingPropsOffset
;
159 uprv_loadPropsData(UErrorCode
*pErrorCode
) {
160 /* load Unicode character properties data from file if necessary */
163 * This lazy initialization with double-checked locking (without mutex protection for
164 * havePropsData==0) is transiently unsafe under certain circumstances.
165 * Check the readme and use u_init() if necessary.
167 if(havePropsData
==0) {
168 UCharProps ucp
={ NULL
};
171 if(U_FAILURE(*pErrorCode
)) {
172 return havePropsData
;
175 /* open the data outside the mutex block */
176 _openProps(&ucp
, pErrorCode
);
178 if(U_SUCCESS(*pErrorCode
)) {
179 /* in the mutex block, set the data for this process */
181 if(propsData
==NULL
) {
182 propsData
=ucp
.propsData
;
186 uprv_memcpy(&propsTrie
, &ucp
.propsTrie
, sizeof(propsTrie
));
187 uprv_memcpy(&propsVectorsTrie
, &ucp
.propsVectorsTrie
, sizeof(propsVectorsTrie
));
191 /* initialize some variables */
192 uprv_memcpy(indexes
, pData32
, sizeof(indexes
));
193 props32Table
=pData32
+indexes
[UPROPS_PROPS32_INDEX
];
194 exceptionsTable
=pData32
+indexes
[UPROPS_EXCEPTIONS_INDEX
];
195 ucharsTable
=(const UChar
*)(pData32
+indexes
[UPROPS_EXCEPTIONS_TOP_INDEX
]);
197 /* additional properties */
198 if(indexes
[UPROPS_ADDITIONAL_VECTORS_INDEX
]!=0) {
199 propsVectors
=pData32
+indexes
[UPROPS_ADDITIONAL_VECTORS_INDEX
];
200 countPropsVectors
=indexes
[UPROPS_RESERVED_INDEX
]-indexes
[UPROPS_ADDITIONAL_VECTORS_INDEX
];
201 propsVectorsColumns
=indexes
[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX
];
207 dataErrorCode
=*pErrorCode
;
210 ucln_common_registerCleanup(UCLN_COMMON_UCHAR
, uchar_cleanup
);
212 /* if a different thread set it first, then close the extra data */
213 udata_close(ucp
.propsData
); /* NULL if it was set correctly */
216 return havePropsData
;
221 loadPropsData(void) {
222 UErrorCode errorCode
= U_ZERO_ERROR
;
223 int8_t retVal
= uprv_loadPropsData(&errorCode
);
228 /* Unicode properties data swapping ----------------------------------------- */
230 U_CAPI
int32_t U_EXPORT2
231 uprops_swap(const UDataSwapper
*ds
,
232 const void *inData
, int32_t length
, void *outData
,
233 UErrorCode
*pErrorCode
) {
234 const UDataInfo
*pInfo
;
235 int32_t headerSize
, i
;
237 int32_t dataIndexes
[UPROPS_INDEX_COUNT
];
238 const int32_t *inData32
;
240 /* udata_swapDataHeader checks the arguments */
241 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
242 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
246 /* check data format and format version */
247 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
249 pInfo
->dataFormat
[0]==0x55 && /* dataFormat="UPro" */
250 pInfo
->dataFormat
[1]==0x50 &&
251 pInfo
->dataFormat
[2]==0x72 &&
252 pInfo
->dataFormat
[3]==0x6f &&
253 pInfo
->formatVersion
[0]==3 &&
254 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
255 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
257 udata_printError(ds
, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n",
258 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
259 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
260 pInfo
->formatVersion
[0]);
261 *pErrorCode
=U_UNSUPPORTED_ERROR
;
265 /* the properties file must contain at least the indexes array */
266 if(length
>=0 && (length
-headerSize
)<sizeof(dataIndexes
)) {
267 udata_printError(ds
, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
269 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
273 /* read the indexes */
274 inData32
=(const int32_t *)((const char *)inData
+headerSize
);
275 for(i
=0; i
<UPROPS_INDEX_COUNT
; ++i
) {
276 dataIndexes
[i
]=udata_readInt32(ds
, inData32
[i
]);
280 * comments are copied from the data format description in genprops/store.c
281 * indexes[] constants are in uprops.h
286 if((length
-headerSize
)<(4*dataIndexes
[UPROPS_RESERVED_INDEX
])) {
287 udata_printError(ds
, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
289 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
293 outData32
=(int32_t *)((char *)outData
+headerSize
);
295 /* copy everything for inaccessible data (padding) */
296 if(inData32
!=outData32
) {
297 uprv_memcpy(outData32
, inData32
, 4*dataIndexes
[UPROPS_RESERVED_INDEX
]);
300 /* swap the indexes[16] */
301 ds
->swapArray32(ds
, inData32
, 4*UPROPS_INDEX_COUNT
, outData32
, pErrorCode
);
304 * swap the main properties UTrie
305 * PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
308 inData32
+UPROPS_INDEX_COUNT
,
309 4*(dataIndexes
[UPROPS_PROPS32_INDEX
]-UPROPS_INDEX_COUNT
),
310 outData32
+UPROPS_INDEX_COUNT
,
314 * swap the properties and exceptions words
315 * P const uint32_t props32[i1-i0];
316 * E const uint32_t exceptions[i2-i1];
319 inData32
+dataIndexes
[UPROPS_PROPS32_INDEX
],
320 4*(dataIndexes
[UPROPS_EXCEPTIONS_TOP_INDEX
]-dataIndexes
[UPROPS_PROPS32_INDEX
]),
321 outData32
+dataIndexes
[UPROPS_PROPS32_INDEX
],
326 * U const UChar uchars[2*(i3-i2)];
329 inData32
+dataIndexes
[UPROPS_EXCEPTIONS_TOP_INDEX
],
330 4*(dataIndexes
[UPROPS_ADDITIONAL_TRIE_INDEX
]-dataIndexes
[UPROPS_EXCEPTIONS_TOP_INDEX
]),
331 outData32
+dataIndexes
[UPROPS_EXCEPTIONS_TOP_INDEX
],
335 * swap the additional UTrie
336 * i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
339 inData32
+dataIndexes
[UPROPS_ADDITIONAL_TRIE_INDEX
],
340 4*(dataIndexes
[UPROPS_ADDITIONAL_VECTORS_INDEX
]-dataIndexes
[UPROPS_ADDITIONAL_TRIE_INDEX
]),
341 outData32
+dataIndexes
[UPROPS_ADDITIONAL_TRIE_INDEX
],
345 * swap the properties vectors
346 * PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
349 inData32
+dataIndexes
[UPROPS_ADDITIONAL_VECTORS_INDEX
],
350 4*(dataIndexes
[UPROPS_RESERVED_INDEX
]-dataIndexes
[UPROPS_ADDITIONAL_VECTORS_INDEX
]),
351 outData32
+dataIndexes
[UPROPS_ADDITIONAL_VECTORS_INDEX
],
355 /* i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table */
356 return headerSize
+4*dataIndexes
[UPROPS_RESERVED_INDEX
];
359 /* constants and macros for access to the data ------------------------------ */
361 /* getting a uint32_t properties word from the data */
362 #define HAVE_DATA (havePropsData>0 || loadPropsData()>0)
363 #define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)
364 #define GET_PROPS_UNSAFE(c, result) \
365 UTRIE_GET16(&propsTrie, c, result); \
366 (result)=props32Table[(result)]
367 #define GET_PROPS(c, result) \
369 GET_PROPS_UNSAFE(c, result); \
374 /* finding an exception value */
375 #define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1UL<<(index)))
377 /* number of bits in an 8-bit integer value */
379 static const uint8_t flagsOffset
[256]={
380 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
381 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
382 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
383 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
384 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
385 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
386 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
387 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
388 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
389 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
390 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
391 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
392 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
393 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
394 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
395 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
398 #define ADD_EXCEPTION_OFFSET(flags, index, offset) { \
399 if((index)>=EXC_GROUP) { \
400 (offset)+=flagsOffset[(flags)&((1<<EXC_GROUP)-1)]; \
401 (flags)>>=EXC_GROUP; \
402 (index)-=EXC_GROUP; \
404 (offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \
408 uprv_haveProperties(UErrorCode
*pErrorCode
) {
409 if(U_FAILURE(*pErrorCode
)) {
412 if(havePropsData
==0) {
413 uprv_loadPropsData(pErrorCode
);
415 if(havePropsData
<0) {
416 *pErrorCode
=dataErrorCode
;
422 /* API functions ------------------------------------------------------------ */
424 /* Gets the Unicode character's general category.*/
425 U_CAPI
int8_t U_EXPORT2
426 u_charType(UChar32 c
) {
429 return (int8_t)GET_CATEGORY(props
);
432 /* Enumerate all code points with their general categories. */
433 struct _EnumTypeCallback
{
434 UCharEnumTypeRange
*enumRange
;
438 static uint32_t U_CALLCONV
439 _enumTypeValue(const void *context
, uint32_t value
) {
440 /* access the general category from the 32-bit properties, and those from the 16-bit trie value */
441 return GET_CATEGORY(props32Table
[value
]);
444 static UBool U_CALLCONV
445 _enumTypeRange(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
446 /* just cast the value to UCharCategory */
447 return ((struct _EnumTypeCallback
*)context
)->
448 enumRange(((struct _EnumTypeCallback
*)context
)->context
,
449 start
, limit
, (UCharCategory
)value
);
452 U_CAPI
void U_EXPORT2
453 u_enumCharTypes(UCharEnumTypeRange
*enumRange
, const void *context
) {
454 struct _EnumTypeCallback callback
;
456 if(enumRange
==NULL
|| !HAVE_DATA
) {
460 callback
.enumRange
=enumRange
;
461 callback
.context
=context
;
462 utrie_enum(&propsTrie
, _enumTypeValue
, _enumTypeRange
, &callback
);
465 /* Checks if ch is a lower case letter.*/
466 U_CAPI UBool U_EXPORT2
467 u_islower(UChar32 c
) {
470 return (UBool
)(GET_CATEGORY(props
)==U_LOWERCASE_LETTER
);
473 /* Checks if ch is an upper case letter.*/
474 U_CAPI UBool U_EXPORT2
475 u_isupper(UChar32 c
) {
478 return (UBool
)(GET_CATEGORY(props
)==U_UPPERCASE_LETTER
);
481 /* Checks if ch is a title case letter; usually upper case letters.*/
482 U_CAPI UBool U_EXPORT2
483 u_istitle(UChar32 c
) {
486 return (UBool
)(GET_CATEGORY(props
)==U_TITLECASE_LETTER
);
489 /* Checks if ch is a decimal digit. */
490 U_CAPI UBool U_EXPORT2
491 u_isdigit(UChar32 c
) {
494 return (UBool
)(GET_CATEGORY(props
)==U_DECIMAL_DIGIT_NUMBER
);
497 U_CAPI UBool U_EXPORT2
498 u_isxdigit(UChar32 c
) {
501 /* check ASCII and Fullwidth ASCII a-fA-F */
503 (c
<=0x66 && c
>=0x41 && (c
<=0x46 || c
>=0x61)) ||
504 (c
>=0xff21 && c
<=0xff46 && (c
<=0xff26 || c
>=0xff41))
510 return (UBool
)(GET_CATEGORY(props
)==U_DECIMAL_DIGIT_NUMBER
);
513 /* Checks if the Unicode character is a letter.*/
514 U_CAPI UBool U_EXPORT2
515 u_isalpha(UChar32 c
) {
518 return (UBool
)((CAT_MASK(props
)&U_GC_L_MASK
)!=0);
521 U_CAPI UBool U_EXPORT2
522 u_isUAlphabetic(UChar32 c
) {
523 return (u_getUnicodeProperties(c
, 1)&U_MASK(UPROPS_ALPHABETIC
))!=0;
526 /* Checks if ch is a letter or a decimal digit */
527 U_CAPI UBool U_EXPORT2
528 u_isalnum(UChar32 c
) {
531 return (UBool
)((CAT_MASK(props
)&(U_GC_L_MASK
|U_GC_ND_MASK
))!=0);
534 /* Checks if ch is a unicode character with assigned character type.*/
535 U_CAPI UBool U_EXPORT2
536 u_isdefined(UChar32 c
) {
539 return (UBool
)(GET_CATEGORY(props
)!=0);
542 /* Checks if the Unicode character is a base form character that can take a diacritic.*/
543 U_CAPI UBool U_EXPORT2
544 u_isbase(UChar32 c
) {
547 return (UBool
)((CAT_MASK(props
)&(U_GC_L_MASK
|U_GC_N_MASK
|U_GC_MC_MASK
|U_GC_ME_MASK
))!=0);
550 /* Checks if the Unicode character is a control character.*/
551 U_CAPI UBool U_EXPORT2
552 u_iscntrl(UChar32 c
) {
555 return (UBool
)((CAT_MASK(props
)&(U_GC_CC_MASK
|U_GC_CF_MASK
|U_GC_ZL_MASK
|U_GC_ZP_MASK
))!=0);
558 U_CAPI UBool U_EXPORT2
559 u_isISOControl(UChar32 c
) {
560 return (uint32_t)c
<=0x9f && (c
<=0x1f || c
>=0x7f);
/*
 * Some control characters that are used as space:
 * TAB..CR, 0x1c..0x1f, and NL.
 *
 * The argument is fully parenthesized so that an expression argument
 * (for example "x & 0xff") is compared as a whole. It is still evaluated
 * several times, so it must not have side effects.
 */
#define IS_THAT_CONTROL_SPACE(c) \
    ((c)<=0x9f && (((c)>=TAB && (c)<=CR) || ((c)>=0x1c && (c)<=0x1f) || (c)==NL))
567 /* Checks if the Unicode character is a space character.*/
568 U_CAPI UBool U_EXPORT2
569 u_isspace(UChar32 c
) {
572 return (UBool
)((CAT_MASK(props
)&U_GC_Z_MASK
)!=0 || IS_THAT_CONTROL_SPACE(c
));
575 U_CAPI UBool U_EXPORT2
576 u_isJavaSpaceChar(UChar32 c
) {
579 return (UBool
)((CAT_MASK(props
)&U_GC_Z_MASK
)!=0);
582 /* Checks if the Unicode character is a whitespace character.*/
583 U_CAPI UBool U_EXPORT2
584 u_isWhitespace(UChar32 c
) {
588 ((CAT_MASK(props
)&U_GC_Z_MASK
)!=0 &&
589 c
!=NBSP
&& c
!=FIGURESP
&& c
!=NNBSP
) || /* exclude no-break spaces */
590 IS_THAT_CONTROL_SPACE(c
)
594 U_CAPI UBool U_EXPORT2
595 u_isblank(UChar32 c
) {
596 if((uint32_t)c
<=0x9f) {
597 return c
==9 || c
==0x20; /* TAB or SPACE */
599 /* White_Space but not LS (Zl) or PS (Zp) */
600 return u_isUWhiteSpace(c
) && ((c
&0xfffffffe)!=0x2028);
604 U_CAPI UBool U_EXPORT2
605 u_isUWhiteSpace(UChar32 c
) {
606 return (u_getUnicodeProperties(c
, 1)&U_MASK(UPROPS_WHITE_SPACE
))!=0;
609 /* Checks if the Unicode character is printable.*/
610 U_CAPI UBool U_EXPORT2
611 u_isprint(UChar32 c
) {
614 /* comparing ==0 returns FALSE for the categories mentioned */
615 return (UBool
)((CAT_MASK(props
)&U_GC_C_MASK
)==0);
618 U_CAPI UBool U_EXPORT2
619 u_isgraph(UChar32 c
) {
622 /* comparing ==0 returns FALSE for the categories mentioned */
623 return (UBool
)((CAT_MASK(props
)&
624 (U_GC_CC_MASK
|U_GC_CF_MASK
|U_GC_CS_MASK
|U_GC_CN_MASK
|U_GC_Z_MASK
))
628 U_CAPI UBool U_EXPORT2
629 u_ispunct(UChar32 c
) {
632 return (UBool
)((CAT_MASK(props
)&U_GC_P_MASK
)!=0);
635 /* Checks if the Unicode character can start a Unicode identifier.*/
636 U_CAPI UBool U_EXPORT2
637 u_isIDStart(UChar32 c
) {
638 /* same as u_isalpha(), plus letter numbers (Nl) */
641 return (UBool
)((CAT_MASK(props
)&(U_GC_L_MASK
|U_GC_NL_MASK
))!=0);
644 /* Checks if the Unicode character can be a Unicode identifier part other than starting the
646 U_CAPI UBool U_EXPORT2
647 u_isIDPart(UChar32 c
) {
652 (U_GC_ND_MASK
|U_GC_NL_MASK
|
654 U_GC_PC_MASK
|U_GC_MC_MASK
|U_GC_MN_MASK
)
659 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
660 U_CAPI UBool U_EXPORT2
661 u_isIDIgnorable(UChar32 c
) {
663 return u_isISOControl(c
) && !IS_THAT_CONTROL_SPACE(c
);
667 return (UBool
)(GET_CATEGORY(props
)==U_FORMAT_CHAR
);
671 /*Checks if the Unicode character can start a Java identifier.*/
672 U_CAPI UBool U_EXPORT2
673 u_isJavaIDStart(UChar32 c
) {
676 return (UBool
)((CAT_MASK(props
)&(U_GC_L_MASK
|U_GC_SC_MASK
|U_GC_PC_MASK
))!=0);
679 /*Checks if the Unicode character can be a Java identifier part other than starting the
682 U_CAPI UBool U_EXPORT2
683 u_isJavaIDPart(UChar32 c
) {
688 (U_GC_ND_MASK
|U_GC_NL_MASK
|
690 U_GC_SC_MASK
|U_GC_PC_MASK
|
691 U_GC_MC_MASK
|U_GC_MN_MASK
)
696 U_CAPI
int32_t U_EXPORT2
697 u_charDigitValue(UChar32 c
) {
698 uint32_t props
, numericType
;
700 numericType
=GET_NUMERIC_TYPE(props
);
703 if(!PROPS_VALUE_IS_EXCEPTION(props
)) {
704 return GET_SIGNED_VALUE(props
);
706 const uint32_t *pe
=GET_EXCEPTIONS(props
);
707 uint32_t firstExceptionValue
=*pe
;
708 if(HAVE_EXCEPTION_VALUE(firstExceptionValue
, EXC_NUMERIC_VALUE
)) {
709 int i
=EXC_NUMERIC_VALUE
;
711 ADD_EXCEPTION_OFFSET(firstExceptionValue
, i
, pe
);
720 U_CAPI
double U_EXPORT2
721 u_getNumericValue(UChar32 c
) {
722 uint32_t props
, numericType
;
724 numericType
=GET_NUMERIC_TYPE(props
);
726 if(numericType
==0 || numericType
>=(int32_t)U_NT_COUNT
) {
727 return U_NO_NUMERIC_VALUE
;
729 if(!PROPS_VALUE_IS_EXCEPTION(props
)) {
730 return GET_SIGNED_VALUE(props
);
733 uint32_t firstExceptionValue
;
736 uint32_t denominator
;
738 pe
=GET_EXCEPTIONS(props
);
739 firstExceptionValue
=*pe
++;
741 if(HAVE_EXCEPTION_VALUE(firstExceptionValue
, EXC_NUMERIC_VALUE
)) {
742 uint32_t flags
=firstExceptionValue
;
743 int i
=EXC_NUMERIC_VALUE
;
744 const uint32_t *p
=pe
;
747 ADD_EXCEPTION_OFFSET(flags
, i
, p
);
748 numerator
=(int32_t)*p
;
751 * There are special values for huge numbers that are powers of ten.
752 * genprops/store.c documents:
753 * if numericValue=0x7fffff00+x then numericValue=10^x
755 if(numerator
<0x7fffff00) {
756 numValue
=(double)numerator
;
760 /* 10^x without math.h */
762 while(numerator
>=4) {
784 if(HAVE_EXCEPTION_VALUE(firstExceptionValue
, EXC_DENOMINATOR_VALUE
)) {
785 uint32_t flags
=firstExceptionValue
;
786 int i
=EXC_DENOMINATOR_VALUE
;
787 const uint32_t *p
=pe
;
788 ADD_EXCEPTION_OFFSET(flags
, i
, p
);
794 switch(firstExceptionValue
&((1UL<<EXC_NUMERIC_VALUE
)|(1UL<<EXC_DENOMINATOR_VALUE
))) {
795 case 1UL<<EXC_NUMERIC_VALUE
:
797 case 1UL<<EXC_DENOMINATOR_VALUE
:
798 return (double)1./(double)denominator
;
799 case (1UL<<EXC_NUMERIC_VALUE
)|(1UL<<EXC_DENOMINATOR_VALUE
):
800 return numValue
/(double)denominator
;
801 case 0: /* none (should not occur with numericType>0) */
803 return U_NO_NUMERIC_VALUE
;
809 /* Gets the character's linguistic directionality.*/
810 U_CAPI UCharDirection U_EXPORT2
811 u_charDirection(UChar32 c
) {
814 return (UCharDirection
)GET_BIDI_CLASS(props
);
817 U_CAPI UBool U_EXPORT2
818 u_isMirrored(UChar32 c
) {
821 return (UBool
)(props
&(1UL<<UPROPS_MIRROR_SHIFT
) ? TRUE
: FALSE
);
824 U_CAPI UChar32 U_EXPORT2
825 u_charMirror(UChar32 c
) {
828 if((props
&(1UL<<UPROPS_MIRROR_SHIFT
))==0) {
829 /* not mirrored - the value is not a mirror offset */
831 } else if(!PROPS_VALUE_IS_EXCEPTION(props
)) {
832 return c
+GET_SIGNED_VALUE(props
);
834 const uint32_t *pe
=GET_EXCEPTIONS(props
);
835 uint32_t firstExceptionValue
=*pe
;
836 if(HAVE_EXCEPTION_VALUE(firstExceptionValue
, EXC_MIRROR_MAPPING
)) {
837 int i
=EXC_MIRROR_MAPPING
;
839 ADD_EXCEPTION_OFFSET(firstExceptionValue
, i
, pe
);
847 /* ICU 2.1: u_getCombiningClass() moved to unorm.cpp */
849 U_CAPI
int32_t U_EXPORT2
850 u_digit(UChar32 ch
, int8_t radix
) {
852 if((uint8_t)(radix
-2)<=(36-2)) {
853 value
=(int8_t)u_charDigitValue(ch
);
855 /* ch is not a decimal digit, try latin letters */
856 if(ch
>=0x61 && ch
<=0x7A) {
857 value
=(int8_t)(ch
-0x57); /* ch - 'a' + 10 */
858 } else if(ch
>=0x41 && ch
<=0x5A) {
859 value
=(int8_t)(ch
-0x37); /* ch - 'A' + 10 */
860 } else if(ch
>=0xFF41 && ch
<=0xFF5A) {
861 value
=(int8_t)(ch
-0xFF37); /* fullwidth ASCII a-z */
862 } else if(ch
>=0xFF21 && ch
<=0xFF3A) {
863 value
=(int8_t)(ch
-0xFF17); /* fullwidth ASCII A-Z */
867 value
=-1; /* invalid radix */
869 return (int8_t)((value
<radix
) ? value
: -1);
872 U_CAPI UChar32 U_EXPORT2
873 u_forDigit(int32_t digit
, int8_t radix
) {
874 if((uint8_t)(radix
-2)>(36-2) || (uint32_t)digit
>=(uint32_t)radix
) {
876 } else if(digit
<10) {
877 return (UChar32
)(0x30+digit
);
879 return (UChar32
)((0x61-10)+digit
);
883 /* miscellaneous, and support for uprops.c ---------------------------------- */
885 U_CAPI
void U_EXPORT2
886 u_getUnicodeVersion(UVersionInfo versionArray
) {
887 if(versionArray
!=NULL
) {
889 uprv_memcpy(versionArray
, dataVersion
, U_MAX_VERSION_LENGTH
);
891 uprv_memset(versionArray
, 0, U_MAX_VERSION_LENGTH
);
897 u_getUnicodeProperties(UChar32 c
, int32_t column
) {
904 } else if( !HAVE_DATA
|| countPropsVectors
==0 ||
905 (uint32_t)c
>0x10ffff ||
906 column
<0 || column
>=propsVectorsColumns
910 UTRIE_GET16(&propsVectorsTrie
, c
, vecIndex
);
911 return propsVectors
[vecIndex
+column
];
916 uprv_getMaxValues(int32_t column
) {
920 return indexes
[UPROPS_MAX_VALUES_INDEX
];
922 return indexes
[UPROPS_MAX_VALUES_2_INDEX
];
932 * get Hangul Syllable Type
933 * implemented here so that uchar.c (uhst_addPropertyStarts())
934 * does not depend on uprops.c (u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE))
936 U_CFUNC UHangulSyllableType
937 uchar_getHST(UChar32 c
) {
938 /* purely algorithmic; hardcode known characters, check for assigned new ones */
940 /* U_HST_NOT_APPLICABLE */
941 } else if(c
<=0x11ff) {
944 /* Jamo L range, HANGUL CHOSEONG ... */
945 if(c
==0x115f || c
<=0x1159 || u_charType(c
)==U_OTHER_LETTER
) {
946 return U_HST_LEADING_JAMO
;
948 } else if(c
<=0x11a7) {
949 /* Jamo V range, HANGUL JUNGSEONG ... */
950 if(c
<=0x11a2 || u_charType(c
)==U_OTHER_LETTER
) {
951 return U_HST_VOWEL_JAMO
;
955 if(c
<=0x11f9 || u_charType(c
)==U_OTHER_LETTER
) {
956 return U_HST_TRAILING_JAMO
;
959 } else if((c
-=HANGUL_BASE
)<0) {
960 /* U_HST_NOT_APPLICABLE */
961 } else if(c
<HANGUL_COUNT
) {
962 /* Hangul syllable */
963 return c%JAMO_T_COUNT
==0 ? U_HST_LV_SYLLABLE
: U_HST_LVT_SYLLABLE
;
965 return U_HST_NOT_APPLICABLE
;
968 U_CAPI
void U_EXPORT2
969 u_charAge(UChar32 c
, UVersionInfo versionArray
) {
970 if(versionArray
!=NULL
) {
971 uint32_t version
=u_getUnicodeProperties(c
, 0)>>UPROPS_AGE_SHIFT
;
972 versionArray
[0]=(uint8_t)(version
>>4);
973 versionArray
[1]=(uint8_t)(version
&0xf);
974 versionArray
[2]=versionArray
[3]=0;
978 U_CAPI UScriptCode U_EXPORT2
979 uscript_getScript(UChar32 c
, UErrorCode
*pErrorCode
) {
980 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
983 if((uint32_t)c
>0x10ffff) {
984 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
988 return (UScriptCode
)(u_getUnicodeProperties(c
, 0)&UPROPS_SCRIPT_MASK
);
991 U_CAPI UBlockCode U_EXPORT2
992 ublock_getCode(UChar32 c
) {
993 return (UBlockCode
)((u_getUnicodeProperties(c
, 0)&UPROPS_BLOCK_MASK
)>>UPROPS_BLOCK_SHIFT
);
996 /* property starts for UnicodeSet ------------------------------------------- */
998 /* for Hangul_Syllable_Type */
999 U_CAPI
void U_EXPORT2
1000 uhst_addPropertyStarts(USetAdder
*sa
, UErrorCode
*pErrorCode
) {
1002 int32_t value
, value2
;
1004 if(U_FAILURE(*pErrorCode
)) {
1009 *pErrorCode
=dataErrorCode
;
1013 /* add code points with hardcoded properties, plus the ones following them */
1016 * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE.
1017 * First, we add fixed boundaries for the blocks of Jamos.
1018 * Then we check in loops to see where the current Unicode version
1019 * actually stops assigning such Jamos. We start each loop
1020 * at the end of the per-Jamo-block assignments in Unicode 4 or earlier.
1021 * (These have not changed since Unicode 2.)
1023 sa
->add(sa
->set
, 0x1100);
1024 value
=U_HST_LEADING_JAMO
;
1025 for(c
=0x115a; c
<=0x115f; ++c
) {
1026 value2
=uchar_getHST(c
);
1029 sa
->add(sa
->set
, c
);
1033 sa
->add(sa
->set
, 0x1160);
1034 value
=U_HST_VOWEL_JAMO
;
1035 for(c
=0x11a3; c
<=0x11a7; ++c
) {
1036 value2
=uchar_getHST(c
);
1039 sa
->add(sa
->set
, c
);
1043 sa
->add(sa
->set
, 0x11a8);
1044 value
=U_HST_TRAILING_JAMO
;
1045 for(c
=0x11fa; c
<=0x11ff; ++c
) {
1046 value2
=uchar_getHST(c
);
1049 sa
->add(sa
->set
, c
);
1053 /* Add Hangul type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. */
1054 for(c
=HANGUL_BASE
; c
<(HANGUL_BASE
+HANGUL_COUNT
); c
+=JAMO_T_COUNT
) {
1055 sa
->add(sa
->set
, c
);
1056 sa
->add(sa
->set
, c
+1);
1058 sa
->add(sa
->set
, c
);
1061 static UBool U_CALLCONV
1062 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
1063 /* add the start code point to the USet */
1064 USetAdder
*sa
=(USetAdder
*)context
;
1065 sa
->add(sa
->set
, start
);
/*
 * Adds both cp and cp+1 to the set behind the USetAdder sa.
 *
 * Wrapped in do { } while(0) so that the two calls form a single statement
 * (safe under an unbraced if/else), and with parenthesized arguments so
 * that expression arguments bind as intended. Both sa and cp are evaluated
 * more than once, so they must not have side effects.
 */
#define USET_ADD_CP_AND_NEXT(sa, cp) \
    do { \
        (sa)->add((sa)->set, (cp)); \
        (sa)->add((sa)->set, (cp)+1); \
    } while(0)
1071 U_CAPI
void U_EXPORT2
1072 uchar_addPropertyStarts(USetAdder
*sa
, UErrorCode
*pErrorCode
) {
1073 if(U_FAILURE(*pErrorCode
)) {
1078 *pErrorCode
=dataErrorCode
;
1082 /* add the start code point of each same-value range of each trie */
1083 utrie_enum(&propsTrie
, NULL
, _enumPropertyStartsRange
, sa
);
1084 utrie_enum(&propsVectorsTrie
, NULL
, _enumPropertyStartsRange
, sa
);
1086 /* add code points with hardcoded properties, plus the ones following them */
1088 /* add for IS_THAT_CONTROL_SPACE() */
1089 sa
->add(sa
->set
, TAB
); /* range TAB..CR */
1090 sa
->add(sa
->set
, CR
+1);
1091 sa
->add(sa
->set
, 0x1c);
1092 sa
->add(sa
->set
, 0x1f+1);
1093 USET_ADD_CP_AND_NEXT(sa
, NL
);
1095 /* add for u_isIDIgnorable() what was not added above */
1096 sa
->add(sa
->set
, DEL
); /* range DEL..NBSP-1, NBSP added below */
1097 sa
->add(sa
->set
, HAIRSP
);
1098 sa
->add(sa
->set
, RLM
+1);
1099 sa
->add(sa
->set
, INHSWAP
);
1100 sa
->add(sa
->set
, NOMDIG
+1);
1101 USET_ADD_CP_AND_NEXT(sa
, ZWNBSP
);
1103 /* add no-break spaces for u_isWhitespace() what was not added above */
1104 USET_ADD_CP_AND_NEXT(sa
, NBSP
);
1105 USET_ADD_CP_AND_NEXT(sa
, FIGURESP
);
1106 USET_ADD_CP_AND_NEXT(sa
, NNBSP
);
1108 /* add for u_charDigitValue() */
1109 USET_ADD_CP_AND_NEXT(sa
, 0x3007);
1110 USET_ADD_CP_AND_NEXT(sa
, 0x4e00);
1111 USET_ADD_CP_AND_NEXT(sa
, 0x4e8c);
1112 USET_ADD_CP_AND_NEXT(sa
, 0x4e09);
1113 USET_ADD_CP_AND_NEXT(sa
, 0x56db);
1114 USET_ADD_CP_AND_NEXT(sa
, 0x4e94);
1115 USET_ADD_CP_AND_NEXT(sa
, 0x516d);
1116 USET_ADD_CP_AND_NEXT(sa
, 0x4e03);
1117 USET_ADD_CP_AND_NEXT(sa
, 0x516b);
1118 USET_ADD_CP_AND_NEXT(sa
, 0x4e5d);
1120 /* add for u_digit() */
1121 sa
->add(sa
->set
, U_a
);
1122 sa
->add(sa
->set
, U_z
+1);
1123 sa
->add(sa
->set
, U_A
);
1124 sa
->add(sa
->set
, U_Z
+1);
1126 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
1127 sa
->add(sa
->set
, WJ
); /* range WJ..NOMDIG */
1128 sa
->add(sa
->set
, 0xfff0);
1129 sa
->add(sa
->set
, 0xfffb+1);
1130 sa
->add(sa
->set
, 0xe0000);
1131 sa
->add(sa
->set
, 0xe0fff+1);
1133 /* add for UCHAR_GRAPHEME_BASE and others */
1134 USET_ADD_CP_AND_NEXT(sa
, CGJ
);
1136 /* add for UCHAR_JOINING_TYPE */
1137 sa
->add(sa
->set
, ZWNJ
); /* range ZWNJ..ZWJ */
1138 sa
->add(sa
->set
, ZWJ
+1);