icuSources/common/uprops.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2002-2003, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  uprops.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002feb24
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Implementations for mostly non-core Unicode character properties
  17 *   stored in uprops.icu.
  18 */
  19
  20 #include "unicode/utypes.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/uscript.h"
  23 #include "cstring.h"
  24 #include "unormimp.h"
  25 #include "uprops.h"
  26
  27 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  28
  29 /**
  30  * Unicode property names and property value names are compared
  31  * "loosely". Property[Value]Aliases.txt say:
  32  *   "With loose matching of property names, the case distinctions, whitespace,
  33  *    and '_' are ignored."
  34  *
  35  * This function does just that, for ASCII (char *) name strings.
  36  * It is almost identical to ucnv_compareNames() but also ignores
  37  * ASCII White_Space characters (U+0009..U+000d).
  38  *
  39  * @internal
  40  */
  41 U_CAPI int32_t U_EXPORT2
  42 uprv_comparePropertyNames(const char *name1, const char *name2) {
  43     int32_t rc;
  44     unsigned char c1, c2;
  45
  46     for(;;) {
  47         /* Ignore delimiters '-', '_', and ASCII White_Space */
  48         while((c1=(unsigned char)*name1)=='-' || c1=='_' ||
  49               c1==' ' || c1=='\t' || c1=='\n' || c1=='\v' || c1=='\f' || c1=='\r'
  50         ) {
  51             ++name1;
  52         }
  53         while((c2=(unsigned char)*name2)=='-' || c2=='_' ||
  54               c2==' ' || c2=='\t' || c2=='\n' || c2=='\v' || c2=='\f' || c2=='\r'
  55         ) {
  56             ++name2;
  57         }
  58
  59         /* If we reach the ends of both strings then they match */
  60         if((c1|c2)==0) {
  61             return 0;
  62         }
  63
  64         /* Case-insensitive comparison */
  65         if(c1!=c2) {
  66             rc=(int32_t)(unsigned char)uprv_tolower(c1)-(int32_t)(unsigned char)uprv_tolower(c2);
  67             if(rc!=0) {
  68                 return rc;
  69             }
  70         }
  71
  72         ++name1;
  73         ++name2;
  74     }
  75 }
  76
  77 /* API functions ------------------------------------------------------------ */
  78
  79 U_CAPI void U_EXPORT2
  80 u_charAge(UChar32 c, UVersionInfo versionArray) {
  81     if(versionArray!=NULL) {
  82         uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT;
  83         versionArray[0]=(uint8_t)(version>>4);
  84         versionArray[1]=(uint8_t)(version&0xf);
  85         versionArray[2]=versionArray[3]=0;
  86     }
  87 }
  88
  89 U_CAPI UScriptCode U_EXPORT2
  90 uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
  91     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
  92         return 0;
  93     }
  94     if((uint32_t)c>0x10ffff) {
  95         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  96         return 0;
  97     }
  98
  99     return (UScriptCode)(u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_MASK);
 100 }
 101
 102 U_CAPI UBlockCode U_EXPORT2
 103 ublock_getCode(UChar32 c) {
 104     return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT);
 105 }
 106
 107 static const struct {
 108     int32_t column;
 109     uint32_t mask;
 110 } binProps[]={
 111     /*
 112      * column and mask values for binary properties from u_getUnicodeProperties().
 113      * Must be in order of corresponding UProperty,
 114      * and there must be exacly one entry per binary UProperty.
 115      */
 116     {  1, U_MASK(UPROPS_ALPHABETIC) },
 117     {  1, U_MASK(UPROPS_ASCII_HEX_DIGIT) },
 118     {  1, U_MASK(UPROPS_BIDI_CONTROL) },
 119     { -1, U_MASK(UPROPS_MIRROR_SHIFT) },
 120     {  1, U_MASK(UPROPS_DASH) },
 121     {  1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT) },
 122     {  1, U_MASK(UPROPS_DEPRECATED) },
 123     {  1, U_MASK(UPROPS_DIACRITIC) },
 124     {  1, U_MASK(UPROPS_EXTENDER) },
 125     {  0, 0 },                                  /* UCHAR_FULL_COMPOSITION_EXCLUSION */
 126     {  1, U_MASK(UPROPS_GRAPHEME_BASE) },
 127     {  1, U_MASK(UPROPS_GRAPHEME_EXTEND) },
 128     {  1, U_MASK(UPROPS_GRAPHEME_LINK) },
 129     {  1, U_MASK(UPROPS_HEX_DIGIT) },
 130     {  1, U_MASK(UPROPS_HYPHEN) },
 131     {  1, U_MASK(UPROPS_ID_CONTINUE) },
 132     {  1, U_MASK(UPROPS_ID_START) },
 133     {  1, U_MASK(UPROPS_IDEOGRAPHIC) },
 134     {  1, U_MASK(UPROPS_IDS_BINARY_OPERATOR) },
 135     {  1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR) },
 136     {  1, U_MASK(UPROPS_JOIN_CONTROL) },
 137     {  1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION) },
 138     {  1, U_MASK(UPROPS_LOWERCASE) },
 139     {  1, U_MASK(UPROPS_MATH) },
 140     {  1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT) },
 141     {  1, U_MASK(UPROPS_QUOTATION_MARK) },
 142     {  1, U_MASK(UPROPS_RADICAL) },
 143     {  1, U_MASK(UPROPS_SOFT_DOTTED) },
 144     {  1, U_MASK(UPROPS_TERMINAL_PUNCTUATION) },
 145     {  1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH) },
 146     {  1, U_MASK(UPROPS_UPPERCASE) },
 147     {  1, U_MASK(UPROPS_WHITE_SPACE) },
 148     {  1, U_MASK(UPROPS_XID_CONTINUE) },
 149     {  1, U_MASK(UPROPS_XID_START) },
 150     { -1, U_MASK(UPROPS_CASE_SENSITIVE_SHIFT) }
 151 };
 152
 153 U_CAPI UBool U_EXPORT2
 154 u_hasBinaryProperty(UChar32 c, UProperty which) {
 155     /* c is range-checked in the functions that are called from here */
 156     if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) {
 157         /* not a known binary property */
 158         return FALSE;
 159     } else if(which==UCHAR_FULL_COMPOSITION_EXCLUSION) {
 160 #if !UCONFIG_NO_NORMALIZATION
 161         return unorm_internalIsFullCompositionExclusion(c);
 162 #else
 163         return FALSE;
 164 #endif
 165     } else {
 166         /* systematic, directly stored properties */
 167         return (u_getUnicodeProperties(c, binProps[which].column)&binProps[which].mask)!=0;
 168     }
 169 }
 170
 171 U_CAPI UBool U_EXPORT2
 172 u_isUAlphabetic(UChar32 c) {
 173     return u_hasBinaryProperty(c, UCHAR_ALPHABETIC);
 174 }
 175
 176 U_CAPI UBool U_EXPORT2
 177 u_isULowercase(UChar32 c) {
 178     return u_hasBinaryProperty(c, UCHAR_LOWERCASE);
 179 }
 180
 181 U_CAPI UBool U_EXPORT2
 182 u_isUUppercase(UChar32 c) {
 183     return u_hasBinaryProperty(c, UCHAR_UPPERCASE);
 184 }
 185
 186 U_CAPI UBool U_EXPORT2
 187 u_isUWhiteSpace(UChar32 c) {
 188     return u_hasBinaryProperty(c, UCHAR_WHITE_SPACE);
 189 }
 190
 191 U_CAPI UBool U_EXPORT2
 192 uprv_isRuleWhiteSpace(UChar32 c) {
 193     /* "white space" in the sense of ICU rule parsers: Cf+White_Space */
 194     return
 195         u_charType(c)==U_FORMAT_CHAR ||
 196         u_hasBinaryProperty(c, UCHAR_WHITE_SPACE);
 197 }
 198
 199 static const UChar _PATTERN[] = {
 200     /* "[[:Cf:][:WSpace:]]" */
 201     91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
 202     83, 112, 97, 99, 101, 58, 93, 93, 0
 203 };
 204
 205 U_CAPI USet* U_EXPORT2
 206 uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
 207     return uset_openPattern(_PATTERN,
 208                             sizeof(_PATTERN)/sizeof(_PATTERN[0])-1, ec);
 209 }
 210
 211 U_CAPI int32_t U_EXPORT2
 212 u_getIntPropertyValue(UChar32 c, UProperty which) {
 213     UErrorCode errorCode;
 214
 215     if(which<UCHAR_BINARY_START) {
 216         return 0; /* undefined */
 217     } else if(which<UCHAR_BINARY_LIMIT) {
 218         return (int32_t)u_hasBinaryProperty(c, which);
 219     } else if(which<UCHAR_INT_START) {
 220         return 0; /* undefined */
 221     } else if(which<UCHAR_INT_LIMIT) {
 222         switch(which) {
 223         case UCHAR_BIDI_CLASS:
 224             return (int32_t)u_charDirection(c);
 225         case UCHAR_BLOCK:
 226             return (int32_t)ublock_getCode(c);
 227         case UCHAR_CANONICAL_COMBINING_CLASS:
 228 #if !UCONFIG_NO_NORMALIZATION
 229             return u_getCombiningClass(c);
 230 #else
 231             return 0;
 232 #endif
 233         case UCHAR_DECOMPOSITION_TYPE:
 234             return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_DT_MASK);
 235         case UCHAR_EAST_ASIAN_WIDTH:
 236             return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_EA_MASK)>>UPROPS_EA_SHIFT;
 237         case UCHAR_GENERAL_CATEGORY:
 238             return (int32_t)u_charType(c);
 239         case UCHAR_JOINING_GROUP:
 240             return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_JG_MASK)>>UPROPS_JG_SHIFT;
 241         case UCHAR_JOINING_TYPE:
 242             return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_JT_MASK)>>UPROPS_JT_SHIFT;
 243         case UCHAR_LINE_BREAK:
 244             return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_LB_MASK)>>UPROPS_LB_SHIFT;
 245         case UCHAR_NUMERIC_TYPE:
 246             return (int32_t)GET_NUMERIC_TYPE(u_getUnicodeProperties(c, -1));
 247         case UCHAR_SCRIPT:
 248             errorCode=U_ZERO_ERROR;
 249             return (int32_t)uscript_getScript(c, &errorCode);
 250         case UCHAR_HANGUL_SYLLABLE_TYPE:
 251             /* purely algorithmic; hardcode known characters, check for assigned new ones */
 252             if(c<JAMO_L_BASE) {
 253                 /* U_HST_NOT_APPLICABLE */
 254             } else if(c<=0x11ff) {
 255                 /* Jamo range */
 256                 if(c<=0x115f) {
 257                     /* Jamo L range, HANGUL CHOSEONG ... */
 258                     if(c==0x115f || c<=0x1159 || u_charType(c)==U_OTHER_LETTER) {
 259                         return U_HST_LEADING_JAMO;
 260                     }
 261                 } else if(c<=0x11a7) {
 262                     /* Jamo V range, HANGUL JUNGSEONG ... */
 263                     if(c<=0x11a2 || u_charType(c)==U_OTHER_LETTER) {
 264                         return U_HST_VOWEL_JAMO;
 265                     }
 266                 } else {
 267                     /* Jamo T range */
 268                     if(c<=0x11f9 || u_charType(c)==U_OTHER_LETTER) {
 269                         return U_HST_TRAILING_JAMO;
 270                     }
 271                 }
 272             } else if((c-=HANGUL_BASE)<0) {
 273                 /* U_HST_NOT_APPLICABLE */
 274             } else if(c<HANGUL_COUNT) {
 275                 /* Hangul syllable */
 276                 return c%JAMO_T_COUNT==0 ? U_HST_LV_SYLLABLE : U_HST_LVT_SYLLABLE;
 277             }
 278             return U_HST_NOT_APPLICABLE;
 279         default:
 280             return 0; /* undefined */
 281         }
 282     } else if(which==UCHAR_GENERAL_CATEGORY_MASK) {
 283         return U_MASK(u_charType(c));
 284     } else {
 285         return 0; /* undefined */
 286     }
 287 }
 288
 289 U_CAPI int32_t U_EXPORT2
 290 u_getIntPropertyMinValue(UProperty which) {
 291     return 0; /* all binary/enum/int properties have a minimum value of 0 */
 292 }
 293
 294 U_CAPI int32_t U_EXPORT2
 295 u_getIntPropertyMaxValue(UProperty which) {
 296     int32_t max;
 297
 298     if(which<UCHAR_BINARY_START) {
 299         return -1; /* undefined */
 300     } else if(which<UCHAR_BINARY_LIMIT) {
 301         return 1; /* maximum TRUE for all binary properties */
 302     } else if(which<UCHAR_INT_START) {
 303         return -1; /* undefined */
 304     } else if(which<UCHAR_INT_LIMIT) {
 305         switch(which) {
 306         case UCHAR_BIDI_CLASS:
 307             return (int32_t)U_CHAR_DIRECTION_COUNT-1;
 308         case UCHAR_BLOCK:
 309             max=(uprv_getMaxValues(0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT;
 310             return max!=0 ? max : (int32_t)UBLOCK_COUNT-1;
 311         case UCHAR_CANONICAL_COMBINING_CLASS:
 312             return 0xff; /* TODO do we need to be more precise, getting the actual maximum? */
 313         case UCHAR_DECOMPOSITION_TYPE:
 314             max=uprv_getMaxValues(2)&UPROPS_DT_MASK;
 315             return max!=0 ? max : (int32_t)U_DT_COUNT-1;
 316         case UCHAR_EAST_ASIAN_WIDTH:
 317             max=(uprv_getMaxValues(0)&UPROPS_EA_MASK)>>UPROPS_EA_SHIFT;
 318             return max!=0 ? max : (int32_t)U_EA_COUNT-1;
 319         case UCHAR_GENERAL_CATEGORY:
 320             return (int32_t)U_CHAR_CATEGORY_COUNT-1;
 321         case UCHAR_JOINING_GROUP:
 322             max=(uprv_getMaxValues(2)&UPROPS_JG_MASK)>>UPROPS_JG_SHIFT;
 323             return max!=0 ? max : (int32_t)U_JG_COUNT-1;
 324         case UCHAR_JOINING_TYPE:
 325             max=(uprv_getMaxValues(2)&UPROPS_JT_MASK)>>UPROPS_JT_SHIFT;
 326             return max!=0 ? max : (int32_t)U_JT_COUNT-1;
 327         case UCHAR_LINE_BREAK:
 328             max=(uprv_getMaxValues(0)&UPROPS_LB_MASK)>>UPROPS_LB_SHIFT;
 329             return max!=0 ? max : (int32_t)U_LB_COUNT-1;
 330         case UCHAR_NUMERIC_TYPE:
 331             return (int32_t)U_NT_COUNT-1;
 332         case UCHAR_SCRIPT:
 333             max=uprv_getMaxValues(0)&UPROPS_SCRIPT_MASK;
 334             return max!=0 ? max : (int32_t)USCRIPT_CODE_LIMIT-1;
 335         case UCHAR_HANGUL_SYLLABLE_TYPE:
 336             return (int32_t)U_HST_COUNT-1;
 337         default:
 338             return -1; /* undefined */
 339         }
 340     } else {
 341         return -1; /* undefined */
 342     }
 343 }
 344
 345 /*----------------------------------------------------------------
 346  * Inclusions list
 347  *----------------------------------------------------------------*/
 348
 349 /*
 350  * Return a set of characters for property enumeration.
 351  * The set implicitly contains 0x110000 as well, which is one more than the highest
 352  * Unicode code point.
 353  *
 354  * This set is used as an ordered list - its code points are ordered, and
 355  * consecutive code points (in Unicode code point order) in the set define a range.
 356  * For each two consecutive characters (start, limit) in the set,
 357  * all of the UCD/normalization and related properties for
 358  * all code points start..limit-1 are all the same,
 359  * except for character names and ISO comments.
 360  *
 361  * All Unicode code points U+0000..U+10ffff are covered by these ranges.
 362  * The ranges define a partition of the Unicode code space.
 363  * ICU uses the inclusions set to enumerate properties for generating
 364  * UnicodeSets containing all code points that have a certain property value.
 365  *
 366  * The Inclusion List is generated from the UCD. It is generated
 367  * by enumerating the data tries, and code points for hardcoded properties
 368  * are added as well.
 369  *
 370  * --------------------------------------------------------------------------
 371  *
 372  * The following are ideas for getting properties-unique code point ranges,
 373  * with possible optimizations beyond the current implementation.
 374  * These optimizations would require more code and be more fragile.
 375  * The current implementation generates one single list (set) for all properties.
 376  *
 377  * To enumerate properties efficiently, one needs to know ranges of
 378  * repetitive values, so that the value of only each start code point
 379  * can be applied to the whole range.
 380  * This information is in principle available in the uprops.icu/unorm.icu data.
 381  *
 382  * There are two obstacles:
 383  *
 384  * 1. Some properties are computed from multiple data structures,
 385  *    making it necessary to get repetitive ranges by intersecting
 386  *    ranges from multiple tries.
 387  *
 388  * 2. It is not economical to write code for getting repetitive ranges
 389  *    that are precise for each of some 50 properties.
 390  *
 391  * Compromise ideas:
 392  *
 393  * - Get ranges per trie, not per individual property.
 394  *   Each range contains the same values for a whole group of properties.
 395  *   This would generate currently five range sets, two for uprops.icu tries
 396  *   and three for unorm.icu tries.
 397  *
 398  * - Combine sets of ranges for multiple tries to get sufficient sets
 399  *   for properties, e.g., the uprops.icu main and auxiliary tries
 400  *   for all non-normalization properties.
 401  *
 402  * Ideas for representing ranges and combining them:
 403  *
 404  * - A UnicodeSet could hold just the start code points of ranges.
 405  *   Multiple sets are easily combined by or-ing them together.
 406  *
 407  * - Alternatively, a UnicodeSet could hold each even-numbered range.
 408  *   All ranges could be enumerated by using each start code point
 409  *   (for the even-numbered ranges) as well as each limit (end+1) code point
 410  *   (for the odd-numbered ranges).
 411  *   It should be possible to combine two such sets by xor-ing them,
 412  *   but no more than two.
 413  *
 414  * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
 415  * but the first one is certainly simpler and applicable for combining more than
 416  * two range sets.
 417  *
 418  * It is possible to combine all range sets for all uprops/unorm tries into one
 419  * set that can be used for all properties.
 420  * As an optimization, there could be less-combined range sets for certain
 421  * groups of properties.
 422  * The relationship of which less-combined range set to use for which property
 423  * depends on the implementation of the properties and must be hardcoded
 424  * - somewhat error-prone and higher maintenance but can be tested easily
 425  * by building property sets "the simple way" in test code.
 426  *
 427  * ---
 428  *
 429  * Do not use a UnicodeSet pattern because that causes infinite recursion;
 430  * UnicodeSet depends on the inclusions set.
 431  */
 432 U_CAPI void U_EXPORT2
 433 uprv_getInclusions(USet* set, UErrorCode *pErrorCode) {
 434     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 435         return;
 436     }
 437
 438     uset_clear(set);
 439
 440 #if !UCONFIG_NO_NORMALIZATION
 441     unorm_addPropertyStarts(set, pErrorCode);
 442 #endif
 443     uchar_addPropertyStarts(set, pErrorCode);
 444 }