icuSources/i18n/ucol_tok.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_tok.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module reads a tailoring rule string and produces a list of
  17 * tokens that will be turned into collation elements
  18 *
  19 */
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_COLLATION
  24
  25 #include "unicode/ustring.h"
  26 #include "unicode/uchar.h"
  27 #include "unicode/uniset.h"
  28
  29 #include "ucol_tok.h"
  30 #include "ucol_bld.h"
  31 #include "cmemory.h"
  32 #include "util.h"
  33
  34 U_CDECL_BEGIN
  35 static int32_t U_CALLCONV
  36 uhash_hashTokens(const UHashTok k)
  37 {
  38     int32_t hash = 0;
  39     //uint32_t key = (uint32_t)k.integer;
  40     UColToken *key = (UColToken *)k.pointer;
  41     if (key != 0) {
  42         //int32_t len = (key & 0xFF000000)>>24;
  43         int32_t len = (key->source & 0xFF000000)>>24;
  44         int32_t inc = ((len - 32) / 32) + 1;
  45
  46         //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
  47         const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
  48         const UChar *limit = p + len;
  49
  50         while (p<limit) {
  51             hash = (hash * 37) + *p;
  52             p += inc;
  53         }
  54     }
  55     return hash;
  56 }
  57
  58 static UBool U_CALLCONV
  59 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
  60 {
  61     //uint32_t p1 = (uint32_t) key1.integer;
  62     //uint32_t p2 = (uint32_t) key2.integer;
  63     UColToken *p1 = (UColToken *)key1.pointer;
  64     UColToken *p2 = (UColToken *)key2.pointer;
  65     const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
  66     const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
  67     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
  68     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
  69     const UChar *end = s1+s1L-1;
  70
  71     if (p1 == p2) {
  72         return TRUE;
  73     }
  74     if (p1->source == 0 || p2->source == 0) {
  75         return FALSE;
  76     }
  77     if(s1L != s2L) {
  78         return FALSE;
  79     }
  80     if(p1->source == p2->source) {
  81         return TRUE;
  82     }
  83     while((s1 < end) && *s1 == *s2) {
  84         ++s1;
  85         ++s2;
  86     }
  87     if(*s1 == *s2) {
  88         return TRUE;
  89     } else {
  90         return FALSE;
  91     }
  92 }
  93 U_CDECL_END
  94
  95 /*static inline void U_CALLCONV
  96 uhash_freeBlockWrapper(void *obj) {
  97     uhash_freeBlock(obj);
  98 }*/
  99
 100
  101 typedef struct { /* CE boundaries recorded for one indirect (symbolic) reset position */
  102     uint32_t startCE;      /* start CE; set from start[0] by setIndirectBoundaries() */
  103     uint32_t startContCE;  /* continuation of startCE (start[1]); ucol_tok_doSetTop() treats 0 as "no continuation" */
  104     uint32_t limitCE;      /* limit CE (end[0]); 0 when no end was supplied */
  105     uint32_t limitContCE;  /* continuation of limitCE (end[1]); 0 when no end was supplied */
  106 } indirectBoundaries;
 107
 108 /* these values are used for finding CE values for indirect positioning. */
 109 /* Indirect positioning is a mechanism for allowing resets on symbolic   */
 110 /* values. It only works for resets and you cannot tailor indirect names */
 111 /* An indirect name can define either an anchor point or a range. An     */
 112 /* anchor point behaves in exactly the same way as a code point in reset */
 113 /* would, except that it cannot be tailored. A range (we currently only  */
  114 /* know of the [top] range) will explicitly set the upper bound for      */
 115 /* generated CEs, thus allowing for better control over how many CEs can */
 116 /* be squeezed between in the range without performance penalty.         */
 117 /* In that respect, we use [top] for tailoring of locales that use CJK   */
 118 /* characters. Other indirect values are currently a pure convenience,   */
 119 /* they can be used to assure that the CEs will be always positioned in  */
 120 /* the same place relative to a point with known properties (e.g. first  */
 121 /* primary ignorable). */
  122 static indirectBoundaries ucolIndirectBoundaries[15]; /* indexed by parsedToken.indirectIndex: slot 0 is [top]; slots 1-14 are computed from the [first ...]/[last ...] keywords in ucol_uprv_tok_readAndSetOption() */
 123 /*
 124 static indirectBoundaries ucolIndirectBoundaries[11] = {
 125 { UCOL_RESET_TOP_VALUE,               0,
 126 UCOL_NEXT_TOP_VALUE,                0 },
 127 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
 128 0,                                  0 },
 129 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
 130 0,                                  0 },
 131 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
 132 0,                                  0 },
 133 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
 134 0,                                  0 },
 135 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
 136 0,                                  0 },
 137 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
 138 0,                                  0 },
 139 { UCOL_FIRST_VARIABLE,                0,
 140 0,                                  0 },
 141 { UCOL_LAST_VARIABLE,                 0,
 142 0,                                  0 },
 143 { UCOL_FIRST_NON_VARIABLE,            0,
 144 0,                                  0 },
 145 { UCOL_LAST_NON_VARIABLE,             0,
 146 0,                                  0 },
 147 };
 148 */
 149
 150 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
 151
 152     // Set values for the top - TODO: once we have values for all the indirects, we are going
 153     // to initalize here.
 154     ucolIndirectBoundaries[indexR].startCE = start[0];
 155     ucolIndirectBoundaries[indexR].startContCE = start[1];
 156     if(end) {
 157         ucolIndirectBoundaries[indexR].limitCE = end[0];
 158         ucolIndirectBoundaries[indexR].limitContCE = end[1];
 159     } else {
 160         ucolIndirectBoundaries[indexR].limitCE = 0;
 161         ucolIndirectBoundaries[indexR].limitContCE = 0;
 162     }
 163 }
 164
 165
 166 static inline
 167 void syntaxError(const UChar* rules,
 168                  int32_t pos,
 169                  int32_t rulesLen,
 170                  UParseError* parseError)
 171 {
 172     parseError->offset = pos;
 173     parseError->line = 0 ; /* we are not using line numbers */
 174
 175     // for pre-context
 176     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
 177     int32_t stop  = pos;
 178
 179     u_memcpy(parseError->preContext,rules+start,stop-start);
 180     //null terminate the buffer
 181     parseError->preContext[stop-start] = 0;
 182
 183     //for post-context
 184     start = pos+1;
 185     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
 186     rulesLen;
 187
 188     if(start < stop) {
 189         u_memcpy(parseError->postContext,rules+start,stop-start);
 190         //null terminate the buffer
 191         parseError->postContext[stop-start]= 0;
 192     } else {
 193         parseError->postContext[0] = 0;
 194     }
 195 }
 196
 197 static
 198 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
 199     switch(attrib) {
 200     case UCOL_HIRAGANA_QUATERNARY_MODE:
 201         opts->hiraganaQ = value;
 202         break;
 203     case UCOL_FRENCH_COLLATION:
 204         opts->frenchCollation = value;
 205         break;
 206     case UCOL_ALTERNATE_HANDLING:
 207         opts->alternateHandling = value;
 208         break;
 209     case UCOL_CASE_FIRST:
 210         opts->caseFirst = value;
 211         break;
 212     case UCOL_CASE_LEVEL:
 213         opts->caseLevel = value;
 214         break;
 215     case UCOL_NORMALIZATION_MODE:
 216         opts->normalizationMode = value;
 217         break;
 218     case UCOL_STRENGTH:
 219         opts->strength = value;
 220         break;
 221     case UCOL_NUMERIC_COLLATION:
 222         opts->numericCollation = value;
 223         break;
 224     case UCOL_ATTRIBUTE_COUNT:
 225     default:
 226         break;
 227     }
 228 }
 229
  230 #define UTOK_OPTION_COUNT 20 /* number of rows in rulesOptions[] */
  231
  232 static UBool didInit = FALSE; /* set to TRUE by ucol_uprv_tok_initData() once the U_STRING_INIT calls have run */
  233 /* we can be strict, or we can be lenient */
  234 /* I'd surely be lenient with the option arguments */
  235 /* maybe even with options */
      /* Sub-option keywords. Every U_STRING_DECL below has a matching U_STRING_INIT */
      /* (same literal, same length) in ucol_uprv_tok_initData(); keep the two in sync. */
  236 U_STRING_DECL(suboption_00, "non-ignorable", 13);
  237 U_STRING_DECL(suboption_01, "shifted",        7);
  238
  239 U_STRING_DECL(suboption_02, "lower",          5);
  240 U_STRING_DECL(suboption_03, "upper",          5);
  241 U_STRING_DECL(suboption_04, "off",            3);
  242 U_STRING_DECL(suboption_05, "on",             2);
  243 U_STRING_DECL(suboption_06, "1",              1);
  244 U_STRING_DECL(suboption_07, "2",              1);
  245 U_STRING_DECL(suboption_08, "3",              1);
  246 U_STRING_DECL(suboption_09, "4",              1);
  247 U_STRING_DECL(suboption_10, "I",              1);
  248
  249 U_STRING_DECL(suboption_11, "primary",        7);
  250 U_STRING_DECL(suboption_12, "secondary",      9);
  251 U_STRING_DECL(suboption_13, "tertiary",       8);
  252 U_STRING_DECL(suboption_14, "variable",       8);
  253 U_STRING_DECL(suboption_15, "regular",        7);
  254 U_STRING_DECL(suboption_16, "implicit",       8);
  255 U_STRING_DECL(suboption_17, "trailing",       8);
  256
  257
      /* Option names; the numbering here is unrelated to the row order of rulesOptions[]. */
  258 U_STRING_DECL(option_00,    "undefined",      9);
  259 U_STRING_DECL(option_01,    "rearrange",      9);
  260 U_STRING_DECL(option_02,    "alternate",      9);
  261 U_STRING_DECL(option_03,    "backwards",      9);
  262 U_STRING_DECL(option_04,    "variable top",  12);
  263 U_STRING_DECL(option_05,    "top",            3);
  264 U_STRING_DECL(option_06,    "normalization", 13);
  265 U_STRING_DECL(option_07,    "caseLevel",      9);
  266 U_STRING_DECL(option_08,    "caseFirst",      9);
  267 U_STRING_DECL(option_09,    "scriptOrder",   11);
  268 U_STRING_DECL(option_10,    "charsetname",   11);
  269 U_STRING_DECL(option_11,    "charset",        7);
  270 U_STRING_DECL(option_12,    "before",         6);
  271 U_STRING_DECL(option_13,    "hiraganaQ",      9);
  272 U_STRING_DECL(option_14,    "strength",       8);
  273 U_STRING_DECL(option_15,    "first",          5);
  274 U_STRING_DECL(option_16,    "last",           4);
  275 U_STRING_DECL(option_17,    "optimize",       8);
  276 U_STRING_DECL(option_18,    "suppressContractions",         20);
  277 U_STRING_DECL(option_19,    "numericOrdering",              15);
 278
 279
 280 /*
 281 [last variable] last variable value
 282 [last primary ignorable] largest CE for primary ignorable
 283 [last secondary ignorable] largest CE for secondary ignorable
 284 [last tertiary ignorable] largest CE for tertiary ignorable
 285 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
 286 */
 287
 288
  289 static const ucolTokSuboption alternateSub[2] = { /* values accepted by the "alternate" option: {keyword, keyword length, attribute value} */
  290     {suboption_00, 13, UCOL_NON_IGNORABLE},
  291     {suboption_01,  7, UCOL_SHIFTED}
  292 };
 293
  294 static const ucolTokSuboption caseFirstSub[3] = { /* values accepted by the "caseFirst" option: lower, upper, off */
  295     {suboption_02, 5, UCOL_LOWER_FIRST},
  296     {suboption_03,  5, UCOL_UPPER_FIRST},
  297     {suboption_04,  3, UCOL_OFF},
  298 };
 299
  300 static const ucolTokSuboption onOffSub[2] = { /* off/on values shared by caseLevel, normalization, hiraganaQ and numericOrdering in rulesOptions[] */
  301     {suboption_04, 3, UCOL_OFF},
  302     {suboption_05, 2, UCOL_ON}
  303 };
 304
  305 static const ucolTokSuboption frenchSub[1] = { /* "backwards" accepts only the keyword "2", which maps to UCOL_ON */
  306     {suboption_07, 1, UCOL_ON}
  307 };
 308
  309 static const ucolTokSuboption beforeSub[3] = { /* "before" accepts "1", "2" or "3"; ucol_uprv_tok_readAndSetOption() folds attrVal + 1 into its result */
  310     {suboption_06, 1, UCOL_PRIMARY},
  311     {suboption_07, 1, UCOL_SECONDARY},
  312     {suboption_08, 1, UCOL_TERTIARY}
  313 };
 314
  315 static const ucolTokSuboption strengthSub[5] = { /* "strength" accepts "1".."4" and "I" (identical) */
  316     {suboption_06, 1, UCOL_PRIMARY},
  317     {suboption_07, 1, UCOL_SECONDARY},
  318     {suboption_08, 1, UCOL_TERTIARY},
  319     {suboption_09, 1, UCOL_QUATERNARY},
  320     {suboption_10, 1, UCOL_IDENTICAL},
  321 };
 322
  323 static const ucolTokSuboption firstLastSub[7] = { /* keywords accepted after "first"/"last"; the row index (not attrVal) selects the indirect boundary in ucol_uprv_tok_readAndSetOption() */
  324     {suboption_11, 7, UCOL_PRIMARY},
  325     {suboption_12, 9, UCOL_PRIMARY},
  326     {suboption_13, 8, UCOL_PRIMARY},
  327     {suboption_14, 8, UCOL_PRIMARY},
  328     {suboption_15, 7, UCOL_PRIMARY},
  329     {suboption_16, 8, UCOL_PRIMARY},
  330     {suboption_17, 8, UCOL_PRIMARY},
  331 };
 332
  333 enum OptionNumber { /* row indices into rulesOptions[]; the order here must match that table */
  334     OPTION_ALTERNATE_HANDLING = 0,
  335     OPTION_FRENCH_COLLATION,
  336     OPTION_CASE_LEVEL,
  337     OPTION_CASE_FIRST,
  338     OPTION_NORMALIZATION_MODE,
  339     OPTION_HIRAGANA_QUATERNARY,
  340     OPTION_STRENGTH,
  341     OPTION_NUMERIC_COLLATION,
  342     OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, /* alias for the last row whose attr is a real collator attribute */
  343     OPTION_VARIABLE_TOP,
  344     OPTION_REARRANGE,
  345     OPTION_BEFORE,
  346     OPTION_TOP,
  347     OPTION_FIRST,
  348     OPTION_LAST, /* must directly follow OPTION_FIRST: the indirectIndex computation relies on it */
  349     OPTION_OPTIMIZE,
  350     OPTION_SUPPRESS_CONTRACTIONS,
  351     OPTION_UNDEFINED, /* this and the remaining values fall into the default case of ucol_uprv_tok_readAndSetOption() (U_UNSUPPORTED_ERROR) */
  352     OPTION_SCRIPT_ORDER,
  353     OPTION_CHARSET_NAME,
  354     OPTION_CHARSET
  355 } ;
 356
  357 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { /* one row per OptionNumber, in enum order: {name, name length, sub-option table, table size, attribute}; "charsetname" must stay ahead of its prefix "charset" because lookup is a first-match prefix compare */
  358     /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
  359     /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
  360     /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
  361     /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
  362     /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
  363     /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
  364     /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
  365     /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
  366     /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
  367     /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
  368     /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
  369     /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
  370     /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
  371     /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
  372     /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
  373     /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
  374     /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
  375     /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
  376     /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
  377     /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
  378 };
 379
 380 static
 381 int32_t u_strncmpNoCase(const UChar     *s1,
 382                         const UChar     *s2,
 383                         int32_t     n)
 384 {
 385     if(n > 0) {
 386         int32_t rc;
 387         for(;;) {
 388             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
 389             if(rc != 0 || *s1 == 0 || --n == 0) {
 390                 return rc;
 391             }
 392             ++s1;
 393             ++s2;
 394         }
 395     }
 396     return 0;
 397 }
 398
 399 static
 400 void ucol_uprv_tok_initData() {
 401     if(!didInit) {
 402         U_STRING_INIT(suboption_00, "non-ignorable", 13);
 403         U_STRING_INIT(suboption_01, "shifted",        7);
 404
 405         U_STRING_INIT(suboption_02, "lower",          5);
 406         U_STRING_INIT(suboption_03, "upper",          5);
 407         U_STRING_INIT(suboption_04, "off",            3);
 408         U_STRING_INIT(suboption_05, "on",             2);
 409
 410         U_STRING_INIT(suboption_06, "1",              1);
 411         U_STRING_INIT(suboption_07, "2",              1);
 412         U_STRING_INIT(suboption_08, "3",              1);
 413         U_STRING_INIT(suboption_09, "4",              1);
 414         U_STRING_INIT(suboption_10, "I",              1);
 415
 416         U_STRING_INIT(suboption_11, "primary",        7);
 417         U_STRING_INIT(suboption_12, "secondary",      9);
 418         U_STRING_INIT(suboption_13, "tertiary",       8);
 419         U_STRING_INIT(suboption_14, "variable",       8);
 420         U_STRING_INIT(suboption_15, "regular",        7);
 421         U_STRING_INIT(suboption_16, "implicit",       8);
 422         U_STRING_INIT(suboption_17, "trailing",       8);
 423
 424
 425         U_STRING_INIT(option_00, "undefined",      9);
 426         U_STRING_INIT(option_01, "rearrange",      9);
 427         U_STRING_INIT(option_02, "alternate",      9);
 428         U_STRING_INIT(option_03, "backwards",      9);
 429         U_STRING_INIT(option_04, "variable top",  12);
 430         U_STRING_INIT(option_05, "top",            3);
 431         U_STRING_INIT(option_06, "normalization", 13);
 432         U_STRING_INIT(option_07, "caseLevel",      9);
 433         U_STRING_INIT(option_08, "caseFirst",      9);
 434         U_STRING_INIT(option_09, "scriptOrder",   11);
 435         U_STRING_INIT(option_10, "charsetname",   11);
 436         U_STRING_INIT(option_11, "charset",        7);
 437         U_STRING_INIT(option_12, "before",         6);
 438         U_STRING_INIT(option_13, "hiraganaQ",      9);
 439         U_STRING_INIT(option_14, "strength",       8);
 440         U_STRING_INIT(option_15, "first",          5);
 441         U_STRING_INIT(option_16, "last",           4);
 442         U_STRING_INIT(option_17, "optimize",       8);
 443         U_STRING_INIT(option_18, "suppressContractions",         20);
 444         U_STRING_INIT(option_19, "numericOrdering",      15);
 445         didInit = TRUE;
 446     }
 447 }
 448
 449
 450 // This function reads basic options to set in the runtime collator
 451 // used by data driven tests. Should not support build time options
 452 U_CAPI const UChar * U_EXPORT2
 453 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
 454                          UColAttribute *attrib, UColAttributeValue *value,
 455                          UErrorCode *status)
 456 {
 457     uint32_t i = 0;
 458     int32_t j=0;
 459     UBool foundOption = FALSE;
 460     const UChar *optionArg = NULL;
 461
 462     ucol_uprv_tok_initData();
 463
 464     while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
 465         start++;
 466     }
 467     if(start >= end) {
 468         return NULL;
 469     }
 470     /* skip opening '[' */
 471     if(*start == 0x005b) {
 472         start++;
 473     } else {
 474         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
 475         return NULL;
 476     }
 477
 478     while(i < UTOK_OPTION_COUNT) {
 479         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
 480             foundOption = TRUE;
 481             if(end - start > rulesOptions[i].optionLen) {
 482                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
 483                 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
 484                     optionArg++;
 485                 }
 486             }
 487             break;
 488         }
 489         i++;
 490     }
 491
 492     if(!foundOption) {
 493         *status = U_ILLEGAL_ARGUMENT_ERROR;
 494         return NULL;
 495     }
 496
 497     if(optionArg) {
 498         for(j = 0; j<rulesOptions[i].subSize; j++) {
 499             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 500                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
 501                 *attrib = rulesOptions[i].attr;
 502                 *value = rulesOptions[i].subopts[j].attrVal;
 503                 optionArg += rulesOptions[i].subopts[j].subLen;
 504                 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
 505                     optionArg++;
 506                 }
 507                 if(*optionArg == 0x005d) {
 508                     optionArg++;
 509                     return optionArg;
 510                 } else {
 511                     *status = U_ILLEGAL_ARGUMENT_ERROR;
 512                     return NULL;
 513                 }
 514             }
 515         }
 516     }
 517     *status = U_ILLEGAL_ARGUMENT_ERROR;
 518     return NULL;
 519 }
 520
 521 static
 522 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
 523     while(*start != 0x005b) { /* advance while we find the first '[' */
 524         start++;
 525     }
 526     // now we need to get a balanced set of '[]'. The problem is that a set can have
 527     // many, and *end point to the first closing '['
 528     int32_t noOpenBraces = 1;
 529     int32_t current = 1; // skip the opening brace
 530     while(start+current < end && noOpenBraces != 0) {
 531         if(start[current] == 0x005b) {
 532             noOpenBraces++;
 533         } else if(start[current] == 0x005D) { // closing brace
 534             noOpenBraces--;
 535         }
 536         current++;
 537     }
 538
 539     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
 540         *status = U_ILLEGAL_ARGUMENT_ERROR;
 541         return NULL;
 542     }
 543     return uset_openPattern(start, current, status);
 544 }
 545
 546 static
 547 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
 548     int32_t i = 0;
 549     ucol_uprv_tok_initData();
 550
 551     while(u_isWhitespace(*start)) { /* eat whitespace */
 552         start++;
 553     }
 554     while(i < UTOK_OPTION_COUNT) {
 555         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
 556             if(end - start > rulesOptions[i].optionLen) {
 557                 *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
 558                 while(u_isWhitespace(**optionArg)) { /* eat whitespace */
 559                     (*optionArg)++;
 560                 }
 561             }
 562             break;
 563         }
 564         i++;
 565     }
 566     if(i == UTOK_OPTION_COUNT) {
 567         i = -1; // didn't find an option
 568     }
 569     return i;
 570 }
 571
 572
 573 // reads and conforms to various options in rules
 574 // end is the position of the first closing ']'
 575 // However, some of the options take an UnicodeSet definition
 576 // which needs to duplicate the closing ']'
 577 // for example: '[copy [\uAC00-\uD7FF]]'
 578 // These options will move end to the second ']' and the
 579 // caller will set the current to it.
 580 static
 581 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
 582     const UChar* start = src->current;
 583     int32_t i = 0;
 584     int32_t j=0;
 585     const UChar *optionArg = NULL;
 586
 587     uint8_t result = 0;
 588
 589     start++; /*skip opening '['*/
 590     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
 591     if(optionArg) {
 592         src->current = optionArg;
 593     }
 594
 595     if(i < 0) {
 596         *status = U_ILLEGAL_ARGUMENT_ERROR;
 597     } else {
 598         int32_t noOpenBraces = 1;
 599         switch(i) {
 600     case OPTION_ALTERNATE_HANDLING:
 601     case OPTION_FRENCH_COLLATION:
 602     case OPTION_CASE_LEVEL:
 603     case OPTION_CASE_FIRST:
 604     case OPTION_NORMALIZATION_MODE:
 605     case OPTION_HIRAGANA_QUATERNARY:
 606     case OPTION_STRENGTH:
 607     case OPTION_NUMERIC_COLLATION:
 608         if(optionArg) {
 609             for(j = 0; j<rulesOptions[i].subSize; j++) {
 610                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 611                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
 612                     result =  UCOL_TOK_SUCCESS;
 613                 }
 614             }
 615         }
 616         if(result == 0) {
 617             *status = U_ILLEGAL_ARGUMENT_ERROR;
 618         }
 619         break;
 620     case OPTION_VARIABLE_TOP:
 621         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
 622         break;
 623     case OPTION_REARRANGE:
 624         result = UCOL_TOK_SUCCESS;
 625         break;
 626     case OPTION_BEFORE:
 627         if(optionArg) {
 628             for(j = 0; j<rulesOptions[i].subSize; j++) {
 629                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 630                     result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
 631                 }
 632             }
 633         }
 634         if(result == 0) {
 635             *status = U_ILLEGAL_ARGUMENT_ERROR;
 636         }
 637         break;
 638     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
 639         /* index to this array will be src->parsedToken.indirectIndex*/
 640         src->parsedToken.indirectIndex = 0;
 641         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
 642         break;
 643     case OPTION_FIRST:
 644     case OPTION_LAST: /* first, last */
 645         for(j = 0; j<rulesOptions[i].subSize; j++) {
 646             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 647                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
 648                 // element of indirect boundaries is reserved for top.
 649                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
 650                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
 651             }
 652         }
 653         if(result == 0) {
 654             *status = U_ILLEGAL_ARGUMENT_ERROR;
 655         }
 656         break;
 657     case OPTION_OPTIMIZE:
 658     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
 659         // we need to move end here
 660         src->current++; // skip opening brace
 661         while(src->current < src->end && noOpenBraces != 0) {
 662             if(*src->current == 0x005b) {
 663                 noOpenBraces++;
 664             } else if(*src->current == 0x005D) { // closing brace
 665                 noOpenBraces--;
 666             }
 667             src->current++;
 668         }
 669         result = UCOL_TOK_SUCCESS;
 670         break;
 671     default:
 672         *status = U_UNSUPPORTED_ERROR;
 673         break;
 674         }
 675     }
 676     src->current = u_memchr(src->current, 0x005d, src->end-src->current);
 677     return result;
 678 }
 679
 680
 681 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
 682     if(src->extraCurrent+len >= src->extraEnd) {
 683         /* reallocate */
 684         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
 685         if(newSrc != NULL) {
 686             src->current = newSrc + (src->current - src->source);
 687             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
 688             src->end = newSrc + (src->end - src->source);
 689             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
 690             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
 691             src->source = newSrc;
 692         } else {
 693             *status = U_MEMORY_ALLOCATION_ERROR;
 694         }
 695     }
 696     if(len == 1) {
 697         *src->extraCurrent++ = *stuff;
 698     } else {
 699         uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
 700         src->extraCurrent += len;
 701     }
 702
 703
 704 }
 705
 706 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
 707     /*
 708     top = TRUE;
 709     */
 710     UChar buff[5];
 711     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 712     buff[0] = 0xFFFE;
 713     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
 714     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
 715     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
 716         src->parsedToken.charsLen = 3;
 717         ucol_tok_addToExtraCurrent(src, buff, 3, status);
 718     } else {
 719         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
 720         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
 721         src->parsedToken.charsLen = 5;
 722         ucol_tok_addToExtraCurrent(src, buff, 5, status);
 723     }
 724     return TRUE;
 725 }
 726
 727 static UBool isCharNewLine(UChar c){
 728     switch(c){
 729     case 0x000A: /* LF  */
 730     case 0x000D: /* CR  */
 731     case 0x000C: /* FF  */
 732     case 0x0085: /* NEL */
 733     case 0x2028: /* LS  */
 734     case 0x2029: /* PS  */
 735         return TRUE;
 736     default:
 737         return FALSE;
 738     }
 739 }
 740
 741 U_CAPI const UChar* U_EXPORT2
 742 ucol_tok_parseNextToken(UColTokenParser *src,
 743                         UBool startOfRules,
 744                         UParseError *parseError,
 745                         UErrorCode *status)
 746 {
 747     /* parsing part */
 748     UBool variableTop = FALSE;
 749     UBool top = FALSE;
 750     UBool inChars = TRUE;
 751     UBool inQuote = FALSE;
 752     UBool wasInQuote = FALSE;
 753     uint8_t before = 0;
 754     UBool isEscaped = FALSE;
 755     // TODO: replace these variables with src->parsedToken counterparts
 756     // no need to use them anymore since we have src->parsedToken.
 757     // Ideally, token parser would be a nice class... Once, when I have
 758     // more time (around 2020 probably).
 759     uint32_t newExtensionLen = 0;
 760     uint32_t extensionOffset = 0;
 761     uint32_t newStrength = UCOL_TOK_UNSET;
 762     UChar buff[10];
 763
 764     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
 765     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
 766     src->parsedToken.indirectIndex = 0;
 767
 768     while (src->current < src->end) {
 769         UChar ch = *(src->current);
 770
 771         if (inQuote) {
 772             if (ch == 0x0027/*'\''*/) {
 773                 inQuote = FALSE;
 774             } else {
 775                 if ((src->parsedToken.charsLen == 0) || inChars) {
 776                     if(src->parsedToken.charsLen == 0) {
 777                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 778                     }
 779                     src->parsedToken.charsLen++;
 780                 } else {
 781                     if(newExtensionLen == 0) {
 782                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
 783                     }
 784                     newExtensionLen++;
 785                 }
 786             }
 787         }else if(isEscaped){
 788             isEscaped =FALSE;
 789             if (newStrength == UCOL_TOK_UNSET) {
 790                 *status = U_INVALID_FORMAT_ERROR;
 791                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 792                 return NULL;
 793                 // enabling rules to start with non-tokens a < b
 794                 // newStrength = UCOL_TOK_RESET;
 795             }
 796             if(ch != 0x0000  && src->current != src->end) {
 797                 if (inChars) {
 798                     if(src->parsedToken.charsLen == 0) {
 799                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
 800                     }
 801                     src->parsedToken.charsLen++;
 802                 } else {
 803                     if(newExtensionLen == 0) {
 804                         extensionOffset = (uint32_t)(src->current - src->source);
 805                     }
 806                     newExtensionLen++;
 807                 }
 808             }
 809         }else {
 810             if(!uprv_isRuleWhiteSpace(ch)) {
 811                 /* Sets the strength for this entry */
 812                 switch (ch) {
 813                 case 0x003D/*'='*/ :
 814                     if (newStrength != UCOL_TOK_UNSET) {
 815                         goto EndOfLoop;
 816                     }
 817
 818                     /* if we start with strength, we'll reset to top */
 819                     if(startOfRules == TRUE) {
 820                         src->parsedToken.indirectIndex = 5;
 821                         top = ucol_tok_doSetTop(src, status);
 822                         newStrength = UCOL_TOK_RESET;
 823                         goto EndOfLoop;
 824                     }
 825                     newStrength = UCOL_IDENTICAL;
 826                     break;
 827
 828                 case 0x002C/*','*/:
 829                     if (newStrength != UCOL_TOK_UNSET) {
 830                         goto EndOfLoop;
 831                     }
 832
 833                     /* if we start with strength, we'll reset to top */
 834                     if(startOfRules == TRUE) {
 835                         src->parsedToken.indirectIndex = 5;
 836                         top = ucol_tok_doSetTop(src, status);
 837                         newStrength = UCOL_TOK_RESET;
 838                         goto EndOfLoop;
 839                     }
 840                     newStrength = UCOL_TERTIARY;
 841                     break;
 842
 843                 case  0x003B/*';'*/:
 844                     if (newStrength != UCOL_TOK_UNSET) {
 845                         goto EndOfLoop;
 846                     }
 847
 848                     /* if we start with strength, we'll reset to top */
 849                     if(startOfRules == TRUE) {
 850                         src->parsedToken.indirectIndex = 5;
 851                         top = ucol_tok_doSetTop(src, status);
 852                         newStrength = UCOL_TOK_RESET;
 853                         goto EndOfLoop;
 854                     }
 855                     newStrength = UCOL_SECONDARY;
 856                     break;
 857
 858                 case 0x003C/*'<'*/:
 859                     if (newStrength != UCOL_TOK_UNSET) {
 860                         goto EndOfLoop;
 861                     }
 862
 863                     /* if we start with strength, we'll reset to top */
 864                     if(startOfRules == TRUE) {
 865                         src->parsedToken.indirectIndex = 5;
 866                         top = ucol_tok_doSetTop(src, status);
 867                         newStrength = UCOL_TOK_RESET;
 868                         goto EndOfLoop;
 869                     }
 870                     /* before this, do a scan to verify whether this is */
 871                     /* another strength */
 872                     if(*(src->current+1) == 0x003C) {
 873                         src->current++;
 874                         if(*(src->current+1) == 0x003C) {
 875                             src->current++; /* three in a row! */
 876                             newStrength = UCOL_TERTIARY;
 877                         } else { /* two in a row */
 878                             newStrength = UCOL_SECONDARY;
 879                         }
 880                     } else { /* just one */
 881                         newStrength = UCOL_PRIMARY;
 882                     }
 883                     break;
 884
 885                 case 0x0026/*'&'*/:
 886                     if (newStrength != UCOL_TOK_UNSET) {
 887                         /**/
 888                         goto EndOfLoop;
 889                     }
 890
 891                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
 892                     break;
 893
 894                 case 0x005b/*'['*/:
 895                     /* options - read an option, analyze it */
 896                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
 897                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
 898                         if(U_SUCCESS(*status)) {
 899                             if(result & UCOL_TOK_TOP) {
 900                                 if(newStrength == UCOL_TOK_RESET) {
 901                                     top = ucol_tok_doSetTop(src, status);
 902                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
 903                                         src->parsedToken.charsLen+=2;
 904                                         buff[0] = 0x002d;
 905                                         buff[1] = before;
 906                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
 907                                     }
 908
 909                                     src->current++;
 910                                     goto EndOfLoop;
 911                                 } else {
 912                                     *status = U_INVALID_FORMAT_ERROR;
 913                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 914                                 }
 915                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
 916                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
 917                                     variableTop = TRUE;
 918                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 919                                     src->parsedToken.charsLen = 1;
 920                                     buff[0] = 0xFFFF;
 921                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
 922                                     src->current++;
 923                                     goto EndOfLoop;
 924                                 } else {
 925                                     *status = U_INVALID_FORMAT_ERROR;
 926                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 927                                 }
 928                             } else if (result & UCOL_TOK_BEFORE){
 929                                 if(newStrength == UCOL_TOK_RESET) {
 930                                     before = result & UCOL_TOK_BEFORE;
 931                                 } else {
 932                                     *status = U_INVALID_FORMAT_ERROR;
 933                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 934
 935                                 }
 936                             }
 937                         } else {
 938                             *status = U_INVALID_FORMAT_ERROR;
 939                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 940                             return NULL;
 941                         }
 942                     }
 943                     break;
 944                 case 0x0021/*! skip java thai modifier reordering*/:
 945                     break;
 946                 case 0x002F/*'/'*/:
 947                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
 948                     inChars = FALSE; /* we're now processing expansion */
 949                     break;
 950                 case 0x005C /* back slash for escaped chars */:
 951                     isEscaped = TRUE;
 952                     break;
 953                     /* found a quote, we're gonna start copying */
 954                 case 0x0027/*'\''*/:
 955                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
 956                         *status = U_INVALID_FORMAT_ERROR;
 957                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 958                         return NULL;
 959                         // enabling rules to start with a non-token character a < b
 960                         // newStrength = UCOL_TOK_RESET;
 961                     }
 962
 963                     inQuote = TRUE;
 964
 965                     if(inChars) { /* we're doing characters */
 966                         if(wasInQuote == FALSE) {
 967                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 968                         }
 969                         if (src->parsedToken.charsLen != 0) {
 970                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
 971                         }
 972                         src->parsedToken.charsLen++;
 973                     } else { /* we're doing an expansion */
 974                         if(wasInQuote == FALSE) {
 975                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
 976                         }
 977                         if (newExtensionLen != 0) {
 978                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
 979                         }
 980                         newExtensionLen++;
 981                     }
 982
 983                     wasInQuote = TRUE;
 984
 985                     ch = *(++(src->current));
 986                     if(ch == 0x0027) { /* copy the double quote */
 987                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
 988                         inQuote = FALSE;
 989                     }
 990                     break;
 991
 992                     /* '@' is french only if the strength is not currently set */
 993                     /* if it is, it's just a regular character in collation rules */
 994                 case 0x0040/*'@'*/:
 995                     if (newStrength == UCOL_TOK_UNSET) {
 996                         src->opts->frenchCollation = UCOL_ON;
 997                         break;
 998                     }
 999
1000                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1001                     // we want to store read characters to the prefix part and continue reading
1002                     // the characters (proper way would be to restart reading the chars, but in
1003                     // that case we would have to complicate the token hasher, which I do not
1004                     // intend to play with. Instead, we will do prefixes when prefixes are due
1005                     // (before adding the elements).
1006                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1007                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
1008
1009                     if(inChars) { /* we're doing characters */
1010                         if(wasInQuote == FALSE) {
1011                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1012                         }
1013                         if (src->parsedToken.charsLen != 0) {
1014                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1015                         }
1016                         src->parsedToken.charsLen++;
1017                     }
1018
1019                     wasInQuote = TRUE;
1020
1021                     do {
1022                         ch = *(++(src->current));
1023                         // skip whitespace between '|' and the character
1024                     } while (uprv_isRuleWhiteSpace(ch));
1025                     break;
1026
1027                     //charsOffset = 0;
1028                     //newCharsLen = 0;
1029                     //break; // We want to store the whole prefix/character sequence. If we break
1030                     // the '|' is going to get lost.
1031                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1032                     do {
1033                         ch = *(++(src->current));
1034                     } while (!isCharNewLine(ch));
1035
1036                     break;
1037                 default:
1038                     if (newStrength == UCOL_TOK_UNSET) {
1039                         *status = U_INVALID_FORMAT_ERROR;
1040                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1041                         return NULL;
1042                     }
1043
1044                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1045                         *status = U_INVALID_FORMAT_ERROR;
1046                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1047                         return NULL;
1048                     }
1049
1050                     if(ch == 0x0000 && src->current+1 == src->end) {
1051                         break;
1052                     }
1053
1054                     if (inChars) {
1055                         if(src->parsedToken.charsLen == 0) {
1056                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1057                         }
1058                         src->parsedToken.charsLen++;
1059                     } else {
1060                         if(newExtensionLen == 0) {
1061                             extensionOffset = (uint32_t)(src->current - src->source);
1062                         }
1063                         newExtensionLen++;
1064                     }
1065
1066                     break;
1067                 }
1068             }
1069         }
1070
1071         if(wasInQuote) {
1072             if(ch != 0x27) {
1073                 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1074                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1075                 }
1076             }
1077         }
1078
1079         src->current++;
1080     }
1081
1082 EndOfLoop:
1083     wasInQuote = FALSE;
1084     if (newStrength == UCOL_TOK_UNSET) {
1085         return NULL;
1086     }
1087
1088     if (src->parsedToken.charsLen == 0 && top == FALSE) {
1089         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1090         *status = U_INVALID_FORMAT_ERROR;
1091         return NULL;
1092     }
1093
1094     src->parsedToken.strength = newStrength;
1095     src->parsedToken.extensionOffset = extensionOffset;
1096     src->parsedToken.extensionLen = newExtensionLen;
1097     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1098
1099     return src->current;
1100 }
1101
1102 /*
1103 Processing Description
1104 1 Build a ListList. Each list has a header, which contains two lists (positive
1105 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1106 reset may be null.
1107 2 As you process, you keep a LAST pointer that points to the last token you
1108 handled.
1109 */
1110
1111 static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
1112                                       UParseError *parseError, UErrorCode *status)
1113 {
1114     if(src->resultLen == src->listCapacity) {
1115         // Unfortunately, this won't work, as we store addresses of lhs in token
1116         src->listCapacity *= 2;
1117         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1118         if(src->lh == NULL) {
1119             *status = U_MEMORY_ALLOCATION_ERROR;
1120             return NULL;
1121         }
1122     }
1123     /* do the reset thing */
1124     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1125     /* test for NULL */
1126     if (sourceToken == NULL) {
1127         *status = U_MEMORY_ALLOCATION_ERROR;
1128         return NULL;
1129     }
1130     sourceToken->rulesToParse = src->source;
1131     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1132     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1133
1134     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1135     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1136
1137     // keep the flags around so that we know about before
1138     sourceToken->flags = src->parsedToken.flags;
1139
1140     if(src->parsedToken.prefixOffset != 0) {
1141         // this is a syntax error
1142         *status = U_INVALID_FORMAT_ERROR;
1143         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1144         uprv_free(sourceToken);
1145         return 0;
1146     } else {
1147         sourceToken->prefix = 0;
1148     }
1149
1150     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1151     sourceToken->strength = UCOL_TOK_RESET;
1152     sourceToken->next = NULL;
1153     sourceToken->previous = NULL;
1154     sourceToken->noOfCEs = 0;
1155     sourceToken->noOfExpCEs = 0;
1156     sourceToken->listHeader = &src->lh[src->resultLen];
1157
1158     src->lh[src->resultLen].first = NULL;
1159     src->lh[src->resultLen].last = NULL;
1160     src->lh[src->resultLen].first = NULL;
1161     src->lh[src->resultLen].last = NULL;
1162
1163     src->lh[src->resultLen].reset = sourceToken;
1164
1165     /*
1166     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1167     First convert all expansions into normal form. Examples:
1168     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1169     d * ... into &x * c/y * d * ...
1170     Note: reset values can never have expansions, although they can cause the
1171     very next item to have one. They may be contractions, if they are found
1172     earlier in the list.
1173     */
1174     *expandNext = 0;
1175     if(expand != NULL) {
1176         /* check to see if there is an expansion */
1177         if(src->parsedToken.charsLen > 1) {
1178             uint32_t resetCharsOffset;
1179             resetCharsOffset = (uint32_t)(expand - src->source);
1180             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1181             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1182         }
1183     }
1184
1185     src->resultLen++;
1186
1187     uhash_put(src->tailored, sourceToken, sourceToken, status);
1188
1189     return sourceToken;
1190 }
1191
1192 static
1193 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1194     if(U_FAILURE(*status)) {
1195         return NULL;
1196     }
1197     /* this is a virgin before - we need to fish the anchor from the UCA */
1198     collIterate s;
1199     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1200     uint32_t CE, SecondCE;
1201     uint32_t invPos;
1202     if(sourceToken != NULL) {
1203         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
1204     } else {
1205         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
1206     }
1207
1208     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1209     baseContCE = ucol_getNextCE(src->UCA, &s, status);
1210     if(baseContCE == UCOL_NO_MORE_CES) {
1211         baseContCE = 0;
1212     }
1213
1214
1215     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1216     uint32_t ch = 0;
1217     uint32_t expandNext = 0;
1218     UColToken key;
1219
1220     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1221         uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1222         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1223         ch = uprv_uca_getCodePointFromRaw(raw-1);
1224         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1225         CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1226         SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1227
1228         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1229         *src->extraCurrent++ = 0xFFFE;
1230         *src->extraCurrent++ = (UChar)ch;
1231         src->parsedToken.charsLen++;
1232
1233         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1234         key.rulesToParse = src->source;
1235
1236         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1237         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1238
1239         if(sourceToken == NULL) {
1240             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1241             if(isContinuation(SecondCE)) {
1242                 src->lh[src->resultLen].baseContCE = SecondCE;
1243             } else {
1244                 src->lh[src->resultLen].baseContCE = 0;
1245             }
1246             src->lh[src->resultLen].nextCE = 0;
1247             src->lh[src->resultLen].nextContCE = 0;
1248             src->lh[src->resultLen].previousCE = 0;
1249             src->lh[src->resultLen].previousContCE = 0;
1250
1251             src->lh[src->resultLen].indirect = FALSE;
1252
1253             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1254         }
1255
1256     } else {
1257         invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1258
1259         // we got the previous CE. Now we need to see if the difference between
1260         // the two CEs is really of the requested strength.
1261         // if it's a bigger difference (we asked for secondary and got primary), we
1262         // need to modify the CE.
1263         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1264             // adjust the strength
1265             // now we are in the situation where our baseCE should actually be modified in
1266             // order to get the CE in the right position.
1267             if(strength == UCOL_SECONDARY) {
1268                 CE = baseCE - 0x0200;
1269             } else { // strength == UCOL_TERTIARY
1270                 CE = baseCE - 0x02;
1271             }
1272             if(baseContCE) {
1273                 if(strength == UCOL_SECONDARY) {
1274                     SecondCE = baseContCE - 0x0200;
1275                 } else { // strength == UCOL_TERTIARY
1276                     SecondCE = baseContCE - 0x02;
1277                 }
1278             }
1279         }
1280
1281 #if 0
1282         // the code below relies on getting a code point from the inverse table, in order to be
1283         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1284         // 1. There are many code points that have the same CE
1285         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1286         // Also, in case when there is no equivalent strength before an element, we have to actually
1287         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1288         // before a is a primary difference.
1289
1290         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1291
1292
1293         ch = CETable[3*invPos+2];
1294
1295         if((ch &  UCOL_INV_SIZEMASK) != 0) {
1296             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1297             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1298             ch = conts[offset];
1299         }
1300
1301         *src->extraCurrent++ = (UChar)ch;
1302         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1303         src->parsedToken.charsLen = 1;
1304
1305         // We got an UCA before. However, this might have been tailored.
1306         // example:
1307         // &\u30ca = \u306a
1308         // &[before 3]\u306a<<<\u306a|\u309d
1309
1310
1311         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1312         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1313         key.rulesToParse = src->source;
1314
1315         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1316         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1317 #endif
1318
1319         // here is how it should be. The situation such as &[before 1]a < x, should be
1320         // resolved exactly as if we wrote &a > x.
1321         // therefore, I don't really care if the UCA value before a has been changed.
1322         // However, I do care if the strength between my element and the previous element
1323         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1324         // have to construct the base CE.
1325
1326
1327
1328         // if we found a tailored thing, we have to use the UCA value and construct
1329         // a new reset token with constructed name
1330         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1331         // character to which we want to anchor is already tailored.
1332         // We need to construct a new token which will be the anchor
1333         // point
1334         //*(src->extraCurrent-1) = 0xFFFE;
1335         //*src->extraCurrent++ = (UChar)ch;
1336         // grab before
1337         src->parsedToken.charsOffset -= 10;
1338         src->parsedToken.charsLen += 10;
1339         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1340         if(isContinuation(SecondCE)) {
1341             src->lh[src->resultLen].baseContCE = SecondCE;
1342         } else {
1343             src->lh[src->resultLen].baseContCE = 0;
1344         }
1345         src->lh[src->resultLen].nextCE = 0;
1346         src->lh[src->resultLen].nextContCE = 0;
1347         src->lh[src->resultLen].previousCE = 0;
1348         src->lh[src->resultLen].previousContCE = 0;
1349
1350         src->lh[src->resultLen].indirect = FALSE;
1351
1352         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1353         //}
1354     }
1355
1356     return sourceToken;
1357
1358 }
1359
1360 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1361     UColToken *lastToken = NULL;
1362     const UChar *parseEnd = NULL;
1363     uint32_t expandNext = 0;
1364     UBool variableTop = FALSE;
1365     UBool top = FALSE;
1366     uint16_t specs = 0;
1367     UColTokListHeader *ListList = NULL;
1368
1369     src->parsedToken.strength = UCOL_TOK_UNSET;
1370
1371     ListList = src->lh;
1372
1373     if(U_FAILURE(*status)) {
1374         return 0;
1375     }
1376
1377     while(src->current < src->end) {
1378         src->parsedToken.prefixOffset = 0;
1379
1380         parseEnd = ucol_tok_parseNextToken(src,
1381             (UBool)(lastToken == NULL),
1382             parseError,
1383             status);
1384
1385         specs = src->parsedToken.flags;
1386
1387
1388         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1389         top = ((specs & UCOL_TOK_TOP) != 0);
1390
1391         if(U_SUCCESS(*status) && parseEnd != NULL) {
1392             UColToken *sourceToken = NULL;
1393             //uint32_t key = 0;
1394             uint32_t lastStrength = UCOL_TOK_UNSET;
1395
1396             if(lastToken != NULL ) {
1397                 lastStrength = lastToken->strength;
1398             }
1399
1400             //key = newCharsLen << 24 | charsOffset;
1401             UColToken key;
1402             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1403             key.rulesToParse = src->source;
1404
1405             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
1406             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1407
1408             if(src->parsedToken.strength != UCOL_TOK_RESET) {
1409                 if(lastToken == NULL) { /* this means that rules haven't started properly */
1410                     *status = U_INVALID_FORMAT_ERROR;
1411                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1412                     return 0;
1413                 }
1414                 /*  6 Otherwise (when relation != reset) */
1415                 if(sourceToken == NULL) {
1416                     /* If sourceToken is null, create new one, */
1417                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1418                     /* test for NULL */
1419                     if (sourceToken == NULL) {
1420                         *status = U_MEMORY_ALLOCATION_ERROR;
1421                         return 0;
1422                     }
1423                     sourceToken->rulesToParse = src->source;
1424                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1425
1426                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1427
1428                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1429                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1430
1431                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1432                     sourceToken->next = NULL;
1433                     sourceToken->previous = NULL;
1434                     sourceToken->noOfCEs = 0;
1435                     sourceToken->noOfExpCEs = 0;
1436                     // keep the flags around so that we know about before
1437                     sourceToken->flags = src->parsedToken.flags;
1438                     uhash_put(src->tailored, sourceToken, sourceToken, status);
1439                     if(U_FAILURE(*status)) {
1440                         return 0;
1441                     }
1442                 } else {
1443                     /* we could have fished out a reset here */
1444                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1445                         /* otherwise remove sourceToken from where it was. */
1446                         if(sourceToken->next != NULL) {
1447                             if(sourceToken->next->strength > sourceToken->strength) {
1448                                 sourceToken->next->strength = sourceToken->strength;
1449                             }
1450                             sourceToken->next->previous = sourceToken->previous;
1451                         } else {
1452                             sourceToken->listHeader->last = sourceToken->previous;
1453                         }
1454
1455                         if(sourceToken->previous != NULL) {
1456                             sourceToken->previous->next = sourceToken->next;
1457                         } else {
1458                             sourceToken->listHeader->first = sourceToken->next;
1459                         }
1460                         sourceToken->next = NULL;
1461                         sourceToken->previous = NULL;
1462                     }
1463                 }
1464
1465                 sourceToken->strength = src->parsedToken.strength;
1466                 sourceToken->listHeader = lastToken->listHeader;
1467
1468                 /*
1469                 1.  Find the strongest strength in each list, and set strongestP and strongestN
1470                 accordingly in the headers.
1471                 */
1472                 if(lastStrength == UCOL_TOK_RESET
1473                     || sourceToken->listHeader->first == 0) {
1474                         /* If LAST is a reset
1475                         insert sourceToken in the list. */
1476                         if(sourceToken->listHeader->first == 0) {
1477                             sourceToken->listHeader->first = sourceToken;
1478                             sourceToken->listHeader->last = sourceToken;
1479                         } else { /* we need to find a place for us */
1480                             /* and we'll get in front of the same strength */
1481                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1482                                 sourceToken->next = sourceToken->listHeader->first;
1483                                 sourceToken->next->previous = sourceToken;
1484                                 sourceToken->listHeader->first = sourceToken;
1485                                 sourceToken->previous = NULL;
1486                             } else {
1487                                 lastToken = sourceToken->listHeader->first;
1488                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1489                                     lastToken = lastToken->next;
1490                                 }
1491                                 if(lastToken->next != NULL) {
1492                                     lastToken->next->previous = sourceToken;
1493                                 } else {
1494                                     sourceToken->listHeader->last = sourceToken;
1495                                 }
1496                                 sourceToken->previous = lastToken;
1497                                 sourceToken->next = lastToken->next;
1498                                 lastToken->next = sourceToken;
1499                             }
1500                         }
1501                     } else {
1502                         /* Otherwise (when LAST is not a reset)
1503                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1504                         otherwise insert before.
1505                         when inserting after or before, search to the next position with the same
1506                         strength in that direction. (This is called postpone insertion).         */
1507                         if(sourceToken != lastToken) {
1508                             if(lastToken->polarity == sourceToken->polarity) {
1509                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1510                                     lastToken = lastToken->next;
1511                                 }
1512                                 sourceToken->previous = lastToken;
1513                                 if(lastToken->next != NULL) {
1514                                     lastToken->next->previous = sourceToken;
1515                                 } else {
1516                                     sourceToken->listHeader->last = sourceToken;
1517                                 }
1518
1519                                 sourceToken->next = lastToken->next;
1520                                 lastToken->next = sourceToken;
1521                             } else {
1522                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1523                                     lastToken = lastToken->previous;
1524                                 }
1525                                 sourceToken->next = lastToken;
1526                                 if(lastToken->previous != NULL) {
1527                                     lastToken->previous->next = sourceToken;
1528                                 } else {
1529                                     sourceToken->listHeader->first = sourceToken;
1530                                 }
1531                                 sourceToken->previous = lastToken->previous;
1532                                 lastToken->previous = sourceToken;
1533                             }
1534                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
1535                             if(lastStrength < sourceToken->strength) {
1536                                 sourceToken->strength = lastStrength;
1537                             }
1538                         }
1539                     }
1540
1541                     /* if the token was a variable top, we're gonna put it in */
1542                     if(variableTop == TRUE && src->varTop == NULL) {
1543                         variableTop = FALSE;
1544                         src->varTop = sourceToken;
1545                     }
1546
1547                     // Treat the expansions.
1548                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1549                     // (&abc * d * e <=> &ab * d / c * e / c)
1550                     // if both of them are in effect for a token, they are combined.
1551
1552                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1553
1554                     if(expandNext != 0) {
1555                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1556                             expandNext = 0;
1557                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1558                             sourceToken->expansion = expandNext;
1559                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
1560                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1561                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1562                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1563                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1564                         }
1565                     }
1566
1567                     // This is just for debugging purposes
1568                     if(sourceToken->expansion != 0) {
1569                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1570                     } else {
1571                         sourceToken->debugExpansion = 0;
1572                     }
1573                     // if the previous token was a reset before, the strength of this
1574                     // token must match the strength of before. Otherwise we have an
1575                     // undefined situation.
1576                     // In other words, we currently have a cludge which we use to
1577                     // represent &a >> x. This is written as &[before 2]a << x.
1578                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1579                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1580                         if(beforeStrength != sourceToken->strength) {
1581                             *status = U_INVALID_FORMAT_ERROR;
1582                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1583                             return 0;
1584                         }
1585                     }
1586             } else {
1587                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1588                     /* if the previous token was also a reset, */
1589                     /*this means that we have two consecutive resets */
1590                     /* and we want to remove the previous one if empty*/
1591                     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1592                         src->resultLen--;
1593                     }
1594                 }
1595
1596                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1597                     uint32_t searchCharsLen = src->parsedToken.charsLen;
1598                     while(searchCharsLen > 1 && sourceToken == NULL) {
1599                         searchCharsLen--;
1600                         //key = searchCharsLen << 24 | charsOffset;
1601                         UColToken key;
1602                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1603                         key.rulesToParse = src->source;
1604                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1605                     }
1606                     if(sourceToken != NULL) {
1607                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1608                     }
1609                 }
1610
1611                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1612                     if(top == FALSE) { /* there is no indirection */
1613                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1614                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1615                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1616                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1617                                 sourceToken = sourceToken->previous;
1618                             }
1619                             /* here, either we hit the strength or NULL */
1620                             if(sourceToken->strength == strength) {
1621                                 if(sourceToken->previous != NULL) {
1622                                     sourceToken = sourceToken->previous;
1623                                 } else { /* start of list */
1624                                     sourceToken = sourceToken->listHeader->reset;
1625                                 }
1626                             } else { /* we hit NULL */
1627                                 /* we should be doing the else part */
1628                                 sourceToken = sourceToken->listHeader->reset;
1629                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1630                             }
1631                         } else {
1632                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1633                         }
1634                     } else { /* this is both before and indirection */
1635                         top = FALSE;
1636                         ListList[src->resultLen].previousCE = 0;
1637                         ListList[src->resultLen].previousContCE = 0;
1638                         ListList[src->resultLen].indirect = TRUE;
1639                         /* we need to do slightly more work. we need to get the baseCE using the */
1640                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
1641                         /* in ucol_bld */
1642                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1643                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1644                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
1645                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1646
1647                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1648                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1649                             uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1650                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1651                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1652                             CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1653                             SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1654                         } else {
1655                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
1656                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1657                         }
1658
1659                         ListList[src->resultLen].baseCE = CE;
1660                         ListList[src->resultLen].baseContCE = SecondCE;
1661                         ListList[src->resultLen].nextCE = 0;
1662                         ListList[src->resultLen].nextContCE = 0;
1663
1664                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1665                     }
1666                 }
1667
1668
1669                 /*  5 If the relation is a reset:
1670                 If sourceToken is null
1671                 Create new list, create new sourceToken, make the baseCE from source, put
1672                 the sourceToken in ListHeader of the new list */
1673                 if(sourceToken == NULL) {
1674                     /*
1675                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1676                     First convert all expansions into normal form. Examples:
1677                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1678                     d * ... into &x * c/y * d * ...
1679                     Note: reset values can never have expansions, although they can cause the
1680                     very next item to have one. They may be contractions, if they are found
1681                     earlier in the list.
1682                     */
1683                     if(top == FALSE) {
1684                         collIterate s;
1685                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1686
1687                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
1688
1689                         CE = ucol_getNextCE(src->UCA, &s, status);
1690                         UChar *expand = s.pos;
1691                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
1692
1693                         ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1694                         if(isContinuation(SecondCE)) {
1695                             ListList[src->resultLen].baseContCE = SecondCE;
1696                         } else {
1697                             ListList[src->resultLen].baseContCE = 0;
1698                         }
1699                         ListList[src->resultLen].nextCE = 0;
1700                         ListList[src->resultLen].nextContCE = 0;
1701                         ListList[src->resultLen].previousCE = 0;
1702                         ListList[src->resultLen].previousContCE = 0;
1703                         ListList[src->resultLen].indirect = FALSE;
1704                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
1705                     } else { /* top == TRUE */
1706                         /* just use the supplied values */
1707                         top = FALSE;
1708                         ListList[src->resultLen].previousCE = 0;
1709                         ListList[src->resultLen].previousContCE = 0;
1710                         ListList[src->resultLen].indirect = TRUE;
1711                         ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1712                         ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
1713                         ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
1714                         ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
1715
1716                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1717
1718                     }
1719                 } else { /* reset to something already in rules */
1720                     top = FALSE;
1721                 }
1722             }
1723             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
1724             lastToken = sourceToken;
1725         } else {
1726             if(U_FAILURE(*status)) {
1727                 return 0;
1728             }
1729         }
1730     }
1731
1732     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1733         src->resultLen--;
1734     }
1735     return src->resultLen;
1736 }
1737
1738 void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
1739     U_NAMESPACE_USE
1740
1741     uint32_t nSize = 0;
1742     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
1743     if(U_FAILURE(*status)) {
1744         return;
1745     }
1746
1747     // set everything to zero, so that we can clean up gracefully
1748     uprv_memset(src, 0, sizeof(UColTokenParser));
1749
1750     // first we need to find options that don't like to be normalized,
1751     // like copy and remove...
1752     //const UChar *openBrace = rules;
1753     int32_t optionNumber = -1;
1754     const UChar *setStart;
1755     uint32_t i = 0;
1756     while(i < rulesLength) {
1757         if(rules[i] == 0x005B) {
1758             // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
1759             //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
1760             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
1761             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
1762                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1763                 if(U_SUCCESS(*status)) {
1764                     if(src->copySet == NULL) {
1765                         src->copySet = newSet;
1766                     } else {
1767                         uset_addAll(src->copySet, newSet);
1768                         uset_close(newSet);
1769                     }
1770                 } else {
1771                     return;
1772                 }
1773             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
1774                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1775                 if(U_SUCCESS(*status)) {
1776                     if(src->removeSet == NULL) {
1777                         src->removeSet = newSet;
1778                     } else {
1779                         uset_addAll(src->removeSet, newSet);
1780                         uset_close(newSet);
1781                     }
1782                 } else {
1783                     return;
1784                 }
1785             }
1786         }
1787         //openBrace++;
1788         i++;
1789     }
1790
1791     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
1792     /* test for NULL */
1793     if (src->source == NULL) {
1794         *status = U_MEMORY_ALLOCATION_ERROR;
1795         return;
1796     }
1797     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
1798     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
1799     if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
1800         *status = U_ZERO_ERROR;
1801         src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
1802         /* test for NULL */
1803         if (src->source == NULL) {
1804             *status = U_MEMORY_ALLOCATION_ERROR;
1805             return;
1806         }
1807         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
1808     }
1809     src->current = src->source;
1810     src->end = src->source+nSize;
1811     src->sourceCurrent = src->source;
1812     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
1813     src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1814     src->varTop = NULL;
1815     src->UCA = UCA;
1816     src->invUCA = ucol_initInverseUCA(status);
1817     src->parsedToken.charsLen = 0;
1818     src->parsedToken.charsOffset = 0;
1819     src->parsedToken.extensionLen = 0;
1820     src->parsedToken.extensionOffset = 0;
1821     src->parsedToken.prefixLen = 0;
1822     src->parsedToken.prefixOffset = 0;
1823     src->parsedToken.flags = 0;
1824     src->parsedToken.strength = UCOL_TOK_UNSET;
1825     src->buildCCTabFlag = FALSE;
1826
1827     if(U_FAILURE(*status)) {
1828         return;
1829     }
1830     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
1831     if(U_FAILURE(*status)) {
1832         return;
1833     }
1834     uhash_setValueDeleter(src->tailored, uhash_freeBlock);
1835
1836     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
1837     /* test for NULL */
1838     if (src->opts == NULL) {
1839         *status = U_MEMORY_ALLOCATION_ERROR;
1840         return;
1841     }
1842
1843     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
1844
1845     // rulesToParse = src->source;
1846     src->lh = 0;
1847     src->listCapacity = 1024;
1848     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
1849     //Test for NULL
1850     if (src->lh == NULL) {
1851         *status = U_MEMORY_ALLOCATION_ERROR;
1852         return;
1853     }
1854     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
1855     src->resultLen = 0;
1856
1857     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1858
1859     // UCOL_RESET_TOP_VALUE
1860     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1861     // UCOL_FIRST_PRIMARY_IGNORABLE
1862     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
1863     // UCOL_LAST_PRIMARY_IGNORABLE
1864     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
1865     // UCOL_FIRST_SECONDARY_IGNORABLE
1866     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
1867     // UCOL_LAST_SECONDARY_IGNORABLE
1868     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
1869     // UCOL_FIRST_TERTIARY_IGNORABLE
1870     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
1871     // UCOL_LAST_TERTIARY_IGNORABLE
1872     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
1873     // UCOL_FIRST_VARIABLE
1874     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
1875     // UCOL_LAST_VARIABLE
1876     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
1877     // UCOL_FIRST_NON_VARIABLE
1878     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
1879     // UCOL_LAST_NON_VARIABLE
1880     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1881     // UCOL_FIRST_IMPLICIT
1882     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
1883     // UCOL_LAST_IMPLICIT
1884     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
1885     // UCOL_FIRST_TRAILING
1886     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
1887     // UCOL_LAST_TRAILING
1888     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
1889     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
1890 }
1891
1892
1893 void ucol_tok_closeTokenList(UColTokenParser *src) {
1894     if(src->copySet != NULL) {
1895         uset_close(src->copySet);
1896     }
1897     if(src->removeSet != NULL) {
1898         uset_close(src->removeSet);
1899     }
1900     if(src->tailored != NULL) {
1901         uhash_close(src->tailored);
1902     }
1903     if(src->lh != NULL) {
1904         uprv_free(src->lh);
1905     }
1906     if(src->source != NULL) {
1907         uprv_free(src->source);
1908     }
1909     if(src->opts != NULL) {
1910         uprv_free(src->opts);
1911     }
1912 }
1913
1914 #endif /* #if !UCONFIG_NO_COLLATION */
1915