icuSources/i18n/ucol_tok.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2006, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_tok.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module reads a tailoring rule string and produces a list of
  17 * tokens that will be turned into collation elements
  18 *
  19 */
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_COLLATION
  24
  25 #include "unicode/ustring.h"
  26 #include "unicode/uchar.h"
  27 #include "unicode/uniset.h"
  28
  29 #include "ucol_tok.h"
  30 #include "cmemory.h"
  31 #include "util.h"
  32
  33 U_CDECL_BEGIN
  34 static int32_t U_CALLCONV
  35 uhash_hashTokens(const UHashTok k)
  36 {
  37     int32_t hash = 0;
  38     //uint32_t key = (uint32_t)k.integer;
  39     UColToken *key = (UColToken *)k.pointer;
  40     if (key != 0) {
  41         //int32_t len = (key & 0xFF000000)>>24;
  42         int32_t len = (key->source & 0xFF000000)>>24;
  43         int32_t inc = ((len - 32) / 32) + 1;
  44
  45         //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
  46         const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
  47         const UChar *limit = p + len;
  48
  49         while (p<limit) {
  50             hash = (hash * 37) + *p;
  51             p += inc;
  52         }
  53     }
  54     return hash;
  55 }
  56
  57 static UBool U_CALLCONV
  58 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
  59 {
  60     //uint32_t p1 = (uint32_t) key1.integer;
  61     //uint32_t p2 = (uint32_t) key2.integer;
  62     UColToken *p1 = (UColToken *)key1.pointer;
  63     UColToken *p2 = (UColToken *)key2.pointer;
  64     const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
  65     const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
  66     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
  67     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
  68     const UChar *end = s1+s1L-1;
  69
  70     if (p1 == p2) {
  71         return TRUE;
  72     }
  73     if (p1->source == 0 || p2->source == 0) {
  74         return FALSE;
  75     }
  76     if(s1L != s2L) {
  77       return FALSE;
  78     }
  79     if(p1->source == p2->source) {
  80       return TRUE;
  81     }
  82     while((s1 < end) && *s1 == *s2) {
  83       ++s1;
  84       ++s2;
  85     }
  86     if(*s1 == *s2) {
  87       return TRUE;
  88     } else {
  89       return FALSE;
  90     }
  91 }
  92 U_CDECL_END
  93
  94 static inline void U_CALLCONV
  95 uhash_freeBlockWrapper(void *obj) {
  96   uhash_freeBlock(obj);
  97 }
  98
  99
 100 typedef struct {
 101   uint32_t startCE;
 102   uint32_t startContCE;
 103   uint32_t limitCE;
 104   uint32_t limitContCE;
 105 } indirectBoundaries;
 106
/* These values are used for finding CE values for indirect positioning.  */
/* Indirect positioning is a mechanism for allowing resets on symbolic    */
/* values. It only works for resets, and indirect names cannot be         */
/* tailored. An indirect name can define either an anchor point or a      */
/* range. An anchor point behaves in exactly the same way as a code point */
/* in a reset would, except that it cannot be tailored. A range (we       */
/* currently only know of the [top] range) explicitly sets the upper      */
/* bound for generated CEs, thus allowing better control over how many    */
/* CEs can be squeezed into the range without a performance penalty.      */
/* In that respect, we use [top] for tailoring of locales that use CJK    */
/* characters. Other indirect values are currently a pure convenience:    */
/* they can be used to ensure that the CEs will always be positioned in   */
/* the same place relative to a point with known properties (e.g. first   */
/* primary ignorable).                                                    */
/* Indexed by parsedToken.indirectIndex: slot 0 is used for [top]; the     */
/* [first ...] / [last ...] options compute slots 1..14 (seven sub-options */
/* times first/last), hence 15 entries.                                    */
static indirectBoundaries ucolIndirectBoundaries[15];
 122 /*
 123 static indirectBoundaries ucolIndirectBoundaries[11] = {
 124   { UCOL_RESET_TOP_VALUE,               0,
 125     UCOL_NEXT_TOP_VALUE,                0 },
 126   { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
 127     0,                                  0 },
 128   { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
 129     0,                                  0 },
 130   { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
 131     0,                                  0 },
 132   { UCOL_LAST_SECONDARY_IGNORABLE,      0,
 133     0,                                  0 },
 134   { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
 135     0,                                  0 },
 136   { UCOL_LAST_TERTIARY_IGNORABLE,       0,
 137     0,                                  0 },
 138   { UCOL_FIRST_VARIABLE,                0,
 139     0,                                  0 },
 140   { UCOL_LAST_VARIABLE,                 0,
 141     0,                                  0 },
 142   { UCOL_FIRST_NON_VARIABLE,            0,
 143     0,                                  0 },
 144   { UCOL_LAST_NON_VARIABLE,             0,
 145     0,                                  0 },
 146 };
 147 */
 148
 149 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
 150
 151   // Set values for the top - TODO: once we have values for all the indirects, we are going
 152   // to initalize here.
 153   ucolIndirectBoundaries[indexR].startCE = start[0];
 154   ucolIndirectBoundaries[indexR].startContCE = start[1];
 155   if(end) {
 156     ucolIndirectBoundaries[indexR].limitCE = end[0];
 157     ucolIndirectBoundaries[indexR].limitContCE = end[1];
 158   } else {
 159     ucolIndirectBoundaries[indexR].limitCE = 0;
 160     ucolIndirectBoundaries[indexR].limitContCE = 0;
 161   }
 162 }
 163
 164
 165 static inline
 166 void syntaxError(const UChar* rules,
 167                  int32_t pos,
 168                  int32_t rulesLen,
 169                  UParseError* parseError) {
 170     parseError->offset = pos;
 171     parseError->line = 0 ; /* we are not using line numbers */
 172
 173     // for pre-context
 174     int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
 175     int32_t stop  = pos;
 176
 177     u_memcpy(parseError->preContext,rules+start,stop-start);
 178     //null terminate the buffer
 179     parseError->preContext[stop-start] = 0;
 180
 181     //for post-context
 182     start = pos+1;
 183     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
 184                                                             rulesLen;
 185
 186     if(start < stop) {
 187       u_memcpy(parseError->postContext,rules+start,stop-start);
 188       //null terminate the buffer
 189       parseError->postContext[stop-start]= 0;
 190     } else {
 191       parseError->postContext[0] = 0;
 192     }
 193 }
 194
 195 static
 196 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
 197   switch(attrib) {
 198   case UCOL_HIRAGANA_QUATERNARY_MODE:
 199     opts->hiraganaQ = value;
 200     break;
 201   case UCOL_FRENCH_COLLATION:
 202     opts->frenchCollation = value;
 203     break;
 204   case UCOL_ALTERNATE_HANDLING:
 205     opts->alternateHandling = value;
 206     break;
 207   case UCOL_CASE_FIRST:
 208     opts->caseFirst = value;
 209     break;
 210   case UCOL_CASE_LEVEL:
 211     opts->caseLevel = value;
 212     break;
 213   case UCOL_NORMALIZATION_MODE:
 214     opts->normalizationMode = value;
 215     break;
 216   case UCOL_STRENGTH:
 217     opts->strength = value;
 218     break;
 219   case UCOL_NUMERIC_COLLATION:
 220    opts->numericCollation = value;
 221    break;
 222   case UCOL_ATTRIBUTE_COUNT:
 223   default:
 224     break;
 225   }
 226 }
 227
/* Number of entries in the rulesOptions table (option_00 .. option_19). */
#define UTOK_OPTION_COUNT 20

/* Set to TRUE by ucol_uprv_tok_initData() once the U_STRING_INIT calls */
/* for the strings declared below have run.                             */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */

/* Sub-option (argument) names.  The third macro argument is the string */
/* length; the same lengths are repeated in the *Sub tables.            */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted",        7);

U_STRING_DECL(suboption_02, "lower",          5);
U_STRING_DECL(suboption_03, "upper",          5);
U_STRING_DECL(suboption_04, "off",            3);
U_STRING_DECL(suboption_05, "on",             2);
U_STRING_DECL(suboption_06, "1",              1);
U_STRING_DECL(suboption_07, "2",              1);
U_STRING_DECL(suboption_08, "3",              1);
U_STRING_DECL(suboption_09, "4",              1);
U_STRING_DECL(suboption_10, "I",              1);

U_STRING_DECL(suboption_11, "primary",        7);
U_STRING_DECL(suboption_12, "secondary",      9);
U_STRING_DECL(suboption_13, "tertiary",       8);
U_STRING_DECL(suboption_14, "variable",       8);
U_STRING_DECL(suboption_15, "regular",        7);
U_STRING_DECL(suboption_16, "implicit",       8);
U_STRING_DECL(suboption_17, "trailing",       8);


/* Option names.  Every declaration here needs a matching U_STRING_INIT */
/* in ucol_uprv_tok_initData().                                         */
U_STRING_DECL(option_00,    "undefined",      9);
U_STRING_DECL(option_01,    "rearrange",      9);
U_STRING_DECL(option_02,    "alternate",      9);
U_STRING_DECL(option_03,    "backwards",      9);
U_STRING_DECL(option_04,    "variable top",  12);
U_STRING_DECL(option_05,    "top",            3);
U_STRING_DECL(option_06,    "normalization", 13);
U_STRING_DECL(option_07,    "caseLevel",      9);
U_STRING_DECL(option_08,    "caseFirst",      9);
U_STRING_DECL(option_09,    "scriptOrder",   11);
U_STRING_DECL(option_10,    "charsetname",   11);
U_STRING_DECL(option_11,    "charset",        7);
U_STRING_DECL(option_12,    "before",         6);
U_STRING_DECL(option_13,    "hiraganaQ",      9);
U_STRING_DECL(option_14,    "strength",       8);
U_STRING_DECL(option_15,    "first",          5);
U_STRING_DECL(option_16,    "last",           4);
U_STRING_DECL(option_17,    "optimize",       8);
U_STRING_DECL(option_18,    "suppressContractions",         20);
U_STRING_DECL(option_19,    "numericOrdering",              15);
 276
 277
 278 /*
 279 [last variable] last variable value
 280 [last primary ignorable] largest CE for primary ignorable
 281 [last secondary ignorable] largest CE for secondary ignorable
 282 [last tertiary ignorable] largest CE for tertiary ignorable
 283 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
 284 */
 285
 286
/* Sub-option tables.  Each entry is {name, name length, attribute value}; */
/* the parser matches the name case-insensitively over exactly that length. */

/* arguments of [alternate ...] */
static const ucolTokSuboption alternateSub[2] = {
  {suboption_00, 13, UCOL_NON_IGNORABLE},
  {suboption_01,  7, UCOL_SHIFTED}
};

/* arguments of [caseFirst ...] */
static const ucolTokSuboption caseFirstSub[3] = {
  {suboption_02, 5, UCOL_LOWER_FIRST},
  {suboption_03,  5, UCOL_UPPER_FIRST},
  {suboption_04,  3, UCOL_OFF},
};

/* arguments of the on/off options */
static const ucolTokSuboption onOffSub[2] = {
  {suboption_04, 3, UCOL_OFF},
  {suboption_05, 2, UCOL_ON}
};

/* [backwards 2] is the only accepted form; it switches French collation on */
static const ucolTokSuboption frenchSub[1] = {
  {suboption_07, 1, UCOL_ON}
};

/* arguments of [before ...]; the parser ORs (attribute value + 1) into its result */
static const ucolTokSuboption beforeSub[3] = {
  {suboption_06, 1, UCOL_PRIMARY},
  {suboption_07, 1, UCOL_SECONDARY},
  {suboption_08, 1, UCOL_TERTIARY}
};

/* arguments of [strength ...] */
static const ucolTokSuboption strengthSub[5] = {
  {suboption_06, 1, UCOL_PRIMARY},
  {suboption_07, 1, UCOL_SECONDARY},
  {suboption_08, 1, UCOL_TERTIARY},
  {suboption_09, 1, UCOL_QUATERNARY},
  {suboption_10, 1, UCOL_IDENTICAL},
};

/* arguments of [first ...] and [last ...].  The [first]/[last] handling in */
/* ucol_uprv_tok_readAndSetOption() uses only the position of the match in  */
/* this table (to compute indirectIndex), not the attribute value.          */
static const ucolTokSuboption firstLastSub[7] = {
  {suboption_11, 7, UCOL_PRIMARY},
  {suboption_12, 9, UCOL_PRIMARY},
  {suboption_13, 8, UCOL_PRIMARY},
  {suboption_14, 8, UCOL_PRIMARY},
  {suboption_15, 7, UCOL_PRIMARY},
  {suboption_16, 8, UCOL_PRIMARY},
  {suboption_17, 8, UCOL_PRIMARY},
};
 330
 331 enum OptionNumber {
 332   OPTION_ALTERNATE_HANDLING = 0,
 333     OPTION_FRENCH_COLLATION,
 334     OPTION_CASE_LEVEL,
 335     OPTION_CASE_FIRST,
 336     OPTION_NORMALIZATION_MODE,
 337     OPTION_HIRAGANA_QUATERNARY,
 338     OPTION_STRENGTH,
 339     OPTION_NUMERIC_COLLATION,
 340     OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
 341     OPTION_VARIABLE_TOP,
 342     OPTION_REARRANGE,
 343     OPTION_BEFORE,
 344     OPTION_TOP,
 345     OPTION_FIRST,
 346     OPTION_LAST,
 347     OPTION_OPTIMIZE,
 348     OPTION_SUPPRESS_CONTRACTIONS,
 349     OPTION_UNDEFINED,
 350     OPTION_SCRIPT_ORDER,
 351     OPTION_CHARSET_NAME,
 352     OPTION_CHARSET
 353 } ;
 354
/* Table of recognized rule options.  Each entry is                          */
/* {name, name length, sub-option table, sub-option count, attribute}.       */
/* The position of an entry is its OptionNumber, so the order here must      */
/* match that enum.  Names are matched as prefixes of the input in table     */
/* order, so a name that begins with another name ("charsetname" versus      */
/* "charset") has to come first.                                             */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
 /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
 /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
 /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
 /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
 /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
 /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
 /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
 /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
 /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
 /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
 /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
 /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
};
 377
 378 static
 379 int32_t u_strncmpNoCase(const UChar     *s1,
 380      const UChar     *s2,
 381      int32_t     n)
 382 {
 383     if(n > 0) {
 384         int32_t rc;
 385         for(;;) {
 386             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
 387             if(rc != 0 || *s1 == 0 || --n == 0) {
 388                 return rc;
 389             }
 390             ++s1;
 391             ++s2;
 392         }
 393     }
 394     return 0;
 395 }
 396
 397 static
 398 void ucol_uprv_tok_initData() {
 399   if(!didInit) {
 400     U_STRING_INIT(suboption_00, "non-ignorable", 13);
 401     U_STRING_INIT(suboption_01, "shifted",        7);
 402
 403     U_STRING_INIT(suboption_02, "lower",          5);
 404     U_STRING_INIT(suboption_03, "upper",          5);
 405     U_STRING_INIT(suboption_04, "off",            3);
 406     U_STRING_INIT(suboption_05, "on",             2);
 407
 408     U_STRING_INIT(suboption_06, "1",              1);
 409     U_STRING_INIT(suboption_07, "2",              1);
 410     U_STRING_INIT(suboption_08, "3",              1);
 411     U_STRING_INIT(suboption_09, "4",              1);
 412     U_STRING_INIT(suboption_10, "I",              1);
 413
 414     U_STRING_INIT(suboption_11, "primary",        7);
 415     U_STRING_INIT(suboption_12, "secondary",      9);
 416     U_STRING_INIT(suboption_13, "tertiary",       8);
 417     U_STRING_INIT(suboption_14, "variable",       8);
 418     U_STRING_INIT(suboption_15, "regular",        7);
 419     U_STRING_INIT(suboption_16, "implicit",       8);
 420     U_STRING_INIT(suboption_17, "trailing",       8);
 421
 422
 423     U_STRING_INIT(option_00, "undefined",      9);
 424     U_STRING_INIT(option_01, "rearrange",      9);
 425     U_STRING_INIT(option_02, "alternate",      9);
 426     U_STRING_INIT(option_03, "backwards",      9);
 427     U_STRING_INIT(option_04, "variable top",  12);
 428     U_STRING_INIT(option_05, "top",            3);
 429     U_STRING_INIT(option_06, "normalization", 13);
 430     U_STRING_INIT(option_07, "caseLevel",      9);
 431     U_STRING_INIT(option_08, "caseFirst",      9);
 432     U_STRING_INIT(option_09, "scriptOrder",   11);
 433     U_STRING_INIT(option_10, "charsetname",   11);
 434     U_STRING_INIT(option_11, "charset",        7);
 435     U_STRING_INIT(option_12, "before",         6);
 436     U_STRING_INIT(option_13, "hiraganaQ",      9);
 437     U_STRING_INIT(option_14, "strength",       8);
 438     U_STRING_INIT(option_15, "first",          5);
 439     U_STRING_INIT(option_16, "last",           4);
 440     U_STRING_INIT(option_17, "optimize",       8);
 441     U_STRING_INIT(option_18, "suppressContractions",         20);
 442     U_STRING_INIT(option_19, "numericOrdering",      15);
 443     didInit = TRUE;
 444   }
 445 }
 446
 447
 448 // This function reads basic options to set in the runtime collator
 449 // used by data driven tests. Should not support build time options
 450 U_CAPI const UChar * U_EXPORT2
 451 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
 452                                UColAttribute *attrib, UColAttributeValue *value,
 453                                UErrorCode *status) {
 454   uint32_t i = 0;
 455   int32_t j=0;
 456   UBool foundOption = FALSE;
 457   const UChar *optionArg = NULL;
 458
 459   ucol_uprv_tok_initData();
 460
 461   while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
 462     start++;
 463   }
 464   if(start >= end) {
 465     return NULL;
 466   }
 467   /* skip opening '[' */
 468   if(*start == 0x005b) {
 469     start++;
 470   } else {
 471     *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
 472     return NULL;
 473   }
 474
 475   while(i < UTOK_OPTION_COUNT) {
 476     if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
 477       foundOption = TRUE;
 478       if(end - start > rulesOptions[i].optionLen) {
 479         optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
 480         while(u_isWhitespace(*optionArg)) { /* eat whitespace */
 481           optionArg++;
 482         }
 483       }
 484       break;
 485     }
 486     i++;
 487   }
 488
 489   if(!foundOption) {
 490     *status = U_ILLEGAL_ARGUMENT_ERROR;
 491     return NULL;
 492   }
 493
 494   if(optionArg) {
 495     for(j = 0; j<rulesOptions[i].subSize; j++) {
 496       if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 497         //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
 498         *attrib = rulesOptions[i].attr;
 499         *value = rulesOptions[i].subopts[j].attrVal;
 500         optionArg += rulesOptions[i].subopts[j].subLen;
 501         while(u_isWhitespace(*optionArg)) { /* eat whitespace */
 502           optionArg++;
 503         }
 504         if(*optionArg == 0x005d) {
 505           optionArg++;
 506           return optionArg;
 507         } else {
 508           *status = U_ILLEGAL_ARGUMENT_ERROR;
 509           return NULL;
 510         }
 511       }
 512     }
 513   }
 514   *status = U_ILLEGAL_ARGUMENT_ERROR;
 515   return NULL;
 516 }
 517
 518 static
 519 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
 520   while(*start != 0x005b) { /* advance while we find the first '[' */
 521     start++;
 522   }
 523   // now we need to get a balanced set of '[]'. The problem is that a set can have
 524   // many, and *end point to the first closing '['
 525   int32_t noOpenBraces = 1;
 526   int32_t current = 1; // skip the opening brace
 527   while(start+current < end && noOpenBraces != 0) {
 528     if(start[current] == 0x005b) {
 529       noOpenBraces++;
 530     } else if(start[current] == 0x005D) { // closing brace
 531       noOpenBraces--;
 532     }
 533     current++;
 534   }
 535   UChar *nextBrace = NULL;
 536
 537   if(noOpenBraces != 0 || (nextBrace = u_strchr(start+current, 0x005d /*']'*/)) == NULL) {
 538     *status = U_ILLEGAL_ARGUMENT_ERROR;
 539     return NULL;
 540   }
 541   return uset_openPattern(start, current, status);
 542 }
 543
 544 static
 545 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
 546   int32_t i = 0;
 547   ucol_uprv_tok_initData();
 548
 549   while(u_isWhitespace(*start)) { /* eat whitespace */
 550     start++;
 551   }
 552   while(i < UTOK_OPTION_COUNT) {
 553     if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
 554       if(end - start > rulesOptions[i].optionLen) {
 555         *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
 556         while(u_isWhitespace(**optionArg)) { /* eat whitespace */
 557           (*optionArg)++;
 558         }
 559       }
 560       break;
 561     }
 562     i++;
 563   }
 564   if(i == UTOK_OPTION_COUNT) {
 565     i = -1; // didn't find an option
 566   }
 567   return i;
 568 }
 569
 570
 571 // reads and conforms to various options in rules
 572 // end is the position of the first closing ']'
 573 // However, some of the options take an UnicodeSet definition
 574 // which needs to duplicate the closing ']'
 575 // for example: '[copy [\uAC00-\uD7FF]]'
 576 // These options will move end to the second ']' and the
 577 // caller will set the current to it.
 578 static
 579 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
 580   const UChar* start = src->current;
 581   int32_t i = 0;
 582   int32_t j=0;
 583   const UChar *optionArg = NULL;
 584
 585   uint8_t result = 0;
 586
 587   start++; /*skip opening '['*/
 588   i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
 589   if(optionArg) {
 590     src->current = optionArg;
 591   }
 592
 593   if(i < 0) {
 594     *status = U_ILLEGAL_ARGUMENT_ERROR;
 595   } else {
 596     int32_t noOpenBraces = 1;
 597     switch(i) {
 598     case OPTION_ALTERNATE_HANDLING:
 599     case OPTION_FRENCH_COLLATION:
 600     case OPTION_CASE_LEVEL:
 601     case OPTION_CASE_FIRST:
 602     case OPTION_NORMALIZATION_MODE:
 603     case OPTION_HIRAGANA_QUATERNARY:
 604     case OPTION_STRENGTH:
 605     case OPTION_NUMERIC_COLLATION:
 606       if(optionArg) {
 607         for(j = 0; j<rulesOptions[i].subSize; j++) {
 608           if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 609             ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
 610             result =  UCOL_TOK_SUCCESS;
 611           }
 612         }
 613       }
 614       if(result == 0) {
 615         *status = U_ILLEGAL_ARGUMENT_ERROR;
 616       }
 617       break;
 618     case OPTION_VARIABLE_TOP:
 619       result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
 620       break;
 621     case OPTION_REARRANGE:
 622       result = UCOL_TOK_SUCCESS;
 623       break;
 624     case OPTION_BEFORE:
 625       if(optionArg) {
 626         for(j = 0; j<rulesOptions[i].subSize; j++) {
 627           if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 628           result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
 629           }
 630         }
 631       }
 632       if(result == 0) {
 633         *status = U_ILLEGAL_ARGUMENT_ERROR;
 634       }
 635       break;
 636     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
 637       /* index to this array will be src->parsedToken.indirectIndex*/
 638       src->parsedToken.indirectIndex = 0;
 639       result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
 640       break;
 641     case OPTION_FIRST:
 642     case OPTION_LAST: /* first, last */
 643       for(j = 0; j<rulesOptions[i].subSize; j++) {
 644         if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 645           // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
 646           // element of indirect boundaries is reserved for top.
 647           src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
 648           result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
 649         }
 650       }
 651       if(result == 0) {
 652         *status = U_ILLEGAL_ARGUMENT_ERROR;
 653       }
 654       break;
 655     case OPTION_OPTIMIZE:
 656     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
 657       // we need to move end here
 658       src->current++; // skip opening brace
 659       while(src->current < src->end && noOpenBraces != 0) {
 660         if(*src->current == 0x005b) {
 661           noOpenBraces++;
 662         } else if(*src->current == 0x005D) { // closing brace
 663           noOpenBraces--;
 664         }
 665         src->current++;
 666       }
 667       result = UCOL_TOK_SUCCESS;
 668       break;
 669     default:
 670       *status = U_UNSUPPORTED_ERROR;
 671       break;
 672     }
 673   }
 674   src->current = u_memchr(src->current, 0x005d, src->end-src->current);
 675   return result;
 676 }
 677
 678
 679 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
 680       if(src->extraCurrent+len >= src->extraEnd) {
 681         /* reallocate */
 682         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
 683         if(newSrc != NULL) {
 684           src->current = newSrc + (src->current - src->source);
 685           src->extraCurrent = newSrc + (src->extraCurrent - src->source);
 686           src->end = newSrc + (src->end - src->source);
 687           src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
 688           src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
 689           src->source = newSrc;
 690         } else {
 691           *status = U_MEMORY_ALLOCATION_ERROR;
 692         }
 693       }
 694       if(len == 1) {
 695           *src->extraCurrent++ = *stuff;
 696       } else {
 697         uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
 698         src->extraCurrent += len;
 699       }
 700
 701
 702 }
 703
 704 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
 705   /*
 706   top = TRUE;
 707   */
 708   UChar buff[5];
 709   src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 710   buff[0] = 0xFFFE;
 711   buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
 712   buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
 713   if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
 714     src->parsedToken.charsLen = 3;
 715     ucol_tok_addToExtraCurrent(src, buff, 3, status);
 716   } else {
 717     buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
 718     buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
 719     src->parsedToken.charsLen = 5;
 720     ucol_tok_addToExtraCurrent(src, buff, 5, status);
 721   }
 722   return TRUE;
 723 }
 724
 725 static UBool isCharNewLine(UChar c){
 726     switch(c){
 727     case 0x000A: /* LF  */
 728     case 0x000D: /* CR  */
 729     case 0x000C: /* FF  */
 730     case 0x0085: /* NEL */
 731     case 0x2028: /* LS  */
 732     case 0x2029: /* PS  */
 733         return TRUE;
 734     default:
 735         return FALSE;
 736     }
 737 }
 738
 739 U_CAPI const UChar* U_EXPORT2
 740 ucol_tok_parseNextToken(UColTokenParser *src,
 741                         UBool startOfRules,
 742                         UParseError *parseError,
 743                         UErrorCode *status) {
 744 /* parsing part */
 745   UBool variableTop = FALSE;
 746   UBool top = FALSE;
 747   UBool inChars = TRUE;
 748   UBool inQuote = FALSE;
 749   UBool wasInQuote = FALSE;
 750   UChar *optionEnd = NULL;
 751   uint8_t before = 0;
 752   UBool isEscaped = FALSE;
 753   // TODO: replace these variables with src->parsedToken counterparts
 754   // no need to use them anymore since we have src->parsedToken.
 755   // Ideally, token parser would be a nice class... Once, when I have
 756   // more time (around 2020 probably).
 757   uint32_t newExtensionLen = 0;
 758   uint32_t extensionOffset = 0;
 759   uint32_t newStrength = UCOL_TOK_UNSET;
 760   UChar buff[10];
 761
 762   src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
 763   src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
 764   src->parsedToken.indirectIndex = 0;
 765
 766   while (src->current < src->end) {
 767     UChar ch = *(src->current);
 768
 769     if (inQuote) {
 770       if (ch == 0x0027/*'\''*/) {
 771           inQuote = FALSE;
 772       } else {
 773         if ((src->parsedToken.charsLen == 0) || inChars) {
 774           if(src->parsedToken.charsLen == 0) {
 775             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 776           }
 777           src->parsedToken.charsLen++;
 778         } else {
 779           if(newExtensionLen == 0) {
 780             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
 781           }
 782           newExtensionLen++;
 783         }
 784       }
 785     }else if(isEscaped){
 786       isEscaped =FALSE;
 787       if (newStrength == UCOL_TOK_UNSET) {
 788         *status = U_INVALID_FORMAT_ERROR;
 789         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 790         return NULL;
 791         // enabling rules to start with non-tokens a < b
 792         // newStrength = UCOL_TOK_RESET;
 793       }
 794       if(ch != 0x0000  && src->current != src->end) {
 795           if (inChars) {
 796             if(src->parsedToken.charsLen == 0) {
 797               src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
 798             }
 799             src->parsedToken.charsLen++;
 800           } else {
 801             if(newExtensionLen == 0) {
 802               extensionOffset = (uint32_t)(src->current - src->source);
 803             }
 804             newExtensionLen++;
 805           }
 806       }
 807     }else {
 808       if(!uprv_isRuleWhiteSpace(ch)) {
 809         /* Sets the strength for this entry */
 810         switch (ch) {
 811           case 0x003D/*'='*/ :
 812             if (newStrength != UCOL_TOK_UNSET) {
 813               goto EndOfLoop;
 814             }
 815
 816             /* if we start with strength, we'll reset to top */
 817             if(startOfRules == TRUE) {
 818               src->parsedToken.indirectIndex = 5;
 819               top = ucol_tok_doSetTop(src, status);
 820               newStrength = UCOL_TOK_RESET;
 821               goto EndOfLoop;
 822             }
 823             newStrength = UCOL_IDENTICAL;
 824             break;
 825
 826           case 0x002C/*','*/:
 827             if (newStrength != UCOL_TOK_UNSET) {
 828               goto EndOfLoop;
 829             }
 830
 831             /* if we start with strength, we'll reset to top */
 832             if(startOfRules == TRUE) {
 833               src->parsedToken.indirectIndex = 5;
 834               top = ucol_tok_doSetTop(src, status);
 835               newStrength = UCOL_TOK_RESET;
 836               goto EndOfLoop;
 837             }
 838             newStrength = UCOL_TERTIARY;
 839             break;
 840
 841           case  0x003B/*';'*/:
 842             if (newStrength != UCOL_TOK_UNSET) {
 843               goto EndOfLoop;
 844             }
 845
 846             /* if we start with strength, we'll reset to top */
 847             if(startOfRules == TRUE) {
 848               src->parsedToken.indirectIndex = 5;
 849               top = ucol_tok_doSetTop(src, status);
 850               newStrength = UCOL_TOK_RESET;
 851               goto EndOfLoop;
 852             }
 853             newStrength = UCOL_SECONDARY;
 854             break;
 855
 856           case 0x003C/*'<'*/:
 857             if (newStrength != UCOL_TOK_UNSET) {
 858               goto EndOfLoop;
 859             }
 860
 861             /* if we start with strength, we'll reset to top */
 862             if(startOfRules == TRUE) {
 863               src->parsedToken.indirectIndex = 5;
 864               top = ucol_tok_doSetTop(src, status);
 865               newStrength = UCOL_TOK_RESET;
 866               goto EndOfLoop;
 867             }
 868             /* before this, do a scan to verify whether this is */
 869             /* another strength */
 870             if(*(src->current+1) == 0x003C) {
 871               src->current++;
 872               if(*(src->current+1) == 0x003C) {
 873                 src->current++; /* three in a row! */
 874                 newStrength = UCOL_TERTIARY;
 875               } else { /* two in a row */
 876                 newStrength = UCOL_SECONDARY;
 877               }
 878             } else { /* just one */
 879               newStrength = UCOL_PRIMARY;
 880             }
 881             break;
 882
 883           case 0x0026/*'&'*/:
 884             if (newStrength != UCOL_TOK_UNSET) {
 885               /**/
 886               goto EndOfLoop;
 887             }
 888
 889             newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
 890             break;
 891
 892           case 0x005b/*'['*/:
 893             /* options - read an option, analyze it */
 894             if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
 895               uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
 896               //src->current = optionEnd;
 897               if(U_SUCCESS(*status)) {
 898                 if(result & UCOL_TOK_TOP) {
 899                   if(newStrength == UCOL_TOK_RESET) {
 900                     top = ucol_tok_doSetTop(src, status);
 901                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
 902                       src->parsedToken.charsLen+=2;
 903                       buff[0] = 0x002d;
 904                       buff[1] = before;
 905                       ucol_tok_addToExtraCurrent(src, buff, 2, status);
 906                     }
 907
 908                     src->current++;
 909                     goto EndOfLoop;
 910                   } else {
 911                     *status = U_INVALID_FORMAT_ERROR;
 912                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 913                   }
 914                 } else if(result & UCOL_TOK_VARIABLE_TOP) {
 915                   if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
 916                     variableTop = TRUE;
 917                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 918                     src->parsedToken.charsLen = 1;
 919                     buff[0] = 0xFFFF;
 920                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
 921                     src->current++;
 922                     goto EndOfLoop;
 923                   } else {
 924                     *status = U_INVALID_FORMAT_ERROR;
 925                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 926                   }
 927                 } else if (result & UCOL_TOK_BEFORE){
 928                   if(newStrength == UCOL_TOK_RESET) {
 929                     before = result & UCOL_TOK_BEFORE;
 930                   } else {
 931                     *status = U_INVALID_FORMAT_ERROR;
 932                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 933
 934                   }
 935                 }
 936               } else {
 937                 *status = U_INVALID_FORMAT_ERROR;
 938                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 939                 return NULL;
 940               }
 941             }
 942             break;
 943           case 0x0021/*! skip java thai modifier reordering*/:
 944               break;
 945           case 0x002F/*'/'*/:
 946             wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
 947             inChars = FALSE; /* we're now processing expansion */
 948             break;
 949           case 0x005C /* back slash for escaped chars */:
 950               isEscaped = TRUE;
 951               break;
 952           /* found a quote, we're gonna start copying */
 953           case 0x0027/*'\''*/:
 954             if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
 955               *status = U_INVALID_FORMAT_ERROR;
 956               syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
 957               return NULL;
 958               // enabling rules to start with a non-token character a < b
 959               // newStrength = UCOL_TOK_RESET;
 960             }
 961
 962             inQuote = TRUE;
 963
 964             if(inChars) { /* we're doing characters */
 965               if(wasInQuote == FALSE) {
 966                 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 967               }
 968               if (src->parsedToken.charsLen != 0) {
 969                   ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
 970               }
 971               src->parsedToken.charsLen++;
 972             } else { /* we're doing an expansion */
 973               if(wasInQuote == FALSE) {
 974                 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
 975               }
 976               if (newExtensionLen != 0) {
 977                 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
 978               }
 979               newExtensionLen++;
 980             }
 981
 982             wasInQuote = TRUE;
 983
 984             ch = *(++(src->current));
 985             if(ch == 0x0027) { /* copy the double quote */
 986               ucol_tok_addToExtraCurrent(src, &ch, 1, status);
 987               inQuote = FALSE;
 988             }
 989             break;
 990
 991           /* '@' is french only if the strength is not currently set */
 992           /* if it is, it's just a regular character in collation rules */
 993           case 0x0040/*'@'*/:
 994             if (newStrength == UCOL_TOK_UNSET) {
 995               src->opts->frenchCollation = UCOL_ON;
 996               break;
 997             }
 998
 999           case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1000             // we want to store read characters to the prefix part and continue reading
1001             // the characters (proper way would be to restart reading the chars, but in
1002             // that case we would have to complicate the token hasher, which I do not
1003             // intend to play with. Instead, we will do prefixes when prefixes are due
1004             // (before adding the elements).
1005             src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1006             src->parsedToken.prefixLen = src->parsedToken.charsLen;
1007
1008             if(inChars) { /* we're doing characters */
1009               if(wasInQuote == FALSE) {
1010                 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1011               }
1012               if (src->parsedToken.charsLen != 0) {
1013                   ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1014               }
1015               src->parsedToken.charsLen++;
1016             }
1017
1018             wasInQuote = TRUE;
1019
1020             do {
1021               ch = *(++(src->current));
1022               // skip whitespace between '|' and the character
1023             } while (uprv_isRuleWhiteSpace(ch));
1024             break;
1025
1026             //charsOffset = 0;
1027             //newCharsLen = 0;
1028             //break; // We want to store the whole prefix/character sequence. If we break
1029                      // the '|' is going to get lost.
1030           case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1031             do {
1032                 ch = *(++(src->current));
1033             } while (!isCharNewLine(ch));
1034
1035             break;
1036           default:
1037             if (newStrength == UCOL_TOK_UNSET) {
1038               *status = U_INVALID_FORMAT_ERROR;
1039               syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1040               return NULL;
1041             }
1042
1043             if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1044               *status = U_INVALID_FORMAT_ERROR;
1045               syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1046               return NULL;
1047             }
1048
1049             if(ch == 0x0000 && src->current+1 == src->end) {
1050               break;
1051             }
1052
1053             if (inChars) {
1054               if(src->parsedToken.charsLen == 0) {
1055                 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1056               }
1057               src->parsedToken.charsLen++;
1058             } else {
1059               if(newExtensionLen == 0) {
1060                 extensionOffset = (uint32_t)(src->current - src->source);
1061               }
1062               newExtensionLen++;
1063             }
1064
1065             break;
1066           }
1067        }
1068     }
1069
1070     if(wasInQuote) {
1071       if(ch != 0x27) {
1072           if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1073             ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1074           }
1075       }
1076     }
1077
1078       src->current++;
1079     }
1080
1081  EndOfLoop:
1082     wasInQuote = FALSE;
1083   if (newStrength == UCOL_TOK_UNSET) {
1084     return NULL;
1085   }
1086
1087   if (src->parsedToken.charsLen == 0 && top == FALSE) {
1088     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1089     *status = U_INVALID_FORMAT_ERROR;
1090     return NULL;
1091   }
1092
1093   src->parsedToken.strength = newStrength;
1094   src->parsedToken.extensionOffset = extensionOffset;
1095   src->parsedToken.extensionLen = newExtensionLen;
1096   src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1097
1098   return src->current;
1099 }
1100
1101 /*
1102 Processing Description
1103   1 Build a ListList. Each list has a header, which contains two lists (positive
1104   and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1105   reset may be null.
1106   2 As you process, you keep a LAST pointer that points to the last token you
1107   handled.
1108 */
1109
1110 static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
1111                                       UParseError *parseError, UErrorCode *status) {
1112   if(src->resultLen == src->listCapacity) {
1113     // Unfortunately, this won't work, as we store addresses of lhs in token
1114     src->listCapacity *= 2;
1115     src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1116     if(src->lh == NULL) {
1117       *status = U_MEMORY_ALLOCATION_ERROR;
1118       return NULL;
1119     }
1120   }
1121   /* do the reset thing */
1122   UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1123   /* test for NULL */
1124   if (sourceToken == NULL) {
1125       *status = U_MEMORY_ALLOCATION_ERROR;
1126       return NULL;
1127   }
1128   sourceToken->rulesToParse = src->source;
1129   sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1130   sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1131
1132   sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1133   sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1134
1135   // keep the flags around so that we know about before
1136   sourceToken->flags = src->parsedToken.flags;
1137
1138   if(src->parsedToken.prefixOffset != 0) {
1139     // this is a syntax error
1140     *status = U_INVALID_FORMAT_ERROR;
1141     syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1142     return 0;
1143   } else {
1144     sourceToken->prefix = 0;
1145   }
1146
1147   sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1148   sourceToken->strength = UCOL_TOK_RESET;
1149   sourceToken->next = NULL;
1150   sourceToken->previous = NULL;
1151   sourceToken->noOfCEs = 0;
1152   sourceToken->noOfExpCEs = 0;
1153   sourceToken->listHeader = &src->lh[src->resultLen];
1154
1155   src->lh[src->resultLen].first = NULL;
1156   src->lh[src->resultLen].last = NULL;
1157   src->lh[src->resultLen].first = NULL;
1158   src->lh[src->resultLen].last = NULL;
1159
1160   src->lh[src->resultLen].reset = sourceToken;
1161
1162   /*
1163     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1164       First convert all expansions into normal form. Examples:
1165         If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1166         d * ... into &x * c/y * d * ...
1167         Note: reset values can never have expansions, although they can cause the
1168         very next item to have one. They may be contractions, if they are found
1169         earlier in the list.
1170   */
1171   if(expand != NULL) {
1172     /* check to see if there is an expansion */
1173     if(src->parsedToken.charsLen > 1) {
1174       uint32_t resetCharsOffset;
1175       resetCharsOffset = (uint32_t)(expand - src->source);
1176       sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1177       *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1178     } else {
1179       *expandNext = 0;
1180     }
1181   }
1182
1183   src->resultLen++;
1184
1185   uhash_put(src->tailored, sourceToken, sourceToken, status);
1186
1187   return sourceToken;
1188 }
1189
1190 static
1191 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1192   if(U_FAILURE(*status)) {
1193     return NULL;
1194   }
1195       /* this is a virgin before - we need to fish the anchor from the UCA */
1196   collIterate s;
1197   uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1198   uint32_t CE, SecondCE;
1199   uint32_t invPos;
1200   if(sourceToken != NULL) {
1201     uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
1202   } else {
1203     uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
1204   }
1205
1206   baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1207   baseContCE = ucol_getNextCE(src->UCA, &s, status);
1208   if(baseContCE == UCOL_NO_MORE_CES) {
1209     baseContCE = 0;
1210   }
1211
1212
1213   UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1214   uint32_t ch = 0;
1215   uint32_t expandNext = 0;
1216   UColToken key;
1217
1218   if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1219       uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1220       uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1221       ch = uprv_uca_getCodePointFromRaw(raw-1);
1222       uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1223       CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1224       SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1225
1226       src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1227       *src->extraCurrent++ = 0xFFFE;
1228       *src->extraCurrent++ = (UChar)ch;
1229       src->parsedToken.charsLen++;
1230
1231       key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1232       key.rulesToParse = src->source;
1233
1234       //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1235       sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1236
1237       if(sourceToken == NULL) {
1238           src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1239           if(isContinuation(SecondCE)) {
1240             src->lh[src->resultLen].baseContCE = SecondCE;
1241           } else {
1242             src->lh[src->resultLen].baseContCE = 0;
1243           }
1244           src->lh[src->resultLen].nextCE = 0;
1245           src->lh[src->resultLen].nextContCE = 0;
1246           src->lh[src->resultLen].previousCE = 0;
1247           src->lh[src->resultLen].previousContCE = 0;
1248
1249           src->lh[src->resultLen].indirect = FALSE;
1250
1251           sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1252       }
1253
1254   } else {
1255       invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1256
1257       // we got the previous CE. Now we need to see if the difference between
1258       // the two CEs is really of the requested strength.
1259       // if it's a bigger difference (we asked for secondary and got primary), we
1260       // need to modify the CE.
1261       if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1262           // adjust the strength
1263           // now we are in the situation where our baseCE should actually be modified in
1264           // order to get the CE in the right position.
1265           if(strength == UCOL_SECONDARY) {
1266               CE = baseCE - 0x0200;
1267           } else { // strength == UCOL_TERTIARY
1268               CE = baseCE - 0x02;
1269           }
1270           if(baseContCE) {
1271             if(strength == UCOL_SECONDARY) {
1272                 SecondCE = baseContCE - 0x0200;
1273             } else { // strength == UCOL_TERTIARY
1274                 SecondCE = baseContCE - 0x02;
1275             }
1276           }
1277       }
1278
1279 #if 0
1280       // the code below relies on getting a code point from the inverse table, in order to be
1281       // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1282       // 1. There are many code points that have the same CE
1283       // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1284       // Also, in case when there is no equivalent strength before an element, we have to actually
1285       // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1286       // before a is a primary difference.
1287
1288       //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1289
1290
1291       ch = CETable[3*invPos+2];
1292
1293       if((ch &  UCOL_INV_SIZEMASK) != 0) {
1294         uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1295         uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1296         ch = conts[offset];
1297       }
1298
1299       *src->extraCurrent++ = (UChar)ch;
1300       src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1301       src->parsedToken.charsLen = 1;
1302
1303       // We got an UCA before. However, this might have been tailored.
1304       // example:
1305       // &\u30ca = \u306a
1306       // &[before 3]\u306a<<<\u306a|\u309d
1307
1308
1309       // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1310       key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1311       key.rulesToParse = src->source;
1312
1313       //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1314       sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1315 #endif
1316
1317       // here is how it should be. The situation such as &[before 1]a < x, should be
1318       // resolved exactly as if we wrote &a > x.
1319       // therefore, I don't really care if the UCA value before a has been changed.
1320       // However, I do care if the strength between my element and the previous element
1321       // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1322       // have to construct the base CE.
1323
1324
1325
1326       // if we found a tailored thing, we have to use the UCA value and construct
1327       // a new reset token with constructed name
1328       //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1329         // character to which we want to anchor is already tailored.
1330         // We need to construct a new token which will be the anchor
1331         // point
1332         //*(src->extraCurrent-1) = 0xFFFE;
1333         //*src->extraCurrent++ = (UChar)ch;
1334         // grab before
1335         src->parsedToken.charsOffset -= 10;
1336         src->parsedToken.charsLen += 10;
1337         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1338         if(isContinuation(SecondCE)) {
1339           src->lh[src->resultLen].baseContCE = SecondCE;
1340         } else {
1341           src->lh[src->resultLen].baseContCE = 0;
1342         }
1343         src->lh[src->resultLen].nextCE = 0;
1344         src->lh[src->resultLen].nextContCE = 0;
1345         src->lh[src->resultLen].previousCE = 0;
1346         src->lh[src->resultLen].previousContCE = 0;
1347
1348         src->lh[src->resultLen].indirect = FALSE;
1349
1350         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1351       //}
1352   }
1353
1354   return sourceToken;
1355
1356 }
1357
1358 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1359   UColToken *lastToken = NULL;
1360   const UChar *parseEnd = NULL;
1361   uint32_t expandNext = 0;
1362   UBool variableTop = FALSE;
1363   UBool top = FALSE;
1364   uint16_t specs = 0;
1365   UColTokListHeader *ListList = NULL;
1366
1367   src->parsedToken.strength = UCOL_TOK_UNSET;
1368
1369   ListList = src->lh;
1370
1371   if(U_FAILURE(*status)) {
1372       return 0;
1373   }
1374
1375   while(src->current < src->end) {
1376     src->parsedToken.prefixOffset = 0;
1377
1378     parseEnd = ucol_tok_parseNextToken(src,
1379                         (UBool)(lastToken == NULL),
1380                         parseError,
1381                         status);
1382
1383     specs = src->parsedToken.flags;
1384
1385
1386     variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1387     top = ((specs & UCOL_TOK_TOP) != 0);
1388
1389     if(U_SUCCESS(*status) && parseEnd != NULL) {
1390       UColToken *sourceToken = NULL;
1391       //uint32_t key = 0;
1392       uint32_t lastStrength = UCOL_TOK_UNSET;
1393
1394       if(lastToken != NULL ) {
1395         lastStrength = lastToken->strength;
1396       }
1397
1398       //key = newCharsLen << 24 | charsOffset;
1399       UColToken key;
1400       key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1401       key.rulesToParse = src->source;
1402
1403       /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
1404       sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1405
1406       if(src->parsedToken.strength != UCOL_TOK_RESET) {
1407         if(lastToken == NULL) { /* this means that rules haven't started properly */
1408           *status = U_INVALID_FORMAT_ERROR;
1409           syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1410           return 0;
1411         }
1412       /*  6 Otherwise (when relation != reset) */
1413         if(sourceToken == NULL) {
1414           /* If sourceToken is null, create new one, */
1415           sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1416           /* test for NULL */
1417           if (sourceToken == NULL) {
1418               *status = U_MEMORY_ALLOCATION_ERROR;
1419               return 0;
1420           }
1421           sourceToken->rulesToParse = src->source;
1422           sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1423
1424           sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1425
1426           sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1427           sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1428
1429           sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1430           sourceToken->next = NULL;
1431           sourceToken->previous = NULL;
1432           sourceToken->noOfCEs = 0;
1433           sourceToken->noOfExpCEs = 0;
1434           // keep the flags around so that we know about before
1435           sourceToken->flags = src->parsedToken.flags;
1436           uhash_put(src->tailored, sourceToken, sourceToken, status);
1437         } else {
1438           /* we could have fished out a reset here */
1439           if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1440             /* otherwise remove sourceToken from where it was. */
1441             if(sourceToken->next != NULL) {
1442               if(sourceToken->next->strength > sourceToken->strength) {
1443                 sourceToken->next->strength = sourceToken->strength;
1444               }
1445               sourceToken->next->previous = sourceToken->previous;
1446             } else {
1447               sourceToken->listHeader->last = sourceToken->previous;
1448             }
1449
1450             if(sourceToken->previous != NULL) {
1451               sourceToken->previous->next = sourceToken->next;
1452             } else {
1453               sourceToken->listHeader->first = sourceToken->next;
1454             }
1455             sourceToken->next = NULL;
1456             sourceToken->previous = NULL;
1457           }
1458         }
1459
1460         sourceToken->strength = src->parsedToken.strength;
1461         sourceToken->listHeader = lastToken->listHeader;
1462
1463         /*
1464         1.  Find the strongest strength in each list, and set strongestP and strongestN
1465         accordingly in the headers.
1466         */
1467         if(lastStrength == UCOL_TOK_RESET
1468           || sourceToken->listHeader->first == 0) {
1469         /* If LAST is a reset
1470               insert sourceToken in the list. */
1471           if(sourceToken->listHeader->first == 0) {
1472             sourceToken->listHeader->first = sourceToken;
1473             sourceToken->listHeader->last = sourceToken;
1474           } else { /* we need to find a place for us */
1475             /* and we'll get in front of the same strength */
1476             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1477               sourceToken->next = sourceToken->listHeader->first;
1478               sourceToken->next->previous = sourceToken;
1479               sourceToken->listHeader->first = sourceToken;
1480               sourceToken->previous = NULL;
1481             } else {
1482               lastToken = sourceToken->listHeader->first;
1483               while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1484                 lastToken = lastToken->next;
1485               }
1486               if(lastToken->next != NULL) {
1487                 lastToken->next->previous = sourceToken;
1488               } else {
1489                 sourceToken->listHeader->last = sourceToken;
1490               }
1491               sourceToken->previous = lastToken;
1492               sourceToken->next = lastToken->next;
1493               lastToken->next = sourceToken;
1494             }
1495           }
1496         } else {
1497         /* Otherwise (when LAST is not a reset)
1498               if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1499               otherwise insert before.
1500               when inserting after or before, search to the next position with the same
1501               strength in that direction. (This is called postpone insertion).         */
1502           if(sourceToken != lastToken) {
1503             if(lastToken->polarity == sourceToken->polarity) {
1504               while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1505                 lastToken = lastToken->next;
1506               }
1507               sourceToken->previous = lastToken;
1508               if(lastToken->next != NULL) {
1509                 lastToken->next->previous = sourceToken;
1510               } else {
1511                 sourceToken->listHeader->last = sourceToken;
1512               }
1513
1514               sourceToken->next = lastToken->next;
1515               lastToken->next = sourceToken;
1516             } else {
1517               while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1518                 lastToken = lastToken->previous;
1519               }
1520               sourceToken->next = lastToken;
1521               if(lastToken->previous != NULL) {
1522                 lastToken->previous->next = sourceToken;
1523               } else {
1524                 sourceToken->listHeader->first = sourceToken;
1525               }
1526               sourceToken->previous = lastToken->previous;
1527               lastToken->previous = sourceToken;
1528             }
1529           } else { /* repeated one thing twice in rules, stay with the stronger strength */
1530             if(lastStrength < sourceToken->strength) {
1531               sourceToken->strength = lastStrength;
1532             }
1533           }
1534         }
1535
1536         /* if the token was a variable top, we're gonna put it in */
1537         if(variableTop == TRUE && src->varTop == NULL) {
1538           variableTop = FALSE;
1539           src->varTop = sourceToken;
1540         }
1541
1542        // Treat the expansions.
1543        // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1544        // (&abc * d * e <=> &ab * d / c * e / c)
1545        // if both of them are in effect for a token, they are combined.
1546
1547         sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1548
1549         if(expandNext != 0) {
1550           if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1551             expandNext = 0;
1552           } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1553             sourceToken->expansion = expandNext;
1554           } else { /* there is both explicit and implicit expansion. We need to make a combination */
1555             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1556             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1557             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1558             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1559           }
1560         }
1561
1562         // This is just for debugging purposes
1563         if(sourceToken->expansion != 0) {
1564           sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1565         } else {
1566           sourceToken->debugExpansion = 0;
1567         }
1568         // if the previous token was a reset before, the strength of this
1569         // token must match the strength of before. Otherwise we have an
1570         // undefined situation.
1571         // In other words, we currently have a cludge which we use to
1572         // represent &a >> x. This is written as &[before 2]a << x.
1573         if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1574             uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1575             if(beforeStrength != sourceToken->strength) {
1576                 *status = U_INVALID_FORMAT_ERROR;
1577                 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1578                 return 0;
1579             }
1580         }
1581       } else {
1582         if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1583           /* if the previous token was also a reset, */
1584           /*this means that we have two consecutive resets */
1585           /* and we want to remove the previous one if empty*/
1586           if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1587             src->resultLen--;
1588           }
1589         }
1590
1591         if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1592           uint32_t searchCharsLen = src->parsedToken.charsLen;
1593           while(searchCharsLen > 1 && sourceToken == NULL) {
1594             searchCharsLen--;
1595             //key = searchCharsLen << 24 | charsOffset;
1596             UColToken key;
1597             key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1598             key.rulesToParse = src->source;
1599             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1600           }
1601           if(sourceToken != NULL) {
1602             expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1603           }
1604         }
1605
1606         if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1607           if(top == FALSE) { /* there is no indirection */
1608             uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1609             if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1610               /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1611               while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1612                 sourceToken = sourceToken->previous;
1613               }
1614               /* here, either we hit the strength or NULL */
1615               if(sourceToken->strength == strength) {
1616                 if(sourceToken->previous != NULL) {
1617                   sourceToken = sourceToken->previous;
1618                 } else { /* start of list */
1619                   sourceToken = sourceToken->listHeader->reset;
1620                 }
1621               } else { /* we hit NULL */
1622                 /* we should be doing the else part */
1623                 sourceToken = sourceToken->listHeader->reset;
1624                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1625               }
1626             } else {
1627               sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1628             }
1629           } else { /* this is both before and indirection */
1630             top = FALSE;
1631             ListList[src->resultLen].previousCE = 0;
1632             ListList[src->resultLen].previousContCE = 0;
1633             ListList[src->resultLen].indirect = TRUE;
1634             /* we need to do slightly more work. we need to get the baseCE using the */
1635             /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
1636             /* in ucol_bld */
1637             uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1638             uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1639             uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
1640             uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1641
1642             UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1643             if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1644               uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1645               uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1646               uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1647               CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1648               SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1649             } else {
1650                 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
1651                 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1652             }
1653
1654             ListList[src->resultLen].baseCE = CE;
1655             ListList[src->resultLen].baseContCE = SecondCE;
1656             ListList[src->resultLen].nextCE = 0;
1657             ListList[src->resultLen].nextContCE = 0;
1658
1659             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1660           }
1661         }
1662
1663
1664       /*  5 If the relation is a reset:
1665           If sourceToken is null
1666             Create new list, create new sourceToken, make the baseCE from source, put
1667             the sourceToken in ListHeader of the new list */
1668         if(sourceToken == NULL) {
1669           /*
1670             3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1671               First convert all expansions into normal form. Examples:
1672                 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1673                 d * ... into &x * c/y * d * ...
1674                 Note: reset values can never have expansions, although they can cause the
1675                 very next item to have one. They may be contractions, if they are found
1676                 earlier in the list.
1677           */
1678           if(top == FALSE) {
1679             collIterate s;
1680             uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1681
1682             uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
1683
1684             CE = ucol_getNextCE(src->UCA, &s, status);
1685             UChar *expand = s.pos;
1686             SecondCE = ucol_getNextCE(src->UCA, &s, status);
1687
1688             ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1689             if(isContinuation(SecondCE)) {
1690               ListList[src->resultLen].baseContCE = SecondCE;
1691             } else {
1692               ListList[src->resultLen].baseContCE = 0;
1693             }
1694             ListList[src->resultLen].nextCE = 0;
1695             ListList[src->resultLen].nextContCE = 0;
1696             ListList[src->resultLen].previousCE = 0;
1697             ListList[src->resultLen].previousContCE = 0;
1698             ListList[src->resultLen].indirect = FALSE;
1699             sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
1700           } else { /* top == TRUE */
1701             /* just use the supplied values */
1702             top = FALSE;
1703             ListList[src->resultLen].previousCE = 0;
1704             ListList[src->resultLen].previousContCE = 0;
1705             ListList[src->resultLen].indirect = TRUE;
1706             ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1707             ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
1708             ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
1709             ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
1710
1711             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1712
1713           }
1714         } else { /* reset to something already in rules */
1715           top = FALSE;
1716         }
1717       }
1718       /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
1719       lastToken = sourceToken;
1720     } else {
1721       if(U_FAILURE(*status)) {
1722         return 0;
1723       }
1724     }
1725   }
1726
1727   if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1728     src->resultLen--;
1729   }
1730   return src->resultLen;
1731 }
1732
1733 void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
1734   uint32_t nSize = 0;
1735   uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
1736   if(U_FAILURE(*status)) {
1737     return;
1738   }
1739
1740   // set everything to zero, so that we can clean up gracefully
1741   uprv_memset(src, 0, sizeof(UColTokenParser));
1742
1743   // first we need to find options that don't like to be normalized,
1744   // like copy and remove...
1745   //const UChar *openBrace = rules;
1746   int32_t optionNumber = -1;
1747   const UChar *setStart;
1748   uint32_t i = 0;
1749   while(i < rulesLength) {
1750     if(rules[i] == 0x005B) {
1751       // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
1752       //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
1753       optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
1754       if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
1755         USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1756         if(U_SUCCESS(*status)) {
1757           if(src->copySet == NULL) {
1758             src->copySet = newSet;
1759           } else {
1760             ((UnicodeSet *)src->copySet)->addAll(*((UnicodeSet *)newSet));
1761             uset_close(newSet);
1762           }
1763         } else {
1764           return;
1765         }
1766       } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
1767         USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1768         if(U_SUCCESS(*status)) {
1769           if(src->removeSet == NULL) {
1770             src->removeSet = newSet;
1771           } else {
1772             ((UnicodeSet *)src->removeSet)->addAll(*((UnicodeSet *)newSet));
1773             uset_close(newSet);
1774           }
1775         } else {
1776           return;
1777         }
1778       }
1779     }
1780     //openBrace++;
1781     i++;
1782   }
1783
1784   src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
1785   /* test for NULL */
1786   if (src->source == NULL) {
1787       *status = U_MEMORY_ALLOCATION_ERROR;
1788       return;
1789   }
1790   uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
1791   nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
1792   if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
1793     *status = U_ZERO_ERROR;
1794     src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
1795     /* test for NULL */
1796     if (src->source == NULL) {
1797         *status = U_MEMORY_ALLOCATION_ERROR;
1798         return;
1799     }
1800     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
1801   }
1802   src->current = src->source;
1803   src->end = src->source+nSize;
1804   src->sourceCurrent = src->source;
1805   src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
1806   src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1807   src->varTop = NULL;
1808   src->UCA = UCA;
1809   src->invUCA = ucol_initInverseUCA(status);
1810   src->parsedToken.charsLen = 0;
1811   src->parsedToken.charsOffset = 0;
1812   src->parsedToken.extensionLen = 0;
1813   src->parsedToken.extensionOffset = 0;
1814   src->parsedToken.prefixLen = 0;
1815   src->parsedToken.prefixOffset = 0;
1816   src->parsedToken.flags = 0;
1817   src->parsedToken.strength = UCOL_TOK_UNSET;
1818
1819
1820   if(U_FAILURE(*status)) {
1821     return;
1822   }
1823   src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
1824   if(U_FAILURE(*status)) {
1825     return;
1826   }
1827   uhash_setValueDeleter(src->tailored, uhash_freeBlock);
1828
1829   src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
1830   /* test for NULL */
1831   if (src->opts == NULL) {
1832       *status = U_MEMORY_ALLOCATION_ERROR;
1833       return;
1834   }
1835
1836   uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
1837
1838   // rulesToParse = src->source;
1839   src->lh = 0;
1840   src->listCapacity = 1024;
1841   src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
1842   //Test for NULL
1843   if (src->lh == NULL) {
1844       *status = U_MEMORY_ALLOCATION_ERROR;
1845       return;
1846   }
1847   uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
1848   src->resultLen = 0;
1849
1850   UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1851
1852   // UCOL_RESET_TOP_VALUE
1853   setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1854   // UCOL_FIRST_PRIMARY_IGNORABLE
1855   setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
1856   // UCOL_LAST_PRIMARY_IGNORABLE
1857   setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
1858   // UCOL_FIRST_SECONDARY_IGNORABLE
1859   setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
1860   // UCOL_LAST_SECONDARY_IGNORABLE
1861   setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
1862   // UCOL_FIRST_TERTIARY_IGNORABLE
1863   setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
1864   // UCOL_LAST_TERTIARY_IGNORABLE
1865   setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
1866   // UCOL_FIRST_VARIABLE
1867   setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
1868   // UCOL_LAST_VARIABLE
1869   setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
1870   // UCOL_FIRST_NON_VARIABLE
1871   setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
1872   // UCOL_LAST_NON_VARIABLE
1873   setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1874   // UCOL_FIRST_IMPLICIT
1875   setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
1876   // UCOL_LAST_IMPLICIT
1877   setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
1878   // UCOL_FIRST_TRAILING
1879   setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
1880   // UCOL_LAST_TRAILING
1881   setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
1882   ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
1883 }
1884
1885
1886 void ucol_tok_closeTokenList(UColTokenParser *src) {
1887   if(src->copySet != NULL) {
1888     uset_close(src->copySet);
1889   }
1890   if(src->removeSet != NULL) {
1891     uset_close(src->removeSet);
1892   }
1893   if(src->tailored != NULL) {
1894     uhash_close(src->tailored);
1895   }
1896   if(src->lh != NULL) {
1897     uprv_free(src->lh);
1898   }
1899   if(src->source != NULL) {
1900     uprv_free(src->source);
1901   }
1902   if(src->opts != NULL) {
1903     uprv_free(src->opts);
1904   }
1905 }
1906
1907 #endif /* #if !UCONFIG_NO_COLLATION */
1908