icuSources/i18n/ucol_tok.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2012, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_tok.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module reads a tailoring rule string and produces a list of
  17 * tokens that will be turned into collation elements
  18 *
  19 */
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_COLLATION
  24
  25 #include "unicode/uscript.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/uchar.h"
  28 #include "unicode/uniset.h"
  29
  30 #include "cmemory.h"
  31 #include "cstring.h"
  32 #include "patternprops.h"
  33 #include "ucol_bld.h"
  34 #include "ucol_tok.h"
  35 #include "ulocimp.h"
  36 #include "uresimp.h"
  37
  38 // Define this only for debugging.
  39 // #define DEBUG_FOR_COLL_RULES 1
  40
  41 #ifdef DEBUG_FOR_COLL_RULES
  42 #include <iostream>
  43 #endif
  44
  45 U_NAMESPACE_USE
  46
  47 U_CDECL_BEGIN
  48 static int32_t U_CALLCONV
  49 uhash_hashTokens(const UHashTok k)
  50 {
  51     int32_t hash = 0;
  52     //uint32_t key = (uint32_t)k.integer;
  53     UColToken *key = (UColToken *)k.pointer;
  54     if (key != 0) {
  55         int32_t len = (key->source & 0xFF000000)>>24;
  56         int32_t inc = ((len - 32) / 32) + 1;
  57
  58         const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
  59         const UChar *limit = p + len;
  60
  61         while (p<limit) {
  62             hash = (hash * 37) + *p;
  63             p += inc;
  64         }
  65     }
  66     return hash;
  67 }
  68
  69 static UBool U_CALLCONV
  70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
  71 {
  72     //uint32_t p1 = (uint32_t) key1.integer;
  73     //uint32_t p2 = (uint32_t) key2.integer;
  74     UColToken *p1 = (UColToken *)key1.pointer;
  75     UColToken *p2 = (UColToken *)key2.pointer;
  76     const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
  77     const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
  78     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
  79     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
  80     const UChar *end = s1+s1L-1;
  81
  82     if (p1 == p2) {
  83         return TRUE;
  84     }
  85     if (p1->source == 0 || p2->source == 0) {
  86         return FALSE;
  87     }
  88     if(s1L != s2L) {
  89         return FALSE;
  90     }
  91     if(p1->source == p2->source) {
  92         return TRUE;
  93     }
  94     while((s1 < end) && *s1 == *s2) {
  95         ++s1;
  96         ++s2;
  97     }
  98     if(*s1 == *s2) {
  99         return TRUE;
 100     } else {
 101         return FALSE;
 102     }
 103 }
 104 U_CDECL_END
 105
 106 /*
 107  * Debug messages used to pinpoint where a format error occurred.
 108  * A better way is to include context-sensitive information in syntaxError() function.
 109  *
   * To turn this debugging on, either uncomment the following line, or define it with -DDEBUG_FOR_FORMAT_ERROR
 111  * in the compile line.
 112  */
 113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */
 114
 115 #ifdef DEBUG_FOR_FORMAT_ERROR
 116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
 117 #else
 118 #define DBG_FORMAT_ERROR
 119 #endif
 120
 121
 122 /*
 123  * Controls debug messages so that the output can be compared before and after a
 124  * big change.  Prints the information of every code point that comes out of the
 125  * collation parser and its strength into a file.  When a big change in format
 126  * happens, the files before and after the change should be identical.
 127  *
   * To turn this debugging on, either uncomment the following line, or define it with -DDEBUG_FOR_CODE_POINTS
 129  * in the compile line.
 130  */
 131 // #define DEBUG_FOR_CODE_POINTS 1
 132
 133 #ifdef DEBUG_FOR_CODE_POINTS
 134     FILE* dfcp_fp = NULL;
 135 #endif
 136
 137
 138 typedef struct {
 139     uint32_t startCE;
 140     uint32_t startContCE;
 141     uint32_t limitCE;
 142     uint32_t limitContCE;
 143 } indirectBoundaries;
 144
 145 /* these values are used for finding CE values for indirect positioning. */
 146 /* Indirect positioning is a mechanism for allowing resets on symbolic   */
 147 /* values. It only works for resets and you cannot tailor indirect names */
 148 /* An indirect name can define either an anchor point or a range. An     */
 149 /* anchor point behaves in exactly the same way as a code point in reset */
 150 /* would, except that it cannot be tailored. A range (we currently only  */
 151 /* know for the [top] range will explicitly set the upper bound for      */
 152 /* generated CEs, thus allowing for better control over how many CEs can */
 153 /* be squeezed between in the range without performance penalty.         */
 154 /* In that respect, we use [top] for tailoring of locales that use CJK   */
 155 /* characters. Other indirect values are currently a pure convenience,   */
 156 /* they can be used to assure that the CEs will be always positioned in  */
 157 /* the same place relative to a point with known properties (e.g. first  */
 158 /* primary ignorable). */
  /*
   * Table of indirect-position boundaries, filled at run time through
   * setIndirectBoundaries().  Slot 0 is the one selected by the [top] option;
   * the [first ...] and [last ...] options select slot
   * (i - OPTION_FIRST + 1 + j*2) in ucol_uprv_tok_readAndSetOption(), which for
   * the seven firstLastSub entries gives slots 1..14 -- 15 slots in total.
   */
  static indirectBoundaries ucolIndirectBoundaries[15];
 160 /*
 161 static indirectBoundaries ucolIndirectBoundaries[11] = {
 162 { UCOL_RESET_TOP_VALUE,               0,
 163 UCOL_NEXT_TOP_VALUE,                0 },
 164 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
 165 0,                                  0 },
 166 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
 167 0,                                  0 },
 168 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
 169 0,                                  0 },
 170 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
 171 0,                                  0 },
 172 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
 173 0,                                  0 },
 174 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
 175 0,                                  0 },
 176 { UCOL_FIRST_VARIABLE,                0,
 177 0,                                  0 },
 178 { UCOL_LAST_VARIABLE,                 0,
 179 0,                                  0 },
 180 { UCOL_FIRST_NON_VARIABLE,            0,
 181 0,                                  0 },
 182 { UCOL_LAST_NON_VARIABLE,             0,
 183 0,                                  0 },
 184 };
 185 */
 186
 187 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
 188
 189     // Set values for the top - TODO: once we have values for all the indirects, we are going
 190     // to initalize here.
 191     ucolIndirectBoundaries[indexR].startCE = start[0];
 192     ucolIndirectBoundaries[indexR].startContCE = start[1];
 193     if(end) {
 194         ucolIndirectBoundaries[indexR].limitCE = end[0];
 195         ucolIndirectBoundaries[indexR].limitContCE = end[1];
 196     } else {
 197         ucolIndirectBoundaries[indexR].limitCE = 0;
 198         ucolIndirectBoundaries[indexR].limitContCE = 0;
 199     }
 200 }
 201
 202
 203 static inline
 204 void syntaxError(const UChar* rules,
 205                  int32_t pos,
 206                  int32_t rulesLen,
 207                  UParseError* parseError)
 208 {
 209     parseError->offset = pos;
 210     parseError->line = 0 ; /* we are not using line numbers */
 211
 212     // for pre-context
 213     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
 214     int32_t stop  = pos;
 215
 216     u_memcpy(parseError->preContext,rules+start,stop-start);
 217     //null terminate the buffer
 218     parseError->preContext[stop-start] = 0;
 219
 220     //for post-context
 221     start = pos+1;
 222     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
 223     rulesLen;
 224
 225     if(start < stop) {
 226         u_memcpy(parseError->postContext,rules+start,stop-start);
 227         //null terminate the buffer
 228         parseError->postContext[stop-start]= 0;
 229     } else {
 230         parseError->postContext[0] = 0;
 231     }
 232 }
 233
 234 static
 235 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
 236     switch(attrib) {
 237     case UCOL_HIRAGANA_QUATERNARY_MODE:
 238         opts->hiraganaQ = value;
 239         break;
 240     case UCOL_FRENCH_COLLATION:
 241         opts->frenchCollation = value;
 242         break;
 243     case UCOL_ALTERNATE_HANDLING:
 244         opts->alternateHandling = value;
 245         break;
 246     case UCOL_CASE_FIRST:
 247         opts->caseFirst = value;
 248         break;
 249     case UCOL_CASE_LEVEL:
 250         opts->caseLevel = value;
 251         break;
 252     case UCOL_NORMALIZATION_MODE:
 253         opts->normalizationMode = value;
 254         break;
 255     case UCOL_STRENGTH:
 256         opts->strength = value;
 257         break;
 258     case UCOL_NUMERIC_COLLATION:
 259         opts->numericCollation = value;
 260         break;
 261     case UCOL_ATTRIBUTE_COUNT:
 262     default:
 263         break;
 264     }
 265 }
 266
 267 #define UTOK_OPTION_COUNT 22
 268
 269 static UBool didInit = FALSE;
 270 /* we can be strict, or we can be lenient */
 271 /* I'd surely be lenient with the option arguments */
 272 /* maybe even with options */
 273 U_STRING_DECL(suboption_00, "non-ignorable", 13);
 274 U_STRING_DECL(suboption_01, "shifted",        7);
 275
 276 U_STRING_DECL(suboption_02, "lower",          5);
 277 U_STRING_DECL(suboption_03, "upper",          5);
 278 U_STRING_DECL(suboption_04, "off",            3);
 279 U_STRING_DECL(suboption_05, "on",             2);
 280 U_STRING_DECL(suboption_06, "1",              1);
 281 U_STRING_DECL(suboption_07, "2",              1);
 282 U_STRING_DECL(suboption_08, "3",              1);
 283 U_STRING_DECL(suboption_09, "4",              1);
 284 U_STRING_DECL(suboption_10, "I",              1);
 285
 286 U_STRING_DECL(suboption_11, "primary",        7);
 287 U_STRING_DECL(suboption_12, "secondary",      9);
 288 U_STRING_DECL(suboption_13, "tertiary",       8);
 289 U_STRING_DECL(suboption_14, "variable",       8);
 290 U_STRING_DECL(suboption_15, "regular",        7);
 291 U_STRING_DECL(suboption_16, "implicit",       8);
 292 U_STRING_DECL(suboption_17, "trailing",       8);
 293
 294
 295 U_STRING_DECL(option_00,    "undefined",      9);
 296 U_STRING_DECL(option_01,    "rearrange",      9);
 297 U_STRING_DECL(option_02,    "alternate",      9);
 298 U_STRING_DECL(option_03,    "backwards",      9);
 299 U_STRING_DECL(option_04,    "variable top",  12);
 300 U_STRING_DECL(option_05,    "top",            3);
 301 U_STRING_DECL(option_06,    "normalization", 13);
 302 U_STRING_DECL(option_07,    "caseLevel",      9);
 303 U_STRING_DECL(option_08,    "caseFirst",      9);
 304 U_STRING_DECL(option_09,    "scriptOrder",   11);
 305 U_STRING_DECL(option_10,    "charsetname",   11);
 306 U_STRING_DECL(option_11,    "charset",        7);
 307 U_STRING_DECL(option_12,    "before",         6);
 308 U_STRING_DECL(option_13,    "hiraganaQ",      9);
 309 U_STRING_DECL(option_14,    "strength",       8);
 310 U_STRING_DECL(option_15,    "first",          5);
 311 U_STRING_DECL(option_16,    "last",           4);
 312 U_STRING_DECL(option_17,    "optimize",       8);
 313 U_STRING_DECL(option_18,    "suppressContractions",         20);
 314 U_STRING_DECL(option_19,    "numericOrdering",              15);
 315 U_STRING_DECL(option_20,    "import",         6);
 316 U_STRING_DECL(option_21,    "reorder",         7);
 317
 318 /*
 319 [last variable] last variable value
 320 [last primary ignorable] largest CE for primary ignorable
 321 [last secondary ignorable] largest CE for secondary ignorable
 322 [last tertiary ignorable] largest CE for tertiary ignorable
 323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
 324 */
 325
 326
  /*
   * Suboption tables.  Each entry is {name, length of name, attribute value};
   * the parsers below read them as subName, subLen and attrVal.
   */

  /* Arguments of the "alternate" option. */
  static const ucolTokSuboption alternateSub[2] = {
      {suboption_00, 13, UCOL_NON_IGNORABLE},
      {suboption_01,  7, UCOL_SHIFTED}
  };

  /* Arguments of the "caseFirst" option. */
  static const ucolTokSuboption caseFirstSub[3] = {
      {suboption_02, 5, UCOL_LOWER_FIRST},
      {suboption_03,  5, UCOL_UPPER_FIRST},
      {suboption_04,  3, UCOL_OFF},
  };

  /* Shared by the on/off options (caseLevel, normalization, hiraganaQ, numericOrdering). */
  static const ucolTokSuboption onOffSub[2] = {
      {suboption_04, 3, UCOL_OFF},
      {suboption_05, 2, UCOL_ON}
  };

  /* Argument of the "backwards" option: only "2" is accepted and it maps to UCOL_ON. */
  static const ucolTokSuboption frenchSub[1] = {
      {suboption_07, 1, UCOL_ON}
  };

  /* Arguments of the "before" option; the parser reports attrVal + 1 in its result. */
  static const ucolTokSuboption beforeSub[3] = {
      {suboption_06, 1, UCOL_PRIMARY},
      {suboption_07, 1, UCOL_SECONDARY},
      {suboption_08, 1, UCOL_TERTIARY}
  };

  /* Arguments of the "strength" option. */
  static const ucolTokSuboption strengthSub[5] = {
      {suboption_06, 1, UCOL_PRIMARY},
      {suboption_07, 1, UCOL_SECONDARY},
      {suboption_08, 1, UCOL_TERTIARY},
      {suboption_09, 1, UCOL_QUATERNARY},
      {suboption_10, 1, UCOL_IDENTICAL},
  };

  /*
   * Arguments of the "first" and "last" options.  Only the position of the
   * matching entry is used (it selects the indirect boundary slot in
   * ucol_uprv_tok_readAndSetOption), so every attrVal is the same placeholder.
   */
  static const ucolTokSuboption firstLastSub[7] = {
      {suboption_11, 7, UCOL_PRIMARY},
      {suboption_12, 9, UCOL_PRIMARY},
      {suboption_13, 8, UCOL_PRIMARY},
      {suboption_14, 8, UCOL_PRIMARY},
      {suboption_15, 7, UCOL_PRIMARY},
      {suboption_16, 8, UCOL_PRIMARY},
      {suboption_17, 8, UCOL_PRIMARY},
  };
 370
 371 enum OptionNumber {
 372     OPTION_ALTERNATE_HANDLING = 0,
 373     OPTION_FRENCH_COLLATION,
 374     OPTION_CASE_LEVEL,
 375     OPTION_CASE_FIRST,
 376     OPTION_NORMALIZATION_MODE,
 377     OPTION_HIRAGANA_QUATERNARY,
 378     OPTION_STRENGTH,
 379     OPTION_NUMERIC_COLLATION,
 380     OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
 381     OPTION_VARIABLE_TOP,
 382     OPTION_REARRANGE,
 383     OPTION_BEFORE,
 384     OPTION_TOP,
 385     OPTION_FIRST,
 386     OPTION_LAST,
 387     OPTION_OPTIMIZE,
 388     OPTION_SUPPRESS_CONTRACTIONS,
 389     OPTION_UNDEFINED,
 390     OPTION_SCRIPT_ORDER,
 391     OPTION_CHARSET_NAME,
 392     OPTION_CHARSET,
 393     OPTION_IMPORT,
 394     OPTION_SCRIPTREORDER
 395 } ;
 396
  /*
   * The option names recognised inside '[...]' in a rule string.
   * Each entry is {name, length of name, suboption table or NULL, number of
   * suboptions, collation attribute}; entries that do not set an attribute
   * directly carry UCOL_ATTRIBUTE_COUNT.
   * The position of an entry is its option number: the parser switches on the
   * index with the OptionNumber constants, so the order here must match that enum.
   * Names are tried in this order with a prefix comparison, which is why
   * "charsetname" is listed before "charset".
   */
  static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
      /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
      /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
      /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
      /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
      /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
      /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
      /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
      /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
      /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
      /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
      /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
      /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
      /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
      /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
      /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
      /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
      /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
      /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
      /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
      /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"charset"        */
      /*20*/ {option_20,  6, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"import"        */
      /*21*/ {option_21,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"reorder"        */
  };
 421
 422 static
 423 int32_t u_strncmpNoCase(const UChar     *s1,
 424                         const UChar     *s2,
 425                         int32_t     n)
 426 {
 427     if(n > 0) {
 428         int32_t rc;
 429         for(;;) {
 430             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
 431             if(rc != 0 || *s1 == 0 || --n == 0) {
 432                 return rc;
 433             }
 434             ++s1;
 435             ++s2;
 436         }
 437     }
 438     return 0;
 439 }
 440
 441 static
 442 void ucol_uprv_tok_initData() {
 443     if(!didInit) {
 444         U_STRING_INIT(suboption_00, "non-ignorable", 13);
 445         U_STRING_INIT(suboption_01, "shifted",        7);
 446
 447         U_STRING_INIT(suboption_02, "lower",          5);
 448         U_STRING_INIT(suboption_03, "upper",          5);
 449         U_STRING_INIT(suboption_04, "off",            3);
 450         U_STRING_INIT(suboption_05, "on",             2);
 451
 452         U_STRING_INIT(suboption_06, "1",              1);
 453         U_STRING_INIT(suboption_07, "2",              1);
 454         U_STRING_INIT(suboption_08, "3",              1);
 455         U_STRING_INIT(suboption_09, "4",              1);
 456         U_STRING_INIT(suboption_10, "I",              1);
 457
 458         U_STRING_INIT(suboption_11, "primary",        7);
 459         U_STRING_INIT(suboption_12, "secondary",      9);
 460         U_STRING_INIT(suboption_13, "tertiary",       8);
 461         U_STRING_INIT(suboption_14, "variable",       8);
 462         U_STRING_INIT(suboption_15, "regular",        7);
 463         U_STRING_INIT(suboption_16, "implicit",       8);
 464         U_STRING_INIT(suboption_17, "trailing",       8);
 465
 466
 467         U_STRING_INIT(option_00, "undefined",      9);
 468         U_STRING_INIT(option_01, "rearrange",      9);
 469         U_STRING_INIT(option_02, "alternate",      9);
 470         U_STRING_INIT(option_03, "backwards",      9);
 471         U_STRING_INIT(option_04, "variable top",  12);
 472         U_STRING_INIT(option_05, "top",            3);
 473         U_STRING_INIT(option_06, "normalization", 13);
 474         U_STRING_INIT(option_07, "caseLevel",      9);
 475         U_STRING_INIT(option_08, "caseFirst",      9);
 476         U_STRING_INIT(option_09, "scriptOrder",   11);
 477         U_STRING_INIT(option_10, "charsetname",   11);
 478         U_STRING_INIT(option_11, "charset",        7);
 479         U_STRING_INIT(option_12, "before",         6);
 480         U_STRING_INIT(option_13, "hiraganaQ",      9);
 481         U_STRING_INIT(option_14, "strength",       8);
 482         U_STRING_INIT(option_15, "first",          5);
 483         U_STRING_INIT(option_16, "last",           4);
 484         U_STRING_INIT(option_17, "optimize",       8);
 485         U_STRING_INIT(option_18, "suppressContractions",         20);
 486         U_STRING_INIT(option_19, "numericOrdering",      15);
 487         U_STRING_INIT(option_20, "import ",        6);
 488         U_STRING_INIT(option_21, "reorder",        7);
 489         didInit = TRUE;
 490     }
 491 }
 492
 493
 494 // This function reads basic options to set in the runtime collator
 495 // used by data driven tests. Should not support build time options
 496 U_CAPI const UChar * U_EXPORT2
 497 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
 498                          UColAttribute *attrib, UColAttributeValue *value,
 499                          UErrorCode *status)
 500 {
 501     uint32_t i = 0;
 502     int32_t j=0;
 503     UBool foundOption = FALSE;
 504     const UChar *optionArg = NULL;
 505
 506     ucol_uprv_tok_initData();
 507
 508     while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
 509         start++;
 510     }
 511     if(start >= end) {
 512         return NULL;
 513     }
 514     /* skip opening '[' */
 515     if(*start == 0x005b) {
 516         start++;
 517     } else {
 518         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
 519         return NULL;
 520     }
 521
 522     while(i < UTOK_OPTION_COUNT) {
 523         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
 524             foundOption = TRUE;
 525             if(end - start > rulesOptions[i].optionLen) {
 526                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
 527                 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
 528                     optionArg++;
 529                 }
 530             }
 531             break;
 532         }
 533         i++;
 534     }
 535
 536     if(!foundOption) {
 537         *status = U_ILLEGAL_ARGUMENT_ERROR;
 538         return NULL;
 539     }
 540
 541     if(optionArg) {
 542         for(j = 0; j<rulesOptions[i].subSize; j++) {
 543             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 544                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
 545                 *attrib = rulesOptions[i].attr;
 546                 *value = rulesOptions[i].subopts[j].attrVal;
 547                 optionArg += rulesOptions[i].subopts[j].subLen;
 548                 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
 549                     optionArg++;
 550                 }
 551                 if(*optionArg == 0x005d) {
 552                     optionArg++;
 553                     return optionArg;
 554                 } else {
 555                     *status = U_ILLEGAL_ARGUMENT_ERROR;
 556                     return NULL;
 557                 }
 558             }
 559         }
 560     }
 561     *status = U_ILLEGAL_ARGUMENT_ERROR;
 562     return NULL;
 563 }
 564
 565 static
 566 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
 567     while(*start != 0x005b) { /* advance while we find the first '[' */
 568         start++;
 569     }
 570     // now we need to get a balanced set of '[]'. The problem is that a set can have
 571     // many, and *end point to the first closing '['
 572     int32_t noOpenBraces = 1;
 573     int32_t current = 1; // skip the opening brace
 574     while(start+current < end && noOpenBraces != 0) {
 575         if(start[current] == 0x005b) {
 576             noOpenBraces++;
 577         } else if(start[current] == 0x005D) { // closing brace
 578             noOpenBraces--;
 579         }
 580         current++;
 581     }
 582
 583     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
 584         *status = U_ILLEGAL_ARGUMENT_ERROR;
 585         return NULL;
 586     }
 587     return uset_openPattern(start, current, status);
 588 }
 589
 590 /**
 591  * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
 592  * @param start Pointer to the start UChar.
 593  * @param end Pointer to the last valid pointer beyond which the option will not extend.
 594  * @param optionArg Address of the pointer at which the options start (after the option name)
 595  * @return The index of the option, or -1 if the option is not valid.
 596  */
 597 static
 598 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
 599     int32_t i = 0;
 600     ucol_uprv_tok_initData();
 601
 602     while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
 603         start++;
 604     }
 605     while(i < UTOK_OPTION_COUNT) {
 606         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
 607             if(end - start > rulesOptions[i].optionLen) {
 608                 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
 609                 while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */
 610                     (*optionArg)++;
 611                 }
 612             }
 613             break;
 614         }
 615         i++;
 616     }
 617     if(i == UTOK_OPTION_COUNT) {
 618         i = -1; // didn't find an option
 619     }
 620     return i;
 621 }
 622
 623
 624 static
 625 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
 626     int32_t codeCount = 0;
 627     int32_t codeIndex = 0;
 628     char conversion[64];
 629     int32_t tokenLength = 0;
 630     const UChar* space;
 631
 632     const UChar* current = src->current;
 633     const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
 634
 635     // eat leading whitespace
 636     while(current < end && u_isWhitespace(*current)) {
 637         current++;
 638     }
 639
 640     while(current < end) {
 641         space = u_memchr(current, 0x0020, end - current);
 642         space = space == 0 ? end : space;
 643         tokenLength = space - current;
 644         if (tokenLength < 4) {
 645             *status = U_INVALID_FORMAT_ERROR;
 646             return;
 647         }
 648         codeCount++;
 649         current += tokenLength;
 650         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
 651             ++current;
 652         }
 653     }
 654
 655     if (codeCount == 0) {
 656         *status = U_INVALID_FORMAT_ERROR;
 657     }
 658
 659     src->reorderCodesLength = codeCount;
 660     src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
 661     current = src->current;
 662
 663     // eat leading whitespace
 664     while(current < end && u_isWhitespace(*current)) {
 665         current++;
 666     }
 667
 668     while(current < end) {
 669         space = u_memchr(current, 0x0020, end - current);
 670         space = space == 0 ? end : space;
 671         tokenLength = space - current;
 672         if (tokenLength < 4) {
 673             *status = U_ILLEGAL_ARGUMENT_ERROR;
 674             return;
 675         } else {
 676             u_UCharsToChars(current, conversion, tokenLength);
 677             conversion[tokenLength] = '\0';
 678             src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
 679             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
 680                 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
 681             }
 682             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
 683                 *status = U_ILLEGAL_ARGUMENT_ERROR;
 684             }
 685         }
 686         codeIndex++;
 687         current += tokenLength;
 688         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
 689             ++current;
 690         }
 691     }
 692 }
 693
 694 // reads and conforms to various options in rules
 695 // end is the position of the first closing ']'
 696 // However, some of the options take an UnicodeSet definition
 697 // which needs to duplicate the closing ']'
 698 // for example: '[copy [\uAC00-\uD7FF]]'
 699 // These options will move end to the second ']' and the
 700 // caller will set the current to it.
 701 static
 702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
 703     const UChar* start = src->current;
 704     int32_t i = 0;
 705     int32_t j=0;
 706     const UChar *optionArg = NULL;
 707
 708     uint8_t result = 0;
 709
 710     start++; /*skip opening '['*/
 711     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
 712     if(optionArg) {
 713         src->current = optionArg;
 714     }
 715
 716     if(i < 0) {
 717         *status = U_ILLEGAL_ARGUMENT_ERROR;
 718     } else {
 719         int32_t noOpenBraces = 1;
 720         switch(i) {
 721     case OPTION_ALTERNATE_HANDLING:
 722     case OPTION_FRENCH_COLLATION:
 723     case OPTION_CASE_LEVEL:
 724     case OPTION_CASE_FIRST:
 725     case OPTION_NORMALIZATION_MODE:
 726     case OPTION_HIRAGANA_QUATERNARY:
 727     case OPTION_STRENGTH:
 728     case OPTION_NUMERIC_COLLATION:
 729         if(optionArg) {
 730             for(j = 0; j<rulesOptions[i].subSize; j++) {
 731                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 732                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
 733                     result =  UCOL_TOK_SUCCESS;
 734                 }
 735             }
 736         }
 737         if(result == 0) {
 738             *status = U_ILLEGAL_ARGUMENT_ERROR;
 739         }
 740         break;
 741     case OPTION_VARIABLE_TOP:
 742         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
 743         break;
 744     case OPTION_REARRANGE:
 745         result = UCOL_TOK_SUCCESS;
 746         break;
 747     case OPTION_BEFORE:
 748         if(optionArg) {
 749             for(j = 0; j<rulesOptions[i].subSize; j++) {
 750                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 751                     result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
 752                 }
 753             }
 754         }
 755         if(result == 0) {
 756             *status = U_ILLEGAL_ARGUMENT_ERROR;
 757         }
 758         break;
 759     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
 760         /* index to this array will be src->parsedToken.indirectIndex*/
 761         src->parsedToken.indirectIndex = 0;
 762         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
 763         break;
 764     case OPTION_FIRST:
 765     case OPTION_LAST: /* first, last */
 766         for(j = 0; j<rulesOptions[i].subSize; j++) {
 767             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
 768                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
 769                 // element of indirect boundaries is reserved for top.
 770                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
 771                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
 772             }
 773         }
 774         if(result == 0) {
 775             *status = U_ILLEGAL_ARGUMENT_ERROR;
 776         }
 777         break;
 778     case OPTION_OPTIMIZE:
 779     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
 780         // we need to move end here
 781         src->current++; // skip opening brace
 782         while(src->current < src->end && noOpenBraces != 0) {
 783             if(*src->current == 0x005b) {
 784                 noOpenBraces++;
 785             } else if(*src->current == 0x005D) { // closing brace
 786                 noOpenBraces--;
 787             }
 788             src->current++;
 789         }
 790         result = UCOL_TOK_SUCCESS;
 791         break;
 792     case OPTION_SCRIPTREORDER:
 793         ucol_tok_parseScriptReorder(src, status);
 794         break;
 795     default:
 796         *status = U_UNSUPPORTED_ERROR;
 797         break;
 798         }
 799     }
 800     src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
 801     return result;
 802 }
 803
 804
 805 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
 806     if (stuff == NULL || len <= 0) {
 807         return;
 808     }
 809     UnicodeString tempStuff(FALSE, stuff, len);
 810     if(src->extraCurrent+len >= src->extraEnd) {
 811         /* reallocate */
 812         if (stuff >= src->source && stuff <= src->end) {
 813             // Copy the "stuff" contents into tempStuff's own buffer.
 814             // UnicodeString is copy-on-write.
 815             if (len > 0) {
 816                 tempStuff.setCharAt(0, tempStuff[0]);
 817             } else {
 818                 tempStuff.remove();
 819             }
 820         }
 821         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
 822         if(newSrc != NULL) {
 823             src->current = newSrc + (src->current - src->source);
 824             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
 825             src->end = newSrc + (src->end - src->source);
 826             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
 827             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
 828             src->source = newSrc;
 829         } else {
 830             *status = U_MEMORY_ALLOCATION_ERROR;
 831             return;
 832         }
 833     }
 834     if(len == 1) {
 835         *src->extraCurrent++ = tempStuff[0];
 836     } else {
 837         u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
 838         src->extraCurrent += len;
 839     }
 840 }
 841
 842 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
 843     /*
 844     top = TRUE;
 845     */
 846     UChar buff[5];
 847     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 848     buff[0] = 0xFFFE;
 849     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
 850     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
 851     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
 852         src->parsedToken.charsLen = 3;
 853         ucol_tok_addToExtraCurrent(src, buff, 3, status);
 854     } else {
 855         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
 856         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
 857         src->parsedToken.charsLen = 5;
 858         ucol_tok_addToExtraCurrent(src, buff, 5, status);
 859     }
 860     return TRUE;
 861 }
 862
 863 static UBool isCharNewLine(UChar c){
 864     switch(c){
 865     case 0x000A: /* LF  */
 866     case 0x000D: /* CR  */
 867     case 0x000C: /* FF  */
 868     case 0x0085: /* NEL */
 869     case 0x2028: /* LS  */
 870     case 0x2029: /* PS  */
 871         return TRUE;
 872     default:
 873         return FALSE;
 874     }
 875 }
 876
 877 /*
 878  * This function is called several times when a range is processed.  Each time, the next code point
 879  * is processed.
 880  * The following variables must be set before calling this function:
 881  *   src->currentRangeCp:  The current code point to process.
 882  *   src->lastRangeCp: The last code point in the range.
 883  * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
 884  */
 885 static const UChar*
 886 ucol_tok_processNextCodePointInRange(UColTokenParser *src,
 887                                      UErrorCode *status)
 888 {
 889   // Append current code point to source
 890   UChar buff[U16_MAX_LENGTH];
 891   uint32_t i = 0;
 892
 893   uint32_t nChars = U16_LENGTH(src->currentRangeCp);
 894   src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
 895   src->parsedToken.charsLen = nChars;
 896
 897   U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
 898   ucol_tok_addToExtraCurrent(src, buff, nChars, status);
 899
 900   ++src->currentRangeCp;
 901   if (src->currentRangeCp > src->lastRangeCp) {
 902     src->inRange = FALSE;
 903
 904     if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
 905       src->isStarred = FALSE;
 906     }
 907   } else {
 908     src->previousCp = src->currentRangeCp;
 909   }
 910   return src->current;
 911 }
 912
 913 /*
 914  * This function is called several times when a starred list is processed.  Each time, the next code point
 915  * in the list is processed.
 916  * The following variables must be set before calling this function:
 917  *   src->currentStarredCharIndex:  Index (in src->source) of the first char of the current code point.
 918  *   src->lastStarredCharIndex: Index to the last character in the list.
 919  * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
 920  */
 921 static const UChar*
 922 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
 923 {
 924   // Extract the characters corresponding to the next code point.
 925   UChar32 cp;
 926   src->parsedToken.charsOffset = src->currentStarredCharIndex;
 927   int32_t prev = src->currentStarredCharIndex;
 928   U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
 929   src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
 930
 931   // When we are done parsing the starred string, turn the flag off so that
 932   // the normal processing is restored.
 933   if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
 934     src->isStarred = FALSE;
 935   }
 936   src->previousCp = cp;
 937   return src->current;
 938 }
 939
 940 /*
 941  * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
 942  *
 943  * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
 944  *  # : Comment character
 945  *  & : Reset operator
 946  *  = : Equality
 947  *  < : Primary collation
 948  *  << : Secondary collation
 949  *  <<< : Tertiary collation
 950  *  ; : Secondary collation
 951  *  , : Tertiary collation
 952  *  / : Expansions
 953  *  | : Prefix
 954  *  - : Range
 955
 956  *  ! : Java Thai modifier, ignored
 957  *  @ : French only
 958
 959  * [] : Options
 960  * '' : Quotes
 961  *
 962  *  Along with operators =, <, <<, <<<, the operator * is supported to indicate a list.  For example, &a<*bcdexyz
 963  *  is equivalent to &a<b<c<d<e<x<y<z.  In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
 964  *  This function do not separate the tokens in a list.  Instead, &a<*b-ex-z is parsed as three tokens - "&a",
 965  *  "<*b", "-ex", "-z".  The strength (< in this case), whether in a list, whether in a range and the previous
 966  *  character returned as cached so that the calling program can do further splitting.
 967  */
 968 static const UChar*
 969 ucol_tok_parseNextTokenInternal(UColTokenParser *src,
 970                                 UBool startOfRules,
 971                                 UParseError *parseError,
 972                                 UErrorCode *status)
 973 {
 974     UBool variableTop = FALSE;
 975     UBool top = FALSE;
 976     UBool inChars = TRUE;
 977     UBool inQuote = FALSE;
 978     UBool wasInQuote = FALSE;
 979     uint8_t before = 0;
 980     UBool isEscaped = FALSE;
 981
 982     // TODO: replace these variables with src->parsedToken counterparts
 983     // no need to use them anymore since we have src->parsedToken.
 984     // Ideally, token parser would be a nice class... Once, when I have
 985     // more time (around 2020 probably).
 986     uint32_t newExtensionLen = 0;
 987     uint32_t extensionOffset = 0;
 988     uint32_t newStrength = UCOL_TOK_UNSET;
 989     UChar buff[10];
 990
 991     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
 992     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
 993     src->parsedToken.indirectIndex = 0;
 994
 995     while (src->current < src->end) {
 996         UChar ch = *(src->current);
 997
 998         if (inQuote) {
 999             if (ch == 0x0027/*'\''*/) {
1000                 inQuote = FALSE;
1001             } else {
1002                 if ((src->parsedToken.charsLen == 0) || inChars) {
1003                     if(src->parsedToken.charsLen == 0) {
1004                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1005                     }
1006                     src->parsedToken.charsLen++;
1007                 } else {
1008                     if(newExtensionLen == 0) {
1009                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1010                     }
1011                     newExtensionLen++;
1012                 }
1013             }
1014         }else if(isEscaped){
1015             isEscaped =FALSE;
1016             if (newStrength == UCOL_TOK_UNSET) {
1017                 *status = U_INVALID_FORMAT_ERROR;
1018                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1019                 DBG_FORMAT_ERROR
1020                 return NULL;
1021                 // enabling rules to start with non-tokens a < b
1022                 // newStrength = UCOL_TOK_RESET;
1023             }
1024             if(ch != 0x0000  && src->current != src->end) {
1025                 if (inChars) {
1026                     if(src->parsedToken.charsLen == 0) {
1027                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1028                     }
1029                     src->parsedToken.charsLen++;
1030                 } else {
1031                     if(newExtensionLen == 0) {
1032                         extensionOffset = (uint32_t)(src->current - src->source);
1033                     }
1034                     newExtensionLen++;
1035                 }
1036             }
1037         }else {
1038             if(!PatternProps::isWhiteSpace(ch)) {
1039                 /* Sets the strength for this entry */
1040                 switch (ch) {
1041                 case 0x003D/*'='*/ :
1042                     if (newStrength != UCOL_TOK_UNSET) {
1043                         goto EndOfLoop;
1044                     }
1045
1046                     /* if we start with strength, we'll reset to top */
1047                     if(startOfRules == TRUE) {
1048                         src->parsedToken.indirectIndex = 5;
1049                         top = ucol_tok_doSetTop(src, status);
1050                         newStrength = UCOL_TOK_RESET;
1051                         goto EndOfLoop;
1052                     }
1053                     newStrength = UCOL_IDENTICAL;
1054                     if(*(src->current+1) == 0x002A) {/*'*'*/
1055                         src->current++;
1056                         src->isStarred = TRUE;
1057                     }
1058                     break;
1059
1060                 case 0x002C/*','*/:
1061                     if (newStrength != UCOL_TOK_UNSET) {
1062                         goto EndOfLoop;
1063                     }
1064
1065                     /* if we start with strength, we'll reset to top */
1066                     if(startOfRules == TRUE) {
1067                         src->parsedToken.indirectIndex = 5;
1068                         top = ucol_tok_doSetTop(src, status);
1069                         newStrength = UCOL_TOK_RESET;
1070                         goto EndOfLoop;
1071                     }
1072                     newStrength = UCOL_TERTIARY;
1073                     break;
1074
1075                 case  0x003B/*';'*/:
1076                     if (newStrength != UCOL_TOK_UNSET) {
1077                         goto EndOfLoop;
1078                     }
1079
1080                     /* if we start with strength, we'll reset to top */
1081                     if(startOfRules == TRUE) {
1082                         src->parsedToken.indirectIndex = 5;
1083                         top = ucol_tok_doSetTop(src, status);
1084                         newStrength = UCOL_TOK_RESET;
1085                         goto EndOfLoop;
1086                     }
1087                     newStrength = UCOL_SECONDARY;
1088                     break;
1089
1090                 case 0x003C/*'<'*/:
1091                     if (newStrength != UCOL_TOK_UNSET) {
1092                         goto EndOfLoop;
1093                     }
1094
1095                     /* if we start with strength, we'll reset to top */
1096                     if(startOfRules == TRUE) {
1097                         src->parsedToken.indirectIndex = 5;
1098                         top = ucol_tok_doSetTop(src, status);
1099                         newStrength = UCOL_TOK_RESET;
1100                         goto EndOfLoop;
1101                     }
1102                     /* before this, do a scan to verify whether this is */
1103                     /* another strength */
1104                     if(*(src->current+1) == 0x003C) {
1105                         src->current++;
1106                         if(*(src->current+1) == 0x003C) {
1107                             src->current++; /* three in a row! */
1108                             newStrength = UCOL_TERTIARY;
1109                         } else { /* two in a row */
1110                             newStrength = UCOL_SECONDARY;
1111                         }
1112                     } else { /* just one */
1113                         newStrength = UCOL_PRIMARY;
1114                     }
1115                     if(*(src->current+1) == 0x002A) {/*'*'*/
1116                         src->current++;
1117                         src->isStarred = TRUE;
1118                     }
1119                     break;
1120
1121                 case 0x0026/*'&'*/:
1122                     if (newStrength != UCOL_TOK_UNSET) {
1123                         /**/
1124                         goto EndOfLoop;
1125                     }
1126
1127                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
1128                     break;
1129
1130                 case 0x005b/*'['*/:
1131                     /* options - read an option, analyze it */
1132                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
1133                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
1134                         if(U_SUCCESS(*status)) {
1135                             if(result & UCOL_TOK_TOP) {
1136                                 if(newStrength == UCOL_TOK_RESET) {
1137                                     top = ucol_tok_doSetTop(src, status);
1138                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
1139                                         src->parsedToken.charsLen+=2;
1140                                         buff[0] = 0x002d;
1141                                         buff[1] = before;
1142                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
1143                                     }
1144
1145                                     src->current++;
1146                                     goto EndOfLoop;
1147                                 } else {
1148                                     *status = U_INVALID_FORMAT_ERROR;
1149                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1150                                     DBG_FORMAT_ERROR
1151                                 }
1152                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
1153                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
1154                                     variableTop = TRUE;
1155                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1156                                     src->parsedToken.charsLen = 1;
1157                                     buff[0] = 0xFFFF;
1158                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
1159                                     src->current++;
1160                                     goto EndOfLoop;
1161                                 } else {
1162                                     *status = U_INVALID_FORMAT_ERROR;
1163                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1164                                     DBG_FORMAT_ERROR
1165                                 }
1166                             } else if (result & UCOL_TOK_BEFORE){
1167                                 if(newStrength == UCOL_TOK_RESET) {
1168                                     before = result & UCOL_TOK_BEFORE;
1169                                 } else {
1170                                     *status = U_INVALID_FORMAT_ERROR;
1171                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1172                                     DBG_FORMAT_ERROR
1173                                 }
1174                             }
1175                         } else {
1176                             *status = U_INVALID_FORMAT_ERROR;
1177                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1178                             DBG_FORMAT_ERROR
1179                             return NULL;
1180                         }
1181                     }
1182                     break;
1183                 case 0x0021/*! skip java thai modifier reordering*/:
1184                     break;
1185                 case 0x002F/*'/'*/:
1186                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
1187                     inChars = FALSE; /* we're now processing expansion */
1188                     break;
1189                 case 0x005C /* back slash for escaped chars */:
1190                     isEscaped = TRUE;
1191                     break;
1192                     /* found a quote, we're gonna start copying */
1193                 case 0x0027/*'\''*/:
1194                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
1195                       *status = U_INVALID_FORMAT_ERROR;
1196                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1197                       DBG_FORMAT_ERROR
1198                       return NULL;
1199                       // enabling rules to start with a non-token character a < b
1200                       // newStrength = UCOL_TOK_RESET;
1201                     }
1202
1203                     inQuote = TRUE;
1204
1205                     if(inChars) { /* we're doing characters */
1206                         if(wasInQuote == FALSE) {
1207                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1208                         }
1209                         if (src->parsedToken.charsLen != 0) {
1210                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1211                         }
1212                         src->parsedToken.charsLen++;
1213                     } else { /* we're doing an expansion */
1214                         if(wasInQuote == FALSE) {
1215                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1216                         }
1217                         if (newExtensionLen != 0) {
1218                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
1219                         }
1220                         newExtensionLen++;
1221                     }
1222
1223                     wasInQuote = TRUE;
1224
1225                     ch = *(++(src->current));
1226                     if(ch == 0x0027) { /* copy the double quote */
1227                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1228                         inQuote = FALSE;
1229                     }
1230                     break;
1231
1232                     /* '@' is french only if the strength is not currently set */
1233                     /* if it is, it's just a regular character in collation rules */
1234                 case 0x0040/*'@'*/:
1235                     if (newStrength == UCOL_TOK_UNSET) {
1236                         src->opts->frenchCollation = UCOL_ON;
1237                         break;
1238                     }
1239
1240                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1241                     // we want to store read characters to the prefix part and continue reading
1242                     // the characters (proper way would be to restart reading the chars, but in
1243                     // that case we would have to complicate the token hasher, which I do not
1244                     // intend to play with. Instead, we will do prefixes when prefixes are due
1245                     // (before adding the elements).
1246                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1247                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
1248
1249                     if(inChars) { /* we're doing characters */
1250                         if(wasInQuote == FALSE) {
1251                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1252                         }
1253                         if (src->parsedToken.charsLen != 0) {
1254                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1255                         }
1256                         src->parsedToken.charsLen++;
1257                     }
1258
1259                     wasInQuote = TRUE;
1260
1261                     do {
1262                         ch = *(++(src->current));
1263                         // skip whitespace between '|' and the character
1264                     } while (PatternProps::isWhiteSpace(ch));
1265                     break;
1266
1267                     //charsOffset = 0;
1268                     //newCharsLen = 0;
1269                     //break; // We want to store the whole prefix/character sequence. If we break
1270                     // the '|' is going to get lost.
1271
1272                 case 0x002D /*-*/: /* A range. */
1273                     if (newStrength != UCOL_TOK_UNSET) {
1274                       // While processing the pending token, the isStarred field
1275                       // is reset, so it needs to be saved for the next
1276                       // invocation.
1277                       src->savedIsStarred = src->isStarred;
1278                       goto EndOfLoop;
1279                    }
1280                    src->isStarred = src->savedIsStarred;
1281
1282                    // Ranges are valid only in starred tokens.
1283                    if (!src->isStarred) {
1284                      *status = U_INVALID_FORMAT_ERROR;
1285                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1286                      DBG_FORMAT_ERROR
1287                      return NULL;
1288                    }
1289                    newStrength = src->parsedToken.strength;
1290                    src->inRange = TRUE;
1291                    break;
1292
1293                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1294                     do {
1295                         ch = *(++(src->current));
1296                     } while (!isCharNewLine(ch));
1297
1298                     break;
1299                 default:
1300                     if (newStrength == UCOL_TOK_UNSET) {
1301                       *status = U_INVALID_FORMAT_ERROR;
1302                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1303                       DBG_FORMAT_ERROR
1304                       return NULL;
1305                     }
1306
1307                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1308                         *status = U_INVALID_FORMAT_ERROR;
1309                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1310                         DBG_FORMAT_ERROR
1311                         return NULL;
1312                     }
1313
1314                     if(ch == 0x0000 && src->current+1 == src->end) {
1315                         break;
1316                     }
1317
1318                     if (inChars) {
1319                         if(src->parsedToken.charsLen == 0) {
1320                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1321                         }
1322                         src->parsedToken.charsLen++;
1323                     } else {
1324                         if(newExtensionLen == 0) {
1325                             extensionOffset = (uint32_t)(src->current - src->source);
1326                         }
1327                         newExtensionLen++;
1328                     }
1329
1330                     break;
1331                 }
1332             }
1333         }
1334
1335         if(wasInQuote) {
1336             if(ch != 0x27) {
1337                 if(inQuote || !PatternProps::isWhiteSpace(ch)) {
1338                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1339                 }
1340             }
1341         }
1342
1343         src->current++;
1344     }
1345
1346 EndOfLoop:
1347     wasInQuote = FALSE;
1348     if (newStrength == UCOL_TOK_UNSET) {
1349         return NULL;
1350     }
1351
1352     if (src->parsedToken.charsLen == 0 && top == FALSE) {
1353         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1354         *status = U_INVALID_FORMAT_ERROR;
1355         DBG_FORMAT_ERROR
1356         return NULL;
1357     }
1358
1359     src->parsedToken.strength = newStrength;
1360     src->parsedToken.extensionOffset = extensionOffset;
1361     src->parsedToken.extensionLen = newExtensionLen;
1362     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1363
1364     return src->current;
1365 }
1366
1367 /*
1368  * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
1369  * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
1370  *
1371  * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
1372  *  1) ucol_tok_parseNextTokenInternal() returns a range as a single token.  This function separates
1373  *     it to separate tokens and returns one by one.  In order to do that, the necessary states are
1374  *     cached as member variables of the token parser.
1375  *  2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
1376  *     starting character as a single list token (which is separated into individual characters here)
1377  *     and as another list token starting with the last character in the range.  Before expanding it
1378  *     as a list of tokens, this function expands the range by filling the intermediate characters and
1379  *     returns them one by one as separate tokens.
1380  * Necessary checks are done for invalid combinations.
1381  */
1382 U_CAPI const UChar* U_EXPORT2
1383 ucol_tok_parseNextToken(UColTokenParser *src,
1384                         UBool startOfRules,
1385                         UParseError *parseError,
1386                         UErrorCode *status)
1387 {
1388   const UChar *nextToken;
1389
1390   if (src->inRange) {
1391     // We are not done processing a range.  Continue it.
1392     return ucol_tok_processNextCodePointInRange(src, status);
1393   } else if (src->isStarred) {
1394     // We are not done processing a starred token.  Continue it.
1395     return ucol_tok_processNextTokenInStarredList(src);
1396   }
1397
1398   // Get the next token.
1399   nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
1400
1401   if (nextToken == NULL) {
1402     return NULL;
1403   }
1404
1405   if (src->inRange) {
1406     // A new range has started.
1407     // Check whether it is a chain of ranges with more than one hyphen.
1408     if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
1409         *status = U_INVALID_FORMAT_ERROR;
1410         syntaxError(src->source,src->parsedToken.charsOffset-1,
1411                     src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
1412         DBG_FORMAT_ERROR
1413         return NULL;
1414     }
1415
1416     // The current token indicates the second code point of the range.
1417     // Process just that, and then proceed with the star.
1418     src->currentStarredCharIndex = src->parsedToken.charsOffset;
1419     U16_NEXT(src->source, src->currentStarredCharIndex,
1420              (uint32_t)(src->end - src->source), src->lastRangeCp);
1421     if (src->lastRangeCp <= src->previousCp) {
1422         *status = U_INVALID_FORMAT_ERROR;
1423         syntaxError(src->source,src->parsedToken.charsOffset-1,
1424                     src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1425         DBG_FORMAT_ERROR
1426         return NULL;
1427     }
1428
1429     // Set current range code point to process the range loop
1430     src->currentRangeCp = src->previousCp + 1;
1431
1432     src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1433
1434     return ucol_tok_processNextCodePointInRange(src, status);
1435  } else if (src->isStarred) {
1436     // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
1437     // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
1438     // separated into several tokens and returned.
1439     src->currentStarredCharIndex = src->parsedToken.charsOffset;
1440     src->lastStarredCharIndex =  src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1441
1442     return ucol_tok_processNextTokenInStarredList(src);
1443   } else {
1444     // Set previous codepoint
1445     U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
1446   }
1447   return nextToken;
1448 }
1449
1450
1451 /*
1452 Processing Description
1453 1 Build a ListList. Each list has a header, which contains two lists (positive
1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1455 reset may be null.
1456 2 As you process, you keep a LAST pointer that points to the last token you
1457 handled.
1458
1459 */
1460
1461 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
1462                                       UParseError *parseError, UErrorCode *status)
1463 {
1464     if(src->resultLen == src->listCapacity) {
1465         // Unfortunately, this won't work, as we store addresses of lhs in token
1466         src->listCapacity *= 2;
1467         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1468         if(src->lh == NULL) {
1469             *status = U_MEMORY_ALLOCATION_ERROR;
1470             return NULL;
1471         }
1472     }
1473     /* do the reset thing */
1474     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1475     /* test for NULL */
1476     if (sourceToken == NULL) {
1477         *status = U_MEMORY_ALLOCATION_ERROR;
1478         return NULL;
1479     }
1480     sourceToken->rulesToParseHdl = &(src->source);
1481     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1482     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1483
1484     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1485     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1486
1487     // keep the flags around so that we know about before
1488     sourceToken->flags = src->parsedToken.flags;
1489
1490     if(src->parsedToken.prefixOffset != 0) {
1491         // this is a syntax error
1492         *status = U_INVALID_FORMAT_ERROR;
1493         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1494         DBG_FORMAT_ERROR
1495         uprv_free(sourceToken);
1496         return 0;
1497     } else {
1498         sourceToken->prefix = 0;
1499     }
1500
1501     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1502     sourceToken->strength = UCOL_TOK_RESET;
1503     sourceToken->next = NULL;
1504     sourceToken->previous = NULL;
1505     sourceToken->noOfCEs = 0;
1506     sourceToken->noOfExpCEs = 0;
1507     sourceToken->listHeader = &src->lh[src->resultLen];
1508
1509     src->lh[src->resultLen].first = NULL;
1510     src->lh[src->resultLen].last = NULL;
1511     src->lh[src->resultLen].first = NULL;
1512     src->lh[src->resultLen].last = NULL;
1513
1514     src->lh[src->resultLen].reset = sourceToken;
1515
1516     /*
1517     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1518     First convert all expansions into normal form. Examples:
1519     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1520     d * ... into &x * c/y * d * ...
1521     Note: reset values can never have expansions, although they can cause the
1522     very next item to have one. They may be contractions, if they are found
1523     earlier in the list.
1524     */
1525     *expandNext = 0;
1526     if(expand != NULL) {
1527         /* check to see if there is an expansion */
1528         if(src->parsedToken.charsLen > 1) {
1529             uint32_t resetCharsOffset;
1530             resetCharsOffset = (uint32_t)(expand - src->source);
1531             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1532             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1533         }
1534     }
1535
1536     src->resultLen++;
1537
1538     uhash_put(src->tailored, sourceToken, sourceToken, status);
1539
1540     return sourceToken;
1541 }
1542
1543 static
1544 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1545     if(U_FAILURE(*status)) {
1546         return NULL;
1547     }
1548     /* this is a virgin before - we need to fish the anchor from the UCA */
1549     collIterate s;
1550     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1551     uint32_t CE, SecondCE;
1552     // uint32_t invPos;
1553     if(sourceToken != NULL) {
1554         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
1555     } else {
1556         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
1557     }
1558     if(U_FAILURE(*status)) {
1559         return NULL;
1560     }
1561
1562     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1563     baseContCE = ucol_getNextCE(src->UCA, &s, status);
1564     if(baseContCE == UCOL_NO_MORE_CES) {
1565         baseContCE = 0;
1566     }
1567
1568
1569     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1570     uint32_t ch = 0;
1571     uint32_t expandNext = 0;
1572     UColToken key;
1573
1574     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1575         uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
1576         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1577         ch = uprv_uca_getCodePointFromRaw(raw-1);
1578         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1579         CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
1580         SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
1581
1582         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1583         *src->extraCurrent++ = 0xFFFE;
1584         *src->extraCurrent++ = (UChar)ch;
1585         src->parsedToken.charsLen++;
1586
1587         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1588         key.rulesToParseHdl = &(src->source);
1589
1590         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1591         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1592
1593         if(sourceToken == NULL) {
1594             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1595             if(isContinuation(SecondCE)) {
1596                 src->lh[src->resultLen].baseContCE = SecondCE;
1597             } else {
1598                 src->lh[src->resultLen].baseContCE = 0;
1599             }
1600             src->lh[src->resultLen].nextCE = 0;
1601             src->lh[src->resultLen].nextContCE = 0;
1602             src->lh[src->resultLen].previousCE = 0;
1603             src->lh[src->resultLen].previousContCE = 0;
1604
1605             src->lh[src->resultLen].indirect = FALSE;
1606
1607             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1608         }
1609
1610     } else {
1611         /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1612
1613         // we got the previous CE. Now we need to see if the difference between
1614         // the two CEs is really of the requested strength.
1615         // if it's a bigger difference (we asked for secondary and got primary), we
1616         // need to modify the CE.
1617         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1618             // adjust the strength
1619             // now we are in the situation where our baseCE should actually be modified in
1620             // order to get the CE in the right position.
1621             if(strength == UCOL_SECONDARY) {
1622                 CE = baseCE - 0x0200;
1623             } else { // strength == UCOL_TERTIARY
1624                 CE = baseCE - 0x02;
1625             }
1626             if(baseContCE) {
1627                 if(strength == UCOL_SECONDARY) {
1628                     SecondCE = baseContCE - 0x0200;
1629                 } else { // strength == UCOL_TERTIARY
1630                     SecondCE = baseContCE - 0x02;
1631                 }
1632             }
1633         }
1634
1635 #if 0
1636         // the code below relies on getting a code point from the inverse table, in order to be
1637         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1638         // 1. There are many code points that have the same CE
1639         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1640         // Also, in case when there is no equivalent strength before an element, we have to actually
1641         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1642         // before a is a primary difference.
1643
1644         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1645
1646
1647         ch = CETable[3*invPos+2];
1648
1649         if((ch &  UCOL_INV_SIZEMASK) != 0) {
1650             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1651             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1652             ch = conts[offset];
1653         }
1654
1655         *src->extraCurrent++ = (UChar)ch;
1656         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1657         src->parsedToken.charsLen = 1;
1658
1659         // We got an UCA before. However, this might have been tailored.
1660         // example:
1661         // &\u30ca = \u306a
1662         // &[before 3]\u306a<<<\u306a|\u309d
1663
1664
1665         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1666         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1667         key.rulesToParseHdl = &(src->source);
1668
1669         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1670         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1671 #endif
1672
1673         // here is how it should be. The situation such as &[before 1]a < x, should be
1674         // resolved exactly as if we wrote &a > x.
1675         // therefore, I don't really care if the UCA value before a has been changed.
1676         // However, I do care if the strength between my element and the previous element
1677         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1678         // have to construct the base CE.
1679
1680
1681
1682         // if we found a tailored thing, we have to use the UCA value and construct
1683         // a new reset token with constructed name
1684         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1685         // character to which we want to anchor is already tailored.
1686         // We need to construct a new token which will be the anchor
1687         // point
1688         //*(src->extraCurrent-1) = 0xFFFE;
1689         //*src->extraCurrent++ = (UChar)ch;
1690         // grab before
1691         src->parsedToken.charsOffset -= 10;
1692         src->parsedToken.charsLen += 10;
1693         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1694         if(isContinuation(SecondCE)) {
1695             src->lh[src->resultLen].baseContCE = SecondCE;
1696         } else {
1697             src->lh[src->resultLen].baseContCE = 0;
1698         }
1699         src->lh[src->resultLen].nextCE = 0;
1700         src->lh[src->resultLen].nextContCE = 0;
1701         src->lh[src->resultLen].previousCE = 0;
1702         src->lh[src->resultLen].previousContCE = 0;
1703
1704         src->lh[src->resultLen].indirect = FALSE;
1705
1706         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1707         //}
1708     }
1709
1710     return sourceToken;
1711
1712 }
1713
1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1715     UColToken *lastToken = NULL;
1716     const UChar *parseEnd = NULL;
1717     uint32_t expandNext = 0;
1718     UBool variableTop = FALSE;
1719     UBool top = FALSE;
1720     uint16_t specs = 0;
1721     UColTokListHeader *ListList = NULL;
1722
1723     src->parsedToken.strength = UCOL_TOK_UNSET;
1724
1725     ListList = src->lh;
1726
1727     if(U_FAILURE(*status)) {
1728         return 0;
1729     }
1730 #ifdef DEBUG_FOR_CODE_POINTS
1731     char filename[35];
1732     sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
1733     dfcp_fp = fopen(filename, "a");
1734     fprintf(stdout, "Output is in the file %s.\n", filename);
1735 #endif
1736
1737 #ifdef DEBUG_FOR_COLL_RULES
1738     std::string s3;
1739     UnicodeString(src->source).toUTF8String(s3);
1740     std::cout << "src->source = " << s3 << std::endl;
1741 #endif
1742
1743     while(src->current < src->end || src->isStarred) {
1744         src->parsedToken.prefixOffset = 0;
1745
1746         parseEnd = ucol_tok_parseNextToken(src,
1747             (UBool)(lastToken == NULL),
1748             parseError,
1749             status);
1750
1751         specs = src->parsedToken.flags;
1752
1753
1754         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1755         top = ((specs & UCOL_TOK_TOP) != 0);
1756
1757         if(U_SUCCESS(*status) && parseEnd != NULL) {
1758             UColToken *sourceToken = NULL;
1759             //uint32_t key = 0;
1760             uint32_t lastStrength = UCOL_TOK_UNSET;
1761
1762             if(lastToken != NULL ) {
1763                 lastStrength = lastToken->strength;
1764             }
1765
1766 #ifdef DEBUG_FOR_CODE_POINTS
1767             UChar32 cp;
1768             U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
1769             fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
1770 #endif
1771             //key = newCharsLen << 24 | charsOffset;
1772             UColToken key;
1773             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1774             key.rulesToParseHdl = &(src->source);
1775
1776             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
1777             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1778
1779             if(src->parsedToken.strength != UCOL_TOK_RESET) {
1780                 if(lastToken == NULL) { /* this means that rules haven't started properly */
1781                     *status = U_INVALID_FORMAT_ERROR;
1782                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1783                     DBG_FORMAT_ERROR
1784                     return 0;
1785                 }
1786                 /*  6 Otherwise (when relation != reset) */
1787                 if(sourceToken == NULL) {
1788                     /* If sourceToken is null, create new one, */
1789                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1790                     /* test for NULL */
1791                     if (sourceToken == NULL) {
1792                         *status = U_MEMORY_ALLOCATION_ERROR;
1793                         return 0;
1794                     }
1795                     sourceToken->rulesToParseHdl = &(src->source);
1796                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1797
1798                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1799
1800                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1801                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1802
1803                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1804                     sourceToken->next = NULL;
1805                     sourceToken->previous = NULL;
1806                     sourceToken->noOfCEs = 0;
1807                     sourceToken->noOfExpCEs = 0;
1808                     // keep the flags around so that we know about before
1809                     sourceToken->flags = src->parsedToken.flags;
1810                     uhash_put(src->tailored, sourceToken, sourceToken, status);
1811                     if(U_FAILURE(*status)) {
1812                         return 0;
1813                     }
1814                 } else {
1815                     /* we could have fished out a reset here */
1816                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1817                         /* otherwise remove sourceToken from where it was. */
1818                         if(sourceToken->next != NULL) {
1819                             if(sourceToken->next->strength > sourceToken->strength) {
1820                                 sourceToken->next->strength = sourceToken->strength;
1821                             }
1822                             sourceToken->next->previous = sourceToken->previous;
1823                         } else {
1824                             sourceToken->listHeader->last = sourceToken->previous;
1825                         }
1826
1827                         if(sourceToken->previous != NULL) {
1828                             sourceToken->previous->next = sourceToken->next;
1829                         } else {
1830                             sourceToken->listHeader->first = sourceToken->next;
1831                         }
1832                         sourceToken->next = NULL;
1833                         sourceToken->previous = NULL;
1834                     }
1835                 }
1836
1837                 sourceToken->strength = src->parsedToken.strength;
1838                 sourceToken->listHeader = lastToken->listHeader;
1839
1840                 /*
1841                 1.  Find the strongest strength in each list, and set strongestP and strongestN
1842                 accordingly in the headers.
1843                 */
1844                 if(lastStrength == UCOL_TOK_RESET
1845                     || sourceToken->listHeader->first == 0) {
1846                         /* If LAST is a reset
1847                         insert sourceToken in the list. */
1848                         if(sourceToken->listHeader->first == 0) {
1849                             sourceToken->listHeader->first = sourceToken;
1850                             sourceToken->listHeader->last = sourceToken;
1851                         } else { /* we need to find a place for us */
1852                             /* and we'll get in front of the same strength */
1853                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1854                                 sourceToken->next = sourceToken->listHeader->first;
1855                                 sourceToken->next->previous = sourceToken;
1856                                 sourceToken->listHeader->first = sourceToken;
1857                                 sourceToken->previous = NULL;
1858                             } else {
1859                                 lastToken = sourceToken->listHeader->first;
1860                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1861                                     lastToken = lastToken->next;
1862                                 }
1863                                 if(lastToken->next != NULL) {
1864                                     lastToken->next->previous = sourceToken;
1865                                 } else {
1866                                     sourceToken->listHeader->last = sourceToken;
1867                                 }
1868                                 sourceToken->previous = lastToken;
1869                                 sourceToken->next = lastToken->next;
1870                                 lastToken->next = sourceToken;
1871                             }
1872                         }
1873                     } else {
1874                         /* Otherwise (when LAST is not a reset)
1875                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1876                         otherwise insert before.
1877                         when inserting after or before, search to the next position with the same
1878                         strength in that direction. (This is called postpone insertion).         */
1879                         if(sourceToken != lastToken) {
1880                             if(lastToken->polarity == sourceToken->polarity) {
1881                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1882                                     lastToken = lastToken->next;
1883                                 }
1884                                 sourceToken->previous = lastToken;
1885                                 if(lastToken->next != NULL) {
1886                                     lastToken->next->previous = sourceToken;
1887                                 } else {
1888                                     sourceToken->listHeader->last = sourceToken;
1889                                 }
1890
1891                                 sourceToken->next = lastToken->next;
1892                                 lastToken->next = sourceToken;
1893                             } else {
1894                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1895                                     lastToken = lastToken->previous;
1896                                 }
1897                                 sourceToken->next = lastToken;
1898                                 if(lastToken->previous != NULL) {
1899                                     lastToken->previous->next = sourceToken;
1900                                 } else {
1901                                     sourceToken->listHeader->first = sourceToken;
1902                                 }
1903                                 sourceToken->previous = lastToken->previous;
1904                                 lastToken->previous = sourceToken;
1905                             }
1906                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
1907                             if(lastStrength < sourceToken->strength) {
1908                                 sourceToken->strength = lastStrength;
1909                             }
1910                         }
1911                     }
1912
1913                     /* if the token was a variable top, we're gonna put it in */
1914                     if(variableTop == TRUE && src->varTop == NULL) {
1915                         variableTop = FALSE;
1916                         src->varTop = sourceToken;
1917                     }
1918
1919                     // Treat the expansions.
1920                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1921                     // (&abc * d * e <=> &ab * d / c * e / c)
1922                     // if both of them are in effect for a token, they are combined.
1923
1924                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1925
1926                     if(expandNext != 0) {
1927                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1928                             expandNext = 0;
1929                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1930                             sourceToken->expansion = expandNext;
1931                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
1932                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1933                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1934                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1935                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1936                         }
1937                     }
1938
1939                     // This is just for debugging purposes
1940                     if(sourceToken->expansion != 0) {
1941                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1942                     } else {
1943                         sourceToken->debugExpansion = 0;
1944                     }
1945                     // if the previous token was a reset before, the strength of this
1946                     // token must match the strength of before. Otherwise we have an
1947                     // undefined situation.
1948                     // In other words, we currently have a cludge which we use to
1949                     // represent &a >> x. This is written as &[before 2]a << x.
1950                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1951                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1952                         if(beforeStrength != sourceToken->strength) {
1953                             *status = U_INVALID_FORMAT_ERROR;
1954                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1955                             DBG_FORMAT_ERROR
1956                             return 0;
1957                         }
1958                     }
1959             } else {
1960                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1961                     /* if the previous token was also a reset, */
1962                     /*this means that we have two consecutive resets */
1963                     /* and we want to remove the previous one if empty*/
1964                     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1965                         src->resultLen--;
1966                     }
1967                 }
1968
1969                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1970                     uint32_t searchCharsLen = src->parsedToken.charsLen;
1971                     while(searchCharsLen > 1 && sourceToken == NULL) {
1972                         searchCharsLen--;
1973                         //key = searchCharsLen << 24 | charsOffset;
1974                         UColToken key;
1975                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1976                         key.rulesToParseHdl = &(src->source);
1977                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1978                     }
1979                     if(sourceToken != NULL) {
1980                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1981                     }
1982                 }
1983
1984                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1985                     if(top == FALSE) { /* there is no indirection */
1986                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1987                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1988                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1989                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1990                                 sourceToken = sourceToken->previous;
1991                             }
1992                             /* here, either we hit the strength or NULL */
1993                             if(sourceToken->strength == strength) {
1994                                 if(sourceToken->previous != NULL) {
1995                                     sourceToken = sourceToken->previous;
1996                                 } else { /* start of list */
1997                                     sourceToken = sourceToken->listHeader->reset;
1998                                 }
1999                             } else { /* we hit NULL */
2000                                 /* we should be doing the else part */
2001                                 sourceToken = sourceToken->listHeader->reset;
2002                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2003                             }
2004                         } else {
2005                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2006                         }
2007                     } else { /* this is both before and indirection */
2008                         top = FALSE;
2009                         ListList[src->resultLen].previousCE = 0;
2010                         ListList[src->resultLen].previousContCE = 0;
2011                         ListList[src->resultLen].indirect = TRUE;
2012                         /* we need to do slightly more work. we need to get the baseCE using the */
2013                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
2014                         /* in ucol_bld */
2015                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
2016                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2017                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
2018                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2019
2020                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2021                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
2022                            (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
2023                             uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
2024                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
2025                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
2026                             CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
2027                             SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
2028                         } else {
2029                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
2030                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
2031                         }
2032
2033                         ListList[src->resultLen].baseCE = CE;
2034                         ListList[src->resultLen].baseContCE = SecondCE;
2035                         ListList[src->resultLen].nextCE = 0;
2036                         ListList[src->resultLen].nextContCE = 0;
2037
2038                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2039                     }
2040                 }
2041
2042
2043                 /*  5 If the relation is a reset:
2044                 If sourceToken is null
2045                 Create new list, create new sourceToken, make the baseCE from source, put
2046                 the sourceToken in ListHeader of the new list */
2047                 if(sourceToken == NULL) {
2048                     /*
2049                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
2050                     First convert all expansions into normal form. Examples:
2051                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
2052                     d * ... into &x * c/y * d * ...
2053                     Note: reset values can never have expansions, although they can cause the
2054                     very next item to have one. They may be contractions, if they are found
2055                     earlier in the list.
2056                     */
2057                     if(top == FALSE) {
2058                         collIterate s;
2059                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2060
2061                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
2062
2063                         CE = ucol_getNextCE(src->UCA, &s, status);
2064                         const UChar *expand = s.pos;
2065                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
2066
2067                         ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
2068                         if(isContinuation(SecondCE)) {
2069                             ListList[src->resultLen].baseContCE = SecondCE;
2070                         } else {
2071                             ListList[src->resultLen].baseContCE = 0;
2072                         }
2073                         ListList[src->resultLen].nextCE = 0;
2074                         ListList[src->resultLen].nextContCE = 0;
2075                         ListList[src->resultLen].previousCE = 0;
2076                         ListList[src->resultLen].previousContCE = 0;
2077                         ListList[src->resultLen].indirect = FALSE;
2078                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
2079                     } else { /* top == TRUE */
2080                         /* just use the supplied values */
2081                         top = FALSE;
2082                         ListList[src->resultLen].previousCE = 0;
2083                         ListList[src->resultLen].previousContCE = 0;
2084                         ListList[src->resultLen].indirect = TRUE;
2085                         ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2086                         ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
2087                         ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
2088                         ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
2089
2090                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2091
2092                     }
2093                 } else { /* reset to something already in rules */
2094                     top = FALSE;
2095                 }
2096             }
2097             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
2098             lastToken = sourceToken;
2099         } else {
2100             if(U_FAILURE(*status)) {
2101                 return 0;
2102             }
2103         }
2104     }
2105 #ifdef DEBUG_FOR_CODE_POINTS
2106     fclose(dfcp_fp);
2107 #endif
2108
2109
2110     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
2111         src->resultLen--;
2112     }
2113     return src->resultLen;
2114 }
2115
2116 const UChar* ucol_tok_getRulesFromBundle(
2117     void* /*context*/,
2118     const char* locale,
2119     const char* type,
2120     int32_t* pLength,
2121     UErrorCode* status)
2122 {
2123     const UChar* rules = NULL;
2124     UResourceBundle* bundle;
2125     UResourceBundle* collations;
2126     UResourceBundle* collation;
2127
2128     *pLength = 0;
2129
2130     bundle = ures_open(U_ICUDATA_COLL, locale, status);
2131     if(U_SUCCESS(*status)){
2132         collations = ures_getByKey(bundle, "collations", NULL, status);
2133         if(U_SUCCESS(*status)){
2134             collation = ures_getByKey(collations, type, NULL, status);
2135             if(U_SUCCESS(*status)){
2136                 rules = ures_getStringByKey(collation, "Sequence", pLength, status);
2137                 if(U_FAILURE(*status)){
2138                     *pLength = 0;
2139                     rules = NULL;
2140                 }
2141                 ures_close(collation);
2142             }
2143             ures_close(collations);
2144         }
2145     }
2146
2147     ures_close(bundle);
2148
2149     return rules;
2150 }
2151
2152 void ucol_tok_initTokenList(
2153     UColTokenParser *src,
2154     const UChar *rules,
2155     uint32_t rulesLength,
2156     const UCollator *UCA,
2157     GetCollationRulesFunction importFunc,
2158     void* context,
2159     UErrorCode *status) {
2160     U_NAMESPACE_USE
2161
2162     uint32_t nSize = 0;
2163     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
2164
2165     bool needToDeallocRules = false;
2166
2167     if(U_FAILURE(*status)) {
2168         return;
2169     }
2170
2171     // set everything to zero, so that we can clean up gracefully
2172     uprv_memset(src, 0, sizeof(UColTokenParser));
2173
2174     // first we need to find options that don't like to be normalized,
2175     // like copy and remove...
2176     //const UChar *openBrace = rules;
2177     int32_t optionNumber = -1;
2178     const UChar *setStart = NULL;
2179     uint32_t i = 0;
2180     while(i < rulesLength) {
2181         if(rules[i] == 0x005B) {    // '[': start of an option
2182             /* Gets the following:
2183                optionNumber: The index of the option.
2184                setStart: The pointer at which the option arguments start.
2185              */
2186             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
2187
2188             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
2189                 // [optimize]
2190                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2191                 if(U_SUCCESS(*status)) {
2192                     if(src->copySet == NULL) {
2193                         src->copySet = newSet;
2194                     } else {
2195                         uset_addAll(src->copySet, newSet);
2196                         uset_close(newSet);
2197                     }
2198                 } else {
2199                     return;
2200                 }
2201             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
2202                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2203                 if(U_SUCCESS(*status)) {
2204                     if(src->removeSet == NULL) {
2205                         src->removeSet = newSet;
2206                     } else {
2207                         uset_addAll(src->removeSet, newSet);
2208                         uset_close(newSet);
2209                     }
2210                 } else {
2211                     return;
2212                 }
2213             } else if(optionNumber == OPTION_IMPORT){
2214                 // [import <collation-name>]
2215
2216                 // Find the address of the closing ].
2217                 UChar* import_end = u_strchr(setStart, 0x005D);
2218                 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
2219                 // Ignore trailing whitespace.
2220                 while(PatternProps::isWhiteSpace(*(import_end-1))) {
2221                     --import_end;
2222                 }
2223
2224                 int32_t optionLength = (int32_t)(import_end - setStart);
2225                 char option[50];
2226                 if(optionLength >= (int32_t)sizeof(option)) {
2227                     *status = U_ILLEGAL_ARGUMENT_ERROR;
2228                     return;
2229                 }
2230                 u_UCharsToChars(setStart, option, optionLength);
2231                 option[optionLength] = 0;
2232
2233                 *status = U_ZERO_ERROR;
2234                 char locale[50];
2235                 int32_t templ;
2236                 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
2237                 if(U_FAILURE(*status)) {
2238                     *status = U_ILLEGAL_ARGUMENT_ERROR;
2239                     return;
2240                 }
2241
2242                 char type[50];
2243                 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
2244                     U_FAILURE(*status)
2245                 ) {
2246                     *status = U_ZERO_ERROR;
2247                     uprv_strcpy(type, "standard");
2248                 }
2249
2250                 // TODO: Use public functions when available, see ticket #8134.
2251                 char *keywords = (char *)locale_getKeywordsStart(locale);
2252                 if(keywords != NULL) {
2253                     *keywords = 0;
2254                 }
2255
2256                 int32_t importRulesLength = 0;
2257                 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
2258
2259 #ifdef DEBUG_FOR_COLL_RULES
2260                 std::string s;
2261                 UnicodeString(importRules).toUTF8String(s);
2262                 std::cout << "Import rules = " << s << std::endl;
2263 #endif
2264
2265                 // Add the length of the imported rules to length of the original rules,
2266                 // and subtract the length of the import option.
2267                 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
2268
2269                 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
2270
2271 #ifdef DEBUG_FOR_COLL_RULES
2272                 std::string s1;
2273                 UnicodeString(rules).toUTF8String(s1);
2274                 std::cout << "Original rules = " << s1 << std::endl;
2275 #endif
2276
2277
2278                 // Copy the section of the original rules leading up to the import
2279                 uprv_memcpy(newRules, rules, i*sizeof(UChar));
2280                 // Copy the imported rules
2281                 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
2282                 // Copy the rest of the original rules (minus the import option itself)
2283                 uprv_memcpy(newRules+i+importRulesLength,
2284                             rules+optionEndOffset,
2285                             (rulesLength-optionEndOffset)*sizeof(UChar));
2286
2287 #ifdef DEBUG_FOR_COLL_RULES
2288                 std::string s2;
2289                 UnicodeString(newRules).toUTF8String(s2);
2290                 std::cout << "Resulting rules = " << s2 << std::endl;
2291 #endif
2292
2293                 if(needToDeallocRules){
2294                     // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2295                     uprv_free((void*)rules);
2296                 }
2297                 needToDeallocRules = true;
2298                 rules = newRules;
2299                 rulesLength = newRulesLength;
2300
2301                 estimatedSize += importRulesLength*2;
2302
2303                 // First character of the new rules needs to be processed
2304                 i--;
2305             }
2306         }
2307         //openBrace++;
2308         i++;
2309     }
2310
2311     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
2312     /* test for NULL */
2313     if (src->source == NULL) {
2314         *status = U_MEMORY_ALLOCATION_ERROR;
2315         return;
2316     }
2317     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
2318     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
2319     if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
2320         *status = U_ZERO_ERROR;
2321         src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
2322         /* test for NULL */
2323         if (src->source == NULL) {
2324             *status = U_MEMORY_ALLOCATION_ERROR;
2325             return;
2326         }
2327         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
2328     }
2329     if(needToDeallocRules){
2330         // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2331         uprv_free((void*)rules);
2332     }
2333
2334
2335     src->current = src->source;
2336     src->end = src->source+nSize;
2337     src->sourceCurrent = src->source;
2338     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
2339     src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2340     src->varTop = NULL;
2341     src->UCA = UCA;
2342     src->invUCA = ucol_initInverseUCA(status);
2343     src->parsedToken.charsLen = 0;
2344     src->parsedToken.charsOffset = 0;
2345     src->parsedToken.extensionLen = 0;
2346     src->parsedToken.extensionOffset = 0;
2347     src->parsedToken.prefixLen = 0;
2348     src->parsedToken.prefixOffset = 0;
2349     src->parsedToken.flags = 0;
2350     src->parsedToken.strength = UCOL_TOK_UNSET;
2351     src->buildCCTabFlag = FALSE;
2352     src->isStarred = FALSE;
2353     src->inRange = FALSE;
2354     src->lastRangeCp = 0;
2355     src->previousCp = 0;
2356
2357     if(U_FAILURE(*status)) {
2358         return;
2359     }
2360     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
2361     if(U_FAILURE(*status)) {
2362         return;
2363     }
2364     uhash_setValueDeleter(src->tailored, uprv_free);
2365
2366     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
2367     /* test for NULL */
2368     if (src->opts == NULL) {
2369         *status = U_MEMORY_ALLOCATION_ERROR;
2370         return;
2371     }
2372
2373     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
2374
2375     src->lh = 0;
2376     src->listCapacity = 1024;
2377     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
2378     //Test for NULL
2379     if (src->lh == NULL) {
2380         *status = U_MEMORY_ALLOCATION_ERROR;
2381         return;
2382     }
2383     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
2384     src->resultLen = 0;
2385
2386     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2387
2388     // UCOL_RESET_TOP_VALUE
2389     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2390     // UCOL_FIRST_PRIMARY_IGNORABLE
2391     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
2392     // UCOL_LAST_PRIMARY_IGNORABLE
2393     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
2394     // UCOL_FIRST_SECONDARY_IGNORABLE
2395     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
2396     // UCOL_LAST_SECONDARY_IGNORABLE
2397     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
2398     // UCOL_FIRST_TERTIARY_IGNORABLE
2399     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
2400     // UCOL_LAST_TERTIARY_IGNORABLE
2401     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
2402     // UCOL_FIRST_VARIABLE
2403     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
2404     // UCOL_LAST_VARIABLE
2405     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
2406     // UCOL_FIRST_NON_VARIABLE
2407     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
2408     // UCOL_LAST_NON_VARIABLE
2409     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2410     // UCOL_FIRST_IMPLICIT
2411     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
2412     // UCOL_LAST_IMPLICIT
2413     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
2414     // UCOL_FIRST_TRAILING
2415     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
2416     // UCOL_LAST_TRAILING
2417     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
2418     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
2419 }
2420
2421
2422 void ucol_tok_closeTokenList(UColTokenParser *src) {
2423     if(src->copySet != NULL) {
2424         uset_close(src->copySet);
2425     }
2426     if(src->removeSet != NULL) {
2427         uset_close(src->removeSet);
2428     }
2429     if(src->tailored != NULL) {
2430         uhash_close(src->tailored);
2431     }
2432     if(src->lh != NULL) {
2433         uprv_free(src->lh);
2434     }
2435     if(src->source != NULL) {
2436         uprv_free(src->source);
2437     }
2438     if(src->opts != NULL) {
2439         uprv_free(src->opts);
2440     }
2441     if (src->reorderCodes != NULL) {
2442         uprv_free(src->reorderCodes);
2443     }
2444 }
2445
2446 #endif /* #if !UCONFIG_NO_COLLATION */