]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_tok.cpp
ICU-511.32.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_tok.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
4388f060 4* Copyright (C) 2001-2012, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_tok.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created 02/22/2001
14* created by: Vladimir Weinstein
15*
374ca955 16* This module reads a tailoring rule string and produces a list of
b75a7d8f 17* tokens that will be turned into collation elements
374ca955 18*
b75a7d8f
A
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
729e4ab9 25#include "unicode/uscript.h"
b75a7d8f
A
26#include "unicode/ustring.h"
27#include "unicode/uchar.h"
28#include "unicode/uniset.h"
374ca955 29
b75a7d8f 30#include "cmemory.h"
729e4ab9 31#include "cstring.h"
4388f060 32#include "patternprops.h"
729e4ab9
A
33#include "ucol_bld.h"
34#include "ucol_tok.h"
35#include "ulocimp.h"
36#include "uresimp.h"
b75a7d8f 37
729e4ab9
A
38// Define this only for debugging.
39// #define DEBUG_FOR_COLL_RULES 1
40
41#ifdef DEBUG_FOR_COLL_RULES
42#include <iostream>
43#endif
44
45U_NAMESPACE_USE
46
b75a7d8f 47U_CDECL_BEGIN
73c04bcf 48static int32_t U_CALLCONV
b75a7d8f
A
49uhash_hashTokens(const UHashTok k)
50{
51 int32_t hash = 0;
52 //uint32_t key = (uint32_t)k.integer;
53 UColToken *key = (UColToken *)k.pointer;
54 if (key != 0) {
b75a7d8f
A
55 int32_t len = (key->source & 0xFF000000)>>24;
56 int32_t inc = ((len - 32) / 32) + 1;
374ca955 57
729e4ab9 58 const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
374ca955 59 const UChar *limit = p + len;
b75a7d8f
A
60
61 while (p<limit) {
62 hash = (hash * 37) + *p;
63 p += inc;
64 }
65 }
66 return hash;
67}
68
73c04bcf 69static UBool U_CALLCONV
b75a7d8f
A
70uhash_compareTokens(const UHashTok key1, const UHashTok key2)
71{
72 //uint32_t p1 = (uint32_t) key1.integer;
73 //uint32_t p2 = (uint32_t) key2.integer;
74 UColToken *p1 = (UColToken *)key1.pointer;
75 UColToken *p2 = (UColToken *)key2.pointer;
729e4ab9
A
76 const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
77 const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
b75a7d8f
A
78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
80 const UChar *end = s1+s1L-1;
81
82 if (p1 == p2) {
83 return TRUE;
84 }
85 if (p1->source == 0 || p2->source == 0) {
86 return FALSE;
87 }
88 if(s1L != s2L) {
46f4442e 89 return FALSE;
b75a7d8f
A
90 }
91 if(p1->source == p2->source) {
46f4442e 92 return TRUE;
b75a7d8f
A
93 }
94 while((s1 < end) && *s1 == *s2) {
46f4442e
A
95 ++s1;
96 ++s2;
b75a7d8f
A
97 }
98 if(*s1 == *s2) {
46f4442e 99 return TRUE;
b75a7d8f 100 } else {
46f4442e 101 return FALSE;
b75a7d8f
A
102 }
103}
104U_CDECL_END
105
729e4ab9
A
106/*
107 * Debug messages used to pinpoint where a format error occurred.
108 * A better way is to include context-sensitive information in syntaxError() function.
109 *
110 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_FORMAT_ERROR
111 * on the compile line.
112 */
113/* #define DEBUG_FOR_FORMAT_ERROR 1 */
114
115#ifdef DEBUG_FOR_FORMAT_ERROR
116#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
117#else
118#define DBG_FORMAT_ERROR
119#endif
120
121
122/*
123 * Controls debug messages so that the output can be compared before and after a
124 * big change. Prints the information of every code point that comes out of the
125 * collation parser and its strength into a file. When a big change in format
126 * happens, the files before and after the change should be identical.
127 *
128 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_CODE_POINTS
129 * on the compile line.
130 */
131// #define DEBUG_FOR_CODE_POINTS 1
132
133#ifdef DEBUG_FOR_CODE_POINTS
134 FILE* dfcp_fp = NULL;
135#endif
136
137
b75a7d8f 138typedef struct {
46f4442e
A
139 uint32_t startCE;
140 uint32_t startContCE;
141 uint32_t limitCE;
142 uint32_t limitContCE;
b75a7d8f
A
143} indirectBoundaries;
144
145/* these values are used for finding CE values for indirect positioning. */
146/* Indirect positioning is a mechanism for allowing resets on symbolic */
147/* values. It only works for resets and you cannot tailor indirect names */
148/* An indirect name can define either an anchor point or a range. An */
149/* anchor point behaves in exactly the same way as a code point in reset */
150/* would, except that it cannot be tailored. A range (we currently only */
151/* know for the [top] range will explicitly set the upper bound for */
152/* generated CEs, thus allowing for better control over how many CEs can */
153/* be squeezed between in the range without performance penalty. */
154/* In that respect, we use [top] for tailoring of locales that use CJK */
155/* characters. Other indirect values are currently a pure convenience, */
156/* they can be used to assure that the CEs will be always positioned in */
157/* the same place relative to a point with known properties (e.g. first */
158/* primary ignorable). */
159static indirectBoundaries ucolIndirectBoundaries[15];
160/*
161static indirectBoundaries ucolIndirectBoundaries[11] = {
46f4442e
A
162{ UCOL_RESET_TOP_VALUE, 0,
163UCOL_NEXT_TOP_VALUE, 0 },
164{ UCOL_FIRST_PRIMARY_IGNORABLE, 0,
1650, 0 },
166{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
1670, 0 },
168{ UCOL_FIRST_SECONDARY_IGNORABLE, 0,
1690, 0 },
170{ UCOL_LAST_SECONDARY_IGNORABLE, 0,
1710, 0 },
172{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,
1730, 0 },
174{ UCOL_LAST_TERTIARY_IGNORABLE, 0,
1750, 0 },
176{ UCOL_FIRST_VARIABLE, 0,
1770, 0 },
178{ UCOL_LAST_VARIABLE, 0,
1790, 0 },
180{ UCOL_FIRST_NON_VARIABLE, 0,
1810, 0 },
182{ UCOL_LAST_NON_VARIABLE, 0,
1830, 0 },
b75a7d8f
A
184};
185*/
186
374ca955
A
187static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
188
46f4442e
A
189 // Set values for the top - TODO: once we have values for all the indirects, we are going
190 // to initalize here.
191 ucolIndirectBoundaries[indexR].startCE = start[0];
192 ucolIndirectBoundaries[indexR].startContCE = start[1];
193 if(end) {
194 ucolIndirectBoundaries[indexR].limitCE = end[0];
195 ucolIndirectBoundaries[indexR].limitContCE = end[1];
196 } else {
197 ucolIndirectBoundaries[indexR].limitCE = 0;
198 ucolIndirectBoundaries[indexR].limitContCE = 0;
199 }
b75a7d8f
A
200}
201
202
374ca955
A
203static inline
204void syntaxError(const UChar* rules,
b75a7d8f
A
205 int32_t pos,
206 int32_t rulesLen,
46f4442e
A
207 UParseError* parseError)
208{
b75a7d8f
A
209 parseError->offset = pos;
210 parseError->line = 0 ; /* we are not using line numbers */
374ca955 211
b75a7d8f 212 // for pre-context
46f4442e 213 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
b75a7d8f 214 int32_t stop = pos;
374ca955 215
b75a7d8f
A
216 u_memcpy(parseError->preContext,rules+start,stop-start);
217 //null terminate the buffer
218 parseError->preContext[stop-start] = 0;
374ca955 219
b75a7d8f
A
220 //for post-context
221 start = pos+1;
374ca955 222 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
46f4442e 223 rulesLen;
b75a7d8f 224
374ca955 225 if(start < stop) {
46f4442e
A
226 u_memcpy(parseError->postContext,rules+start,stop-start);
227 //null terminate the buffer
228 parseError->postContext[stop-start]= 0;
374ca955 229 } else {
46f4442e 230 parseError->postContext[0] = 0;
374ca955 231 }
b75a7d8f
A
232}
233
234static
235void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
46f4442e
A
236 switch(attrib) {
237 case UCOL_HIRAGANA_QUATERNARY_MODE:
238 opts->hiraganaQ = value;
239 break;
240 case UCOL_FRENCH_COLLATION:
241 opts->frenchCollation = value;
242 break;
243 case UCOL_ALTERNATE_HANDLING:
244 opts->alternateHandling = value;
245 break;
246 case UCOL_CASE_FIRST:
247 opts->caseFirst = value;
248 break;
249 case UCOL_CASE_LEVEL:
250 opts->caseLevel = value;
251 break;
252 case UCOL_NORMALIZATION_MODE:
253 opts->normalizationMode = value;
254 break;
255 case UCOL_STRENGTH:
256 opts->strength = value;
257 break;
258 case UCOL_NUMERIC_COLLATION:
259 opts->numericCollation = value;
260 break;
261 case UCOL_ATTRIBUTE_COUNT:
262 default:
263 break;
264 }
b75a7d8f
A
265}
266
729e4ab9 267#define UTOK_OPTION_COUNT 22
b75a7d8f
A
268
269static UBool didInit = FALSE;
270/* we can be strict, or we can be lenient */
271/* I'd surely be lenient with the option arguments */
272/* maybe even with options */
273U_STRING_DECL(suboption_00, "non-ignorable", 13);
274U_STRING_DECL(suboption_01, "shifted", 7);
275
276U_STRING_DECL(suboption_02, "lower", 5);
277U_STRING_DECL(suboption_03, "upper", 5);
278U_STRING_DECL(suboption_04, "off", 3);
279U_STRING_DECL(suboption_05, "on", 2);
280U_STRING_DECL(suboption_06, "1", 1);
281U_STRING_DECL(suboption_07, "2", 1);
282U_STRING_DECL(suboption_08, "3", 1);
283U_STRING_DECL(suboption_09, "4", 1);
284U_STRING_DECL(suboption_10, "I", 1);
285
286U_STRING_DECL(suboption_11, "primary", 7);
287U_STRING_DECL(suboption_12, "secondary", 9);
288U_STRING_DECL(suboption_13, "tertiary", 8);
289U_STRING_DECL(suboption_14, "variable", 8);
290U_STRING_DECL(suboption_15, "regular", 7);
291U_STRING_DECL(suboption_16, "implicit", 8);
292U_STRING_DECL(suboption_17, "trailing", 8);
293
294
295U_STRING_DECL(option_00, "undefined", 9);
374ca955 296U_STRING_DECL(option_01, "rearrange", 9);
b75a7d8f 297U_STRING_DECL(option_02, "alternate", 9);
374ca955
A
298U_STRING_DECL(option_03, "backwards", 9);
299U_STRING_DECL(option_04, "variable top", 12);
300U_STRING_DECL(option_05, "top", 3);
301U_STRING_DECL(option_06, "normalization", 13);
302U_STRING_DECL(option_07, "caseLevel", 9);
303U_STRING_DECL(option_08, "caseFirst", 9);
304U_STRING_DECL(option_09, "scriptOrder", 11);
305U_STRING_DECL(option_10, "charsetname", 11);
306U_STRING_DECL(option_11, "charset", 7);
307U_STRING_DECL(option_12, "before", 6);
b75a7d8f
A
308U_STRING_DECL(option_13, "hiraganaQ", 9);
309U_STRING_DECL(option_14, "strength", 8);
310U_STRING_DECL(option_15, "first", 5);
311U_STRING_DECL(option_16, "last", 4);
312U_STRING_DECL(option_17, "optimize", 8);
313U_STRING_DECL(option_18, "suppressContractions", 20);
374ca955 314U_STRING_DECL(option_19, "numericOrdering", 15);
729e4ab9
A
315U_STRING_DECL(option_20, "import", 6);
316U_STRING_DECL(option_21, "reorder", 7);
b75a7d8f
A
317
318/*
374ca955
A
319[last variable] last variable value
320[last primary ignorable] largest CE for primary ignorable
321[last secondary ignorable] largest CE for secondary ignorable
322[last tertiary ignorable] largest CE for tertiary ignorable
323[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
b75a7d8f
A
324*/
325
326
327static const ucolTokSuboption alternateSub[2] = {
46f4442e
A
328 {suboption_00, 13, UCOL_NON_IGNORABLE},
329 {suboption_01, 7, UCOL_SHIFTED}
b75a7d8f
A
330};
331
332static const ucolTokSuboption caseFirstSub[3] = {
46f4442e
A
333 {suboption_02, 5, UCOL_LOWER_FIRST},
334 {suboption_03, 5, UCOL_UPPER_FIRST},
335 {suboption_04, 3, UCOL_OFF},
b75a7d8f
A
336};
337
338static const ucolTokSuboption onOffSub[2] = {
46f4442e
A
339 {suboption_04, 3, UCOL_OFF},
340 {suboption_05, 2, UCOL_ON}
b75a7d8f
A
341};
342
343static const ucolTokSuboption frenchSub[1] = {
46f4442e 344 {suboption_07, 1, UCOL_ON}
b75a7d8f
A
345};
346
347static const ucolTokSuboption beforeSub[3] = {
46f4442e
A
348 {suboption_06, 1, UCOL_PRIMARY},
349 {suboption_07, 1, UCOL_SECONDARY},
350 {suboption_08, 1, UCOL_TERTIARY}
b75a7d8f
A
351};
352
353static const ucolTokSuboption strengthSub[5] = {
46f4442e
A
354 {suboption_06, 1, UCOL_PRIMARY},
355 {suboption_07, 1, UCOL_SECONDARY},
356 {suboption_08, 1, UCOL_TERTIARY},
357 {suboption_09, 1, UCOL_QUATERNARY},
358 {suboption_10, 1, UCOL_IDENTICAL},
b75a7d8f
A
359};
360
361static const ucolTokSuboption firstLastSub[7] = {
46f4442e
A
362 {suboption_11, 7, UCOL_PRIMARY},
363 {suboption_12, 9, UCOL_PRIMARY},
364 {suboption_13, 8, UCOL_PRIMARY},
365 {suboption_14, 8, UCOL_PRIMARY},
366 {suboption_15, 7, UCOL_PRIMARY},
367 {suboption_16, 8, UCOL_PRIMARY},
368 {suboption_17, 8, UCOL_PRIMARY},
b75a7d8f
A
369};
370
/*
 * Indices into rulesOptions. The order of the enumerators must stay in step
 * with the order of the rulesOptions entries.
 */
enum OptionNumber {
    /* options that map directly onto a collator attribute */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,

    /* remaining options */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
};
396
397static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
46f4442e
A
398 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
399 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
400 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
401 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
402 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
403 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
404 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
405 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
406 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
407 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
408 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
409 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
410 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
411 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
412 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
413 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
414 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
415 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
416 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
729e4ab9
A
417 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */
418 /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */
419 /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */
b75a7d8f
A
420};
421
422static
374ca955 423int32_t u_strncmpNoCase(const UChar *s1,
46f4442e
A
424 const UChar *s2,
425 int32_t n)
b75a7d8f
A
426{
427 if(n > 0) {
428 int32_t rc;
429 for(;;) {
430 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
431 if(rc != 0 || *s1 == 0 || --n == 0) {
432 return rc;
433 }
434 ++s1;
435 ++s2;
436 }
437 }
438 return 0;
439}
440
441static
442void ucol_uprv_tok_initData() {
46f4442e
A
443 if(!didInit) {
444 U_STRING_INIT(suboption_00, "non-ignorable", 13);
445 U_STRING_INIT(suboption_01, "shifted", 7);
446
447 U_STRING_INIT(suboption_02, "lower", 5);
448 U_STRING_INIT(suboption_03, "upper", 5);
449 U_STRING_INIT(suboption_04, "off", 3);
450 U_STRING_INIT(suboption_05, "on", 2);
451
452 U_STRING_INIT(suboption_06, "1", 1);
453 U_STRING_INIT(suboption_07, "2", 1);
454 U_STRING_INIT(suboption_08, "3", 1);
455 U_STRING_INIT(suboption_09, "4", 1);
456 U_STRING_INIT(suboption_10, "I", 1);
457
458 U_STRING_INIT(suboption_11, "primary", 7);
459 U_STRING_INIT(suboption_12, "secondary", 9);
460 U_STRING_INIT(suboption_13, "tertiary", 8);
461 U_STRING_INIT(suboption_14, "variable", 8);
462 U_STRING_INIT(suboption_15, "regular", 7);
463 U_STRING_INIT(suboption_16, "implicit", 8);
464 U_STRING_INIT(suboption_17, "trailing", 8);
465
466
467 U_STRING_INIT(option_00, "undefined", 9);
468 U_STRING_INIT(option_01, "rearrange", 9);
469 U_STRING_INIT(option_02, "alternate", 9);
470 U_STRING_INIT(option_03, "backwards", 9);
471 U_STRING_INIT(option_04, "variable top", 12);
472 U_STRING_INIT(option_05, "top", 3);
473 U_STRING_INIT(option_06, "normalization", 13);
474 U_STRING_INIT(option_07, "caseLevel", 9);
475 U_STRING_INIT(option_08, "caseFirst", 9);
476 U_STRING_INIT(option_09, "scriptOrder", 11);
477 U_STRING_INIT(option_10, "charsetname", 11);
478 U_STRING_INIT(option_11, "charset", 7);
479 U_STRING_INIT(option_12, "before", 6);
480 U_STRING_INIT(option_13, "hiraganaQ", 9);
481 U_STRING_INIT(option_14, "strength", 8);
482 U_STRING_INIT(option_15, "first", 5);
483 U_STRING_INIT(option_16, "last", 4);
484 U_STRING_INIT(option_17, "optimize", 8);
485 U_STRING_INIT(option_18, "suppressContractions", 20);
486 U_STRING_INIT(option_19, "numericOrdering", 15);
729e4ab9
A
487 U_STRING_INIT(option_20, "import ", 6);
488 U_STRING_INIT(option_21, "reorder", 7);
46f4442e
A
489 didInit = TRUE;
490 }
b75a7d8f
A
491}
492
493
494// This function reads basic options to set in the runtime collator
495// used by data driven tests. Should not support build time options
496U_CAPI const UChar * U_EXPORT2
374ca955 497ucol_tok_getNextArgument(const UChar *start, const UChar *end,
46f4442e
A
498 UColAttribute *attrib, UColAttributeValue *value,
499 UErrorCode *status)
500{
501 uint32_t i = 0;
502 int32_t j=0;
503 UBool foundOption = FALSE;
504 const UChar *optionArg = NULL;
505
506 ucol_uprv_tok_initData();
507
4388f060 508 while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
46f4442e
A
509 start++;
510 }
511 if(start >= end) {
512 return NULL;
513 }
514 /* skip opening '[' */
515 if(*start == 0x005b) {
516 start++;
517 } else {
518 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
519 return NULL;
b75a7d8f 520 }
b75a7d8f 521
46f4442e
A
522 while(i < UTOK_OPTION_COUNT) {
523 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
524 foundOption = TRUE;
525 if(end - start > rulesOptions[i].optionLen) {
526 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
4388f060 527 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
46f4442e
A
528 optionArg++;
529 }
530 }
531 break;
b75a7d8f 532 }
46f4442e
A
533 i++;
534 }
535
536 if(!foundOption) {
537 *status = U_ILLEGAL_ARGUMENT_ERROR;
538 return NULL;
539 }
540
541 if(optionArg) {
542 for(j = 0; j<rulesOptions[i].subSize; j++) {
543 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
544 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
545 *attrib = rulesOptions[i].attr;
546 *value = rulesOptions[i].subopts[j].attrVal;
547 optionArg += rulesOptions[i].subopts[j].subLen;
4388f060 548 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
46f4442e
A
549 optionArg++;
550 }
551 if(*optionArg == 0x005d) {
552 optionArg++;
553 return optionArg;
554 } else {
555 *status = U_ILLEGAL_ARGUMENT_ERROR;
556 return NULL;
557 }
558 }
b75a7d8f 559 }
b75a7d8f 560 }
46f4442e
A
561 *status = U_ILLEGAL_ARGUMENT_ERROR;
562 return NULL;
b75a7d8f
A
563}
564
374ca955 565static
b75a7d8f 566USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
46f4442e
A
567 while(*start != 0x005b) { /* advance while we find the first '[' */
568 start++;
569 }
570 // now we need to get a balanced set of '[]'. The problem is that a set can have
571 // many, and *end point to the first closing '['
572 int32_t noOpenBraces = 1;
573 int32_t current = 1; // skip the opening brace
574 while(start+current < end && noOpenBraces != 0) {
575 if(start[current] == 0x005b) {
576 noOpenBraces++;
577 } else if(start[current] == 0x005D) { // closing brace
578 noOpenBraces--;
579 }
580 current++;
b75a7d8f 581 }
b75a7d8f 582
46f4442e
A
583 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
584 *status = U_ILLEGAL_ARGUMENT_ERROR;
585 return NULL;
586 }
587 return uset_openPattern(start, current, status);
b75a7d8f
A
588}
589
729e4ab9
A
590/**
591 * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
592 * @param start Pointer to the start UChar.
593 * @param end Pointer to the last valid pointer beyond which the option will not extend.
594 * @param optionArg Address of the pointer at which the options start (after the option name)
595 * @return The index of the option, or -1 if the option is not valid.
596 */
b75a7d8f
A
597static
598int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
46f4442e
A
599 int32_t i = 0;
600 ucol_uprv_tok_initData();
601
4388f060 602 while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
46f4442e
A
603 start++;
604 }
605 while(i < UTOK_OPTION_COUNT) {
606 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
607 if(end - start > rulesOptions[i].optionLen) {
729e4ab9 608 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
4388f060 609 while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */
46f4442e
A
610 (*optionArg)++;
611 }
612 }
613 break;
b75a7d8f 614 }
46f4442e
A
615 i++;
616 }
617 if(i == UTOK_OPTION_COUNT) {
618 i = -1; // didn't find an option
b75a7d8f 619 }
46f4442e 620 return i;
b75a7d8f
A
621}
622
623
729e4ab9
A
624static
625void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
626 int32_t codeCount = 0;
627 int32_t codeIndex = 0;
628 char conversion[64];
629 int32_t tokenLength = 0;
630 const UChar* space;
631
632 const UChar* current = src->current;
633 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
634
635 // eat leading whitespace
636 while(current < end && u_isWhitespace(*current)) {
637 current++;
638 }
639
640 while(current < end) {
641 space = u_memchr(current, 0x0020, end - current);
642 space = space == 0 ? end : space;
643 tokenLength = space - current;
644 if (tokenLength < 4) {
645 *status = U_INVALID_FORMAT_ERROR;
646 return;
647 }
648 codeCount++;
649 current += tokenLength;
650 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
651 ++current;
652 }
653 }
654
655 if (codeCount == 0) {
656 *status = U_INVALID_FORMAT_ERROR;
657 }
658
659 src->reorderCodesLength = codeCount;
660 src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
661 current = src->current;
662
663 // eat leading whitespace
664 while(current < end && u_isWhitespace(*current)) {
665 current++;
666 }
667
668 while(current < end) {
669 space = u_memchr(current, 0x0020, end - current);
670 space = space == 0 ? end : space;
671 tokenLength = space - current;
672 if (tokenLength < 4) {
673 *status = U_ILLEGAL_ARGUMENT_ERROR;
674 return;
675 } else {
676 u_UCharsToChars(current, conversion, tokenLength);
677 conversion[tokenLength] = '\0';
678 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
679 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
680 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
681 }
682 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
683 *status = U_ILLEGAL_ARGUMENT_ERROR;
684 }
685 }
686 codeIndex++;
687 current += tokenLength;
688 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
689 ++current;
690 }
691 }
692}
693
b75a7d8f
A
694// reads and conforms to various options in rules
695// end is the position of the first closing ']'
696// However, some of the options take an UnicodeSet definition
697// which needs to duplicate the closing ']'
698// for example: '[copy [\uAC00-\uD7FF]]'
374ca955 699// These options will move end to the second ']' and the
b75a7d8f
A
700// caller will set the current to it.
701static
702uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
46f4442e
A
703 const UChar* start = src->current;
704 int32_t i = 0;
705 int32_t j=0;
706 const UChar *optionArg = NULL;
b75a7d8f 707
46f4442e 708 uint8_t result = 0;
b75a7d8f 709
46f4442e
A
710 start++; /*skip opening '['*/
711 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
712 if(optionArg) {
713 src->current = optionArg;
714 }
b75a7d8f 715
46f4442e
A
716 if(i < 0) {
717 *status = U_ILLEGAL_ARGUMENT_ERROR;
718 } else {
719 int32_t noOpenBraces = 1;
720 switch(i) {
b75a7d8f
A
721 case OPTION_ALTERNATE_HANDLING:
722 case OPTION_FRENCH_COLLATION:
723 case OPTION_CASE_LEVEL:
724 case OPTION_CASE_FIRST:
725 case OPTION_NORMALIZATION_MODE:
726 case OPTION_HIRAGANA_QUATERNARY:
727 case OPTION_STRENGTH:
728 case OPTION_NUMERIC_COLLATION:
46f4442e
A
729 if(optionArg) {
730 for(j = 0; j<rulesOptions[i].subSize; j++) {
731 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
732 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
733 result = UCOL_TOK_SUCCESS;
734 }
735 }
b75a7d8f 736 }
46f4442e
A
737 if(result == 0) {
738 *status = U_ILLEGAL_ARGUMENT_ERROR;
739 }
740 break;
b75a7d8f 741 case OPTION_VARIABLE_TOP:
46f4442e
A
742 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
743 break;
b75a7d8f 744 case OPTION_REARRANGE:
46f4442e
A
745 result = UCOL_TOK_SUCCESS;
746 break;
b75a7d8f 747 case OPTION_BEFORE:
46f4442e
A
748 if(optionArg) {
749 for(j = 0; j<rulesOptions[i].subSize; j++) {
750 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
729e4ab9 751 result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
46f4442e
A
752 }
753 }
b75a7d8f 754 }
46f4442e
A
755 if(result == 0) {
756 *status = U_ILLEGAL_ARGUMENT_ERROR;
757 }
758 break;
b75a7d8f 759 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
46f4442e
A
760 /* index to this array will be src->parsedToken.indirectIndex*/
761 src->parsedToken.indirectIndex = 0;
762 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
763 break;
b75a7d8f
A
764 case OPTION_FIRST:
765 case OPTION_LAST: /* first, last */
46f4442e
A
766 for(j = 0; j<rulesOptions[i].subSize; j++) {
767 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
768 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
769 // element of indirect boundaries is reserved for top.
770 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
771 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
772 }
b75a7d8f 773 }
46f4442e
A
774 if(result == 0) {
775 *status = U_ILLEGAL_ARGUMENT_ERROR;
776 }
777 break;
b75a7d8f
A
778 case OPTION_OPTIMIZE:
779 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
46f4442e
A
780 // we need to move end here
781 src->current++; // skip opening brace
782 while(src->current < src->end && noOpenBraces != 0) {
783 if(*src->current == 0x005b) {
784 noOpenBraces++;
785 } else if(*src->current == 0x005D) { // closing brace
786 noOpenBraces--;
787 }
788 src->current++;
b75a7d8f 789 }
46f4442e
A
790 result = UCOL_TOK_SUCCESS;
791 break;
729e4ab9
A
792 case OPTION_SCRIPTREORDER:
793 ucol_tok_parseScriptReorder(src, status);
794 break;
b75a7d8f 795 default:
46f4442e
A
796 *status = U_UNSUPPORTED_ERROR;
797 break;
798 }
b75a7d8f 799 }
729e4ab9 800 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
46f4442e 801 return result;
b75a7d8f
A
802}
803
374ca955
A
804
805inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
729e4ab9
A
806 if (stuff == NULL || len <= 0) {
807 return;
808 }
809 UnicodeString tempStuff(FALSE, stuff, len);
46f4442e 810 if(src->extraCurrent+len >= src->extraEnd) {
374ca955 811 /* reallocate */
729e4ab9
A
812 if (stuff >= src->source && stuff <= src->end) {
813 // Copy the "stuff" contents into tempStuff's own buffer.
814 // UnicodeString is copy-on-write.
815 if (len > 0) {
816 tempStuff.setCharAt(0, tempStuff[0]);
817 } else {
818 tempStuff.remove();
819 }
820 }
374ca955
A
821 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
822 if(newSrc != NULL) {
46f4442e
A
823 src->current = newSrc + (src->current - src->source);
824 src->extraCurrent = newSrc + (src->extraCurrent - src->source);
825 src->end = newSrc + (src->end - src->source);
826 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
827 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
828 src->source = newSrc;
374ca955 829 } else {
46f4442e 830 *status = U_MEMORY_ALLOCATION_ERROR;
729e4ab9 831 return;
374ca955 832 }
46f4442e
A
833 }
834 if(len == 1) {
729e4ab9 835 *src->extraCurrent++ = tempStuff[0];
46f4442e 836 } else {
729e4ab9 837 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
374ca955 838 src->extraCurrent += len;
46f4442e 839 }
374ca955
A
840}
841
842inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
46f4442e
A
843 /*
844 top = TRUE;
845 */
846 UChar buff[5];
847 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
848 buff[0] = 0xFFFE;
849 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
850 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
851 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
852 src->parsedToken.charsLen = 3;
853 ucol_tok_addToExtraCurrent(src, buff, 3, status);
854 } else {
855 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
856 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
857 src->parsedToken.charsLen = 5;
858 ucol_tok_addToExtraCurrent(src, buff, 5, status);
859 }
860 return TRUE;
b75a7d8f
A
861}
862
374ca955
A
863static UBool isCharNewLine(UChar c){
864 switch(c){
865 case 0x000A: /* LF */
866 case 0x000D: /* CR */
867 case 0x000C: /* FF */
868 case 0x0085: /* NEL */
869 case 0x2028: /* LS */
870 case 0x2029: /* PS */
871 return TRUE;
872 default:
873 return FALSE;
874 }
875}
876
729e4ab9
A
877/*
878 * This function is called several times when a range is processed. Each time, the next code point
879 * is processed.
880 * The following variables must be set before calling this function:
881 * src->currentRangeCp: The current code point to process.
882 * src->lastRangeCp: The last code point in the range.
883 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
884 */
885static const UChar*
886ucol_tok_processNextCodePointInRange(UColTokenParser *src,
887 UErrorCode *status)
888{
889 // Append current code point to source
890 UChar buff[U16_MAX_LENGTH];
891 uint32_t i = 0;
892
893 uint32_t nChars = U16_LENGTH(src->currentRangeCp);
894 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
895 src->parsedToken.charsLen = nChars;
896
897 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
898 ucol_tok_addToExtraCurrent(src, buff, nChars, status);
899
900 ++src->currentRangeCp;
901 if (src->currentRangeCp > src->lastRangeCp) {
902 src->inRange = FALSE;
903
904 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
905 src->isStarred = FALSE;
906 }
907 } else {
908 src->previousCp = src->currentRangeCp;
909 }
910 return src->current;
911}
912
913/*
914 * This function is called several times when a starred list is processed. Each time, the next code point
915 * in the list is processed.
916 * The following variables must be set before calling this function:
917 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point.
918 * src->lastStarredCharIndex: Index to the last character in the list.
919 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
920 */
921static const UChar*
922ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
923{
924 // Extract the characters corresponding to the next code point.
925 UChar32 cp;
926 src->parsedToken.charsOffset = src->currentStarredCharIndex;
927 int32_t prev = src->currentStarredCharIndex;
928 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
929 src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
930
931 // When we are done parsing the starred string, turn the flag off so that
932 // the normal processing is restored.
933 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
934 src->isStarred = FALSE;
935 }
936 src->previousCp = cp;
937 return src->current;
938}
939
940/*
941 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
942 *
943 * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
944 * # : Comment character
945 * & : Reset operator
946 * = : Equality
947 * < : Primary collation
948 * << : Secondary collation
949 * <<< : Tertiary collation
950 * ; : Secondary collation
951 * , : Tertiary collation
952 * / : Expansions
953 * | : Prefix
954 * - : Range
955
956 * ! : Java Thai modifier, ignored
957 * @ : French only
958
959 * [] : Options
960 * '' : Quotes
961 *
962 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz
963 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
964 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a",
965 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous
966 * character returned as cached so that the calling program can do further splitting.
967 */
968static const UChar*
969ucol_tok_parseNextTokenInternal(UColTokenParser *src,
970 UBool startOfRules,
971 UParseError *parseError,
972 UErrorCode *status)
46f4442e 973{
46f4442e
A
974 UBool variableTop = FALSE;
975 UBool top = FALSE;
976 UBool inChars = TRUE;
977 UBool inQuote = FALSE;
978 UBool wasInQuote = FALSE;
979 uint8_t before = 0;
980 UBool isEscaped = FALSE;
729e4ab9 981
46f4442e
A
982 // TODO: replace these variables with src->parsedToken counterparts
983 // no need to use them anymore since we have src->parsedToken.
984 // Ideally, token parser would be a nice class... Once, when I have
985 // more time (around 2020 probably).
986 uint32_t newExtensionLen = 0;
987 uint32_t extensionOffset = 0;
988 uint32_t newStrength = UCOL_TOK_UNSET;
989 UChar buff[10];
990
991 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
992 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
993 src->parsedToken.indirectIndex = 0;
994
995 while (src->current < src->end) {
996 UChar ch = *(src->current);
997
998 if (inQuote) {
999 if (ch == 0x0027/*'\''*/) {
1000 inQuote = FALSE;
1001 } else {
1002 if ((src->parsedToken.charsLen == 0) || inChars) {
1003 if(src->parsedToken.charsLen == 0) {
1004 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1005 }
1006 src->parsedToken.charsLen++;
1007 } else {
1008 if(newExtensionLen == 0) {
1009 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1010 }
1011 newExtensionLen++;
1012 }
b75a7d8f 1013 }
46f4442e
A
1014 }else if(isEscaped){
1015 isEscaped =FALSE;
1016 if (newStrength == UCOL_TOK_UNSET) {
1017 *status = U_INVALID_FORMAT_ERROR;
1018 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
729e4ab9 1019 DBG_FORMAT_ERROR
46f4442e
A
1020 return NULL;
1021 // enabling rules to start with non-tokens a < b
1022 // newStrength = UCOL_TOK_RESET;
b75a7d8f 1023 }
46f4442e
A
1024 if(ch != 0x0000 && src->current != src->end) {
1025 if (inChars) {
1026 if(src->parsedToken.charsLen == 0) {
1027 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1028 }
1029 src->parsedToken.charsLen++;
1030 } else {
1031 if(newExtensionLen == 0) {
1032 extensionOffset = (uint32_t)(src->current - src->source);
1033 }
1034 newExtensionLen++;
1035 }
b75a7d8f 1036 }
46f4442e 1037 }else {
4388f060 1038 if(!PatternProps::isWhiteSpace(ch)) {
46f4442e
A
1039 /* Sets the strength for this entry */
1040 switch (ch) {
1041 case 0x003D/*'='*/ :
1042 if (newStrength != UCOL_TOK_UNSET) {
1043 goto EndOfLoop;
1044 }
b75a7d8f 1045
46f4442e
A
1046 /* if we start with strength, we'll reset to top */
1047 if(startOfRules == TRUE) {
1048 src->parsedToken.indirectIndex = 5;
1049 top = ucol_tok_doSetTop(src, status);
1050 newStrength = UCOL_TOK_RESET;
1051 goto EndOfLoop;
1052 }
1053 newStrength = UCOL_IDENTICAL;
729e4ab9
A
1054 if(*(src->current+1) == 0x002A) {/*'*'*/
1055 src->current++;
1056 src->isStarred = TRUE;
1057 }
46f4442e 1058 break;
b75a7d8f 1059
46f4442e
A
1060 case 0x002C/*','*/:
1061 if (newStrength != UCOL_TOK_UNSET) {
1062 goto EndOfLoop;
1063 }
b75a7d8f 1064
46f4442e
A
1065 /* if we start with strength, we'll reset to top */
1066 if(startOfRules == TRUE) {
1067 src->parsedToken.indirectIndex = 5;
1068 top = ucol_tok_doSetTop(src, status);
1069 newStrength = UCOL_TOK_RESET;
1070 goto EndOfLoop;
1071 }
1072 newStrength = UCOL_TERTIARY;
1073 break;
b75a7d8f 1074
46f4442e
A
1075 case 0x003B/*';'*/:
1076 if (newStrength != UCOL_TOK_UNSET) {
1077 goto EndOfLoop;
1078 }
b75a7d8f 1079
46f4442e
A
1080 /* if we start with strength, we'll reset to top */
1081 if(startOfRules == TRUE) {
1082 src->parsedToken.indirectIndex = 5;
1083 top = ucol_tok_doSetTop(src, status);
1084 newStrength = UCOL_TOK_RESET;
1085 goto EndOfLoop;
b75a7d8f 1086 }
46f4442e
A
1087 newStrength = UCOL_SECONDARY;
1088 break;
b75a7d8f 1089
46f4442e
A
1090 case 0x003C/*'<'*/:
1091 if (newStrength != UCOL_TOK_UNSET) {
1092 goto EndOfLoop;
1093 }
b75a7d8f 1094
46f4442e
A
1095 /* if we start with strength, we'll reset to top */
1096 if(startOfRules == TRUE) {
1097 src->parsedToken.indirectIndex = 5;
1098 top = ucol_tok_doSetTop(src, status);
1099 newStrength = UCOL_TOK_RESET;
1100 goto EndOfLoop;
1101 }
1102 /* before this, do a scan to verify whether this is */
1103 /* another strength */
1104 if(*(src->current+1) == 0x003C) {
1105 src->current++;
1106 if(*(src->current+1) == 0x003C) {
1107 src->current++; /* three in a row! */
1108 newStrength = UCOL_TERTIARY;
1109 } else { /* two in a row */
1110 newStrength = UCOL_SECONDARY;
1111 }
1112 } else { /* just one */
1113 newStrength = UCOL_PRIMARY;
1114 }
729e4ab9
A
1115 if(*(src->current+1) == 0x002A) {/*'*'*/
1116 src->current++;
1117 src->isStarred = TRUE;
1118 }
46f4442e 1119 break;
b75a7d8f 1120
46f4442e
A
1121 case 0x0026/*'&'*/:
1122 if (newStrength != UCOL_TOK_UNSET) {
1123 /**/
1124 goto EndOfLoop;
1125 }
b75a7d8f 1126
46f4442e
A
1127 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
1128 break;
1129
1130 case 0x005b/*'['*/:
1131 /* options - read an option, analyze it */
1132 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
1133 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
1134 if(U_SUCCESS(*status)) {
1135 if(result & UCOL_TOK_TOP) {
1136 if(newStrength == UCOL_TOK_RESET) {
1137 top = ucol_tok_doSetTop(src, status);
1138 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
1139 src->parsedToken.charsLen+=2;
1140 buff[0] = 0x002d;
1141 buff[1] = before;
1142 ucol_tok_addToExtraCurrent(src, buff, 2, status);
1143 }
1144
1145 src->current++;
1146 goto EndOfLoop;
1147 } else {
1148 *status = U_INVALID_FORMAT_ERROR;
1149 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
729e4ab9 1150 DBG_FORMAT_ERROR
46f4442e
A
1151 }
1152 } else if(result & UCOL_TOK_VARIABLE_TOP) {
1153 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
1154 variableTop = TRUE;
1155 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1156 src->parsedToken.charsLen = 1;
1157 buff[0] = 0xFFFF;
1158 ucol_tok_addToExtraCurrent(src, buff, 1, status);
1159 src->current++;
1160 goto EndOfLoop;
1161 } else {
1162 *status = U_INVALID_FORMAT_ERROR;
1163 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
729e4ab9 1164 DBG_FORMAT_ERROR
46f4442e
A
1165 }
1166 } else if (result & UCOL_TOK_BEFORE){
1167 if(newStrength == UCOL_TOK_RESET) {
1168 before = result & UCOL_TOK_BEFORE;
1169 } else {
1170 *status = U_INVALID_FORMAT_ERROR;
1171 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
729e4ab9 1172 DBG_FORMAT_ERROR
46f4442e
A
1173 }
1174 }
1175 } else {
1176 *status = U_INVALID_FORMAT_ERROR;
1177 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
729e4ab9 1178 DBG_FORMAT_ERROR
46f4442e
A
1179 return NULL;
1180 }
1181 }
1182 break;
1183 case 0x0021/*! skip java thai modifier reordering*/:
1184 break;
1185 case 0x002F/*'/'*/:
1186 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
1187 inChars = FALSE; /* we're now processing expansion */
1188 break;
1189 case 0x005C /* back slash for escaped chars */:
1190 isEscaped = TRUE;
1191 break;
1192 /* found a quote, we're gonna start copying */
1193 case 0x0027/*'\''*/:
1194 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
729e4ab9
A
1195 *status = U_INVALID_FORMAT_ERROR;
1196 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1197 DBG_FORMAT_ERROR
1198 return NULL;
1199 // enabling rules to start with a non-token character a < b
1200 // newStrength = UCOL_TOK_RESET;
46f4442e 1201 }
b75a7d8f 1202
46f4442e
A
1203 inQuote = TRUE;
1204
1205 if(inChars) { /* we're doing characters */
1206 if(wasInQuote == FALSE) {
1207 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1208 }
1209 if (src->parsedToken.charsLen != 0) {
1210 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1211 }
1212 src->parsedToken.charsLen++;
1213 } else { /* we're doing an expansion */
1214 if(wasInQuote == FALSE) {
1215 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1216 }
1217 if (newExtensionLen != 0) {
1218 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
1219 }
1220 newExtensionLen++;
1221 }
b75a7d8f 1222
46f4442e 1223 wasInQuote = TRUE;
b75a7d8f 1224
46f4442e
A
1225 ch = *(++(src->current));
1226 if(ch == 0x0027) { /* copy the double quote */
1227 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1228 inQuote = FALSE;
1229 }
1230 break;
1231
1232 /* '@' is french only if the strength is not currently set */
1233 /* if it is, it's just a regular character in collation rules */
1234 case 0x0040/*'@'*/:
1235 if (newStrength == UCOL_TOK_UNSET) {
1236 src->opts->frenchCollation = UCOL_ON;
1237 break;
1238 }
b75a7d8f 1239
46f4442e
A
1240 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1241 // we want to store read characters to the prefix part and continue reading
1242 // the characters (proper way would be to restart reading the chars, but in
1243 // that case we would have to complicate the token hasher, which I do not
1244 // intend to play with. Instead, we will do prefixes when prefixes are due
1245 // (before adding the elements).
1246 src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1247 src->parsedToken.prefixLen = src->parsedToken.charsLen;
1248
1249 if(inChars) { /* we're doing characters */
1250 if(wasInQuote == FALSE) {
1251 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1252 }
1253 if (src->parsedToken.charsLen != 0) {
1254 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1255 }
1256 src->parsedToken.charsLen++;
1257 }
b75a7d8f 1258
46f4442e
A
1259 wasInQuote = TRUE;
1260
1261 do {
1262 ch = *(++(src->current));
1263 // skip whitespace between '|' and the character
4388f060 1264 } while (PatternProps::isWhiteSpace(ch));
46f4442e
A
1265 break;
1266
1267 //charsOffset = 0;
1268 //newCharsLen = 0;
1269 //break; // We want to store the whole prefix/character sequence. If we break
1270 // the '|' is going to get lost.
729e4ab9
A
1271
1272 case 0x002D /*-*/: /* A range. */
1273 if (newStrength != UCOL_TOK_UNSET) {
1274 // While processing the pending token, the isStarred field
1275 // is reset, so it needs to be saved for the next
1276 // invocation.
1277 src->savedIsStarred = src->isStarred;
1278 goto EndOfLoop;
1279 }
1280 src->isStarred = src->savedIsStarred;
1281
1282 // Ranges are valid only in starred tokens.
1283 if (!src->isStarred) {
1284 *status = U_INVALID_FORMAT_ERROR;
1285 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1286 DBG_FORMAT_ERROR
1287 return NULL;
1288 }
1289 newStrength = src->parsedToken.strength;
1290 src->inRange = TRUE;
1291 break;
1292
46f4442e
A
1293 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1294 do {
1295 ch = *(++(src->current));
1296 } while (!isCharNewLine(ch));
1297
1298 break;
1299 default:
1300 if (newStrength == UCOL_TOK_UNSET) {
729e4ab9
A
1301 *status = U_INVALID_FORMAT_ERROR;
1302 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1303 DBG_FORMAT_ERROR
1304 return NULL;
46f4442e 1305 }
374ca955 1306
46f4442e
A
1307 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1308 *status = U_INVALID_FORMAT_ERROR;
1309 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
729e4ab9 1310 DBG_FORMAT_ERROR
46f4442e
A
1311 return NULL;
1312 }
374ca955 1313
46f4442e
A
1314 if(ch == 0x0000 && src->current+1 == src->end) {
1315 break;
1316 }
b75a7d8f 1317
46f4442e
A
1318 if (inChars) {
1319 if(src->parsedToken.charsLen == 0) {
1320 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1321 }
1322 src->parsedToken.charsLen++;
1323 } else {
1324 if(newExtensionLen == 0) {
1325 extensionOffset = (uint32_t)(src->current - src->source);
1326 }
1327 newExtensionLen++;
1328 }
b75a7d8f 1329
46f4442e
A
1330 break;
1331 }
b75a7d8f 1332 }
46f4442e 1333 }
b75a7d8f 1334
46f4442e
A
1335 if(wasInQuote) {
1336 if(ch != 0x27) {
4388f060 1337 if(inQuote || !PatternProps::isWhiteSpace(ch)) {
46f4442e
A
1338 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1339 }
b75a7d8f 1340 }
46f4442e 1341 }
b75a7d8f 1342
46f4442e 1343 src->current++;
b75a7d8f
A
1344 }
1345
46f4442e
A
1346EndOfLoop:
1347 wasInQuote = FALSE;
1348 if (newStrength == UCOL_TOK_UNSET) {
1349 return NULL;
b75a7d8f
A
1350 }
1351
46f4442e
A
1352 if (src->parsedToken.charsLen == 0 && top == FALSE) {
1353 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1354 *status = U_INVALID_FORMAT_ERROR;
729e4ab9 1355 DBG_FORMAT_ERROR
46f4442e 1356 return NULL;
b75a7d8f
A
1357 }
1358
46f4442e
A
1359 src->parsedToken.strength = newStrength;
1360 src->parsedToken.extensionOffset = extensionOffset;
1361 src->parsedToken.extensionLen = newExtensionLen;
1362 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
b75a7d8f 1363
46f4442e 1364 return src->current;
b75a7d8f
A
1365}
1366
729e4ab9
A
1367/*
1368 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
1369 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
1370 *
1371 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
1372 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates
1373 * it to separate tokens and returns one by one. In order to do that, the necessary states are
1374 * cached as member variables of the token parser.
1375 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
1376 * starting character as a single list token (which is separated into individual characters here)
1377 * and as another list token starting with the last character in the range. Before expanding it
1378 * as a list of tokens, this function expands the range by filling the intermediate characters and
1379 * returns them one by one as separate tokens.
1380 * Necessary checks are done for invalid combinations.
1381 */
1382U_CAPI const UChar* U_EXPORT2
1383ucol_tok_parseNextToken(UColTokenParser *src,
1384 UBool startOfRules,
1385 UParseError *parseError,
1386 UErrorCode *status)
1387{
1388 const UChar *nextToken;
1389
1390 if (src->inRange) {
1391 // We are not done processing a range. Continue it.
1392 return ucol_tok_processNextCodePointInRange(src, status);
1393 } else if (src->isStarred) {
1394 // We are not done processing a starred token. Continue it.
1395 return ucol_tok_processNextTokenInStarredList(src);
1396 }
1397
1398 // Get the next token.
1399 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
1400
1401 if (nextToken == NULL) {
1402 return NULL;
1403 }
1404
1405 if (src->inRange) {
1406 // A new range has started.
1407 // Check whether it is a chain of ranges with more than one hyphen.
1408 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
1409 *status = U_INVALID_FORMAT_ERROR;
1410 syntaxError(src->source,src->parsedToken.charsOffset-1,
1411 src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
1412 DBG_FORMAT_ERROR
1413 return NULL;
1414 }
1415
1416 // The current token indicates the second code point of the range.
1417 // Process just that, and then proceed with the star.
1418 src->currentStarredCharIndex = src->parsedToken.charsOffset;
1419 U16_NEXT(src->source, src->currentStarredCharIndex,
1420 (uint32_t)(src->end - src->source), src->lastRangeCp);
1421 if (src->lastRangeCp <= src->previousCp) {
1422 *status = U_INVALID_FORMAT_ERROR;
1423 syntaxError(src->source,src->parsedToken.charsOffset-1,
1424 src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1425 DBG_FORMAT_ERROR
1426 return NULL;
1427 }
1428
1429 // Set current range code point to process the range loop
1430 src->currentRangeCp = src->previousCp + 1;
1431
1432 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1433
1434 return ucol_tok_processNextCodePointInRange(src, status);
1435 } else if (src->isStarred) {
1436 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
1437 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
1438 // separated into several tokens and returned.
1439 src->currentStarredCharIndex = src->parsedToken.charsOffset;
1440 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1441
1442 return ucol_tok_processNextTokenInStarredList(src);
1443 } else {
1444 // Set previous codepoint
1445 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
1446 }
1447 return nextToken;
1448}
1449
1450
b75a7d8f
A
1451/*
1452Processing Description
46f4442e
A
14531 Build a ListList. Each list has a header, which contains two lists (positive
1454and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1455reset may be null.
14562 As you process, you keep a LAST pointer that points to the last token you
1457handled.
729e4ab9 1458
b75a7d8f
A
1459*/
1460
729e4ab9 1461static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
46f4442e
A
1462 UParseError *parseError, UErrorCode *status)
1463{
1464 if(src->resultLen == src->listCapacity) {
1465 // Unfortunately, this won't work, as we store addresses of lhs in token
1466 src->listCapacity *= 2;
1467 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1468 if(src->lh == NULL) {
1469 *status = U_MEMORY_ALLOCATION_ERROR;
1470 return NULL;
1471 }
b75a7d8f 1472 }
46f4442e
A
1473 /* do the reset thing */
1474 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1475 /* test for NULL */
1476 if (sourceToken == NULL) {
1477 *status = U_MEMORY_ALLOCATION_ERROR;
1478 return NULL;
1479 }
729e4ab9 1480 sourceToken->rulesToParseHdl = &(src->source);
46f4442e
A
1481 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1482 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1483
1484 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1485 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1486
1487 // keep the flags around so that we know about before
1488 sourceToken->flags = src->parsedToken.flags;
1489
1490 if(src->parsedToken.prefixOffset != 0) {
1491 // this is a syntax error
1492 *status = U_INVALID_FORMAT_ERROR;
1493 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
729e4ab9 1494 DBG_FORMAT_ERROR
46f4442e
A
1495 uprv_free(sourceToken);
1496 return 0;
b75a7d8f 1497 } else {
46f4442e 1498 sourceToken->prefix = 0;
b75a7d8f 1499 }
b75a7d8f 1500
46f4442e
A
1501 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1502 sourceToken->strength = UCOL_TOK_RESET;
1503 sourceToken->next = NULL;
1504 sourceToken->previous = NULL;
1505 sourceToken->noOfCEs = 0;
1506 sourceToken->noOfExpCEs = 0;
1507 sourceToken->listHeader = &src->lh[src->resultLen];
1508
1509 src->lh[src->resultLen].first = NULL;
1510 src->lh[src->resultLen].last = NULL;
1511 src->lh[src->resultLen].first = NULL;
1512 src->lh[src->resultLen].last = NULL;
b75a7d8f 1513
46f4442e 1514 src->lh[src->resultLen].reset = sourceToken;
b75a7d8f 1515
46f4442e
A
1516 /*
1517 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1518 First convert all expansions into normal form. Examples:
1519 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1520 d * ... into &x * c/y * d * ...
1521 Note: reset values can never have expansions, although they can cause the
1522 very next item to have one. They may be contractions, if they are found
1523 earlier in the list.
1524 */
1525 *expandNext = 0;
1526 if(expand != NULL) {
1527 /* check to see if there is an expansion */
1528 if(src->parsedToken.charsLen > 1) {
1529 uint32_t resetCharsOffset;
1530 resetCharsOffset = (uint32_t)(expand - src->source);
1531 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1532 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1533 }
1534 }
1535
1536 src->resultLen++;
1537
1538 uhash_put(src->tailored, sourceToken, sourceToken, status);
1539
1540 return sourceToken;
b75a7d8f
A
1541}
1542
1543static
1544inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
46f4442e
A
1545 if(U_FAILURE(*status)) {
1546 return NULL;
1547 }
1548 /* this is a virgin before - we need to fish the anchor from the UCA */
1549 collIterate s;
1550 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1551 uint32_t CE, SecondCE;
4388f060 1552 // uint32_t invPos;
46f4442e 1553 if(sourceToken != NULL) {
729e4ab9 1554 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
46f4442e 1555 } else {
729e4ab9
A
1556 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
1557 }
1558 if(U_FAILURE(*status)) {
1559 return NULL;
46f4442e
A
1560 }
1561
1562 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1563 baseContCE = ucol_getNextCE(src->UCA, &s, status);
1564 if(baseContCE == UCOL_NO_MORE_CES) {
1565 baseContCE = 0;
1566 }
1567
1568
1569 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1570 uint32_t ch = 0;
1571 uint32_t expandNext = 0;
1572 UColToken key;
1573
1574 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
729e4ab9 1575 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
46f4442e
A
1576 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1577 ch = uprv_uca_getCodePointFromRaw(raw-1);
1578 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
729e4ab9
A
1579 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
1580 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
46f4442e
A
1581
1582 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1583 *src->extraCurrent++ = 0xFFFE;
1584 *src->extraCurrent++ = (UChar)ch;
1585 src->parsedToken.charsLen++;
1586
1587 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
729e4ab9 1588 key.rulesToParseHdl = &(src->source);
46f4442e
A
1589
1590 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1591 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1592
1593 if(sourceToken == NULL) {
1594 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1595 if(isContinuation(SecondCE)) {
1596 src->lh[src->resultLen].baseContCE = SecondCE;
1597 } else {
1598 src->lh[src->resultLen].baseContCE = 0;
1599 }
1600 src->lh[src->resultLen].nextCE = 0;
1601 src->lh[src->resultLen].nextContCE = 0;
1602 src->lh[src->resultLen].previousCE = 0;
1603 src->lh[src->resultLen].previousContCE = 0;
1604
1605 src->lh[src->resultLen].indirect = FALSE;
1606
1607 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1608 }
1609
1610 } else {
4388f060 1611 /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
46f4442e
A
1612
1613 // we got the previous CE. Now we need to see if the difference between
1614 // the two CEs is really of the requested strength.
1615 // if it's a bigger difference (we asked for secondary and got primary), we
1616 // need to modify the CE.
1617 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1618 // adjust the strength
1619 // now we are in the situation where our baseCE should actually be modified in
1620 // order to get the CE in the right position.
374ca955 1621 if(strength == UCOL_SECONDARY) {
46f4442e 1622 CE = baseCE - 0x0200;
374ca955 1623 } else { // strength == UCOL_TERTIARY
46f4442e
A
1624 CE = baseCE - 0x02;
1625 }
1626 if(baseContCE) {
1627 if(strength == UCOL_SECONDARY) {
1628 SecondCE = baseContCE - 0x0200;
1629 } else { // strength == UCOL_TERTIARY
1630 SecondCE = baseContCE - 0x02;
1631 }
374ca955 1632 }
46f4442e 1633 }
374ca955
A
1634
1635#if 0
46f4442e
A
1636 // the code below relies on getting a code point from the inverse table, in order to be
1637 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1638 // 1. There are many code points that have the same CE
1639 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1640 // Also, in case when there is no equivalent strength before an element, we have to actually
1641 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1642 // before a is a primary difference.
374ca955 1643
46f4442e 1644 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
374ca955
A
1645
1646
46f4442e 1647 ch = CETable[3*invPos+2];
374ca955 1648
46f4442e
A
1649 if((ch & UCOL_INV_SIZEMASK) != 0) {
1650 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1651 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1652 ch = conts[offset];
1653 }
374ca955 1654
46f4442e
A
1655 *src->extraCurrent++ = (UChar)ch;
1656 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1657 src->parsedToken.charsLen = 1;
374ca955 1658
46f4442e
A
1659 // We got an UCA before. However, this might have been tailored.
1660 // example:
1661 // &\u30ca = \u306a
1662 // &[before 3]\u306a<<<\u306a|\u309d
374ca955
A
1663
1664
46f4442e
A
1665 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1666 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
729e4ab9 1667 key.rulesToParseHdl = &(src->source);
374ca955 1668
46f4442e
A
1669 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1670 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
374ca955
A
1671#endif
1672
46f4442e
A
1673 // here is how it should be. The situation such as &[before 1]a < x, should be
1674 // resolved exactly as if we wrote &a > x.
1675 // therefore, I don't really care if the UCA value before a has been changed.
1676 // However, I do care if the strength between my element and the previous element
1677 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1678 // have to construct the base CE.
374ca955
A
1679
1680
1681
46f4442e
A
1682 // if we found a tailored thing, we have to use the UCA value and construct
1683 // a new reset token with constructed name
1684 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
374ca955
A
1685 // character to which we want to anchor is already tailored.
1686 // We need to construct a new token which will be the anchor
1687 // point
1688 //*(src->extraCurrent-1) = 0xFFFE;
1689 //*src->extraCurrent++ = (UChar)ch;
1690 // grab before
1691 src->parsedToken.charsOffset -= 10;
1692 src->parsedToken.charsLen += 10;
1693 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1694 if(isContinuation(SecondCE)) {
46f4442e 1695 src->lh[src->resultLen].baseContCE = SecondCE;
374ca955 1696 } else {
46f4442e 1697 src->lh[src->resultLen].baseContCE = 0;
374ca955
A
1698 }
1699 src->lh[src->resultLen].nextCE = 0;
1700 src->lh[src->resultLen].nextContCE = 0;
1701 src->lh[src->resultLen].previousCE = 0;
1702 src->lh[src->resultLen].previousContCE = 0;
b75a7d8f 1703
374ca955 1704 src->lh[src->resultLen].indirect = FALSE;
b75a7d8f 1705
374ca955 1706 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
46f4442e
A
1707 //}
1708 }
b75a7d8f 1709
46f4442e 1710 return sourceToken;
b75a7d8f
A
1711
1712}
1713
/**
 * Turns the rule string held by the parser into per-reset token lists.
 *
 * Repeatedly calls ucol_tok_parseNextToken() while src->current < src->end
 * (or src->isStarred is set) and, for every token it yields:
 *  - relation tokens (strength != UCOL_TOK_RESET) are looked up in the
 *    src->tailored hash, created if absent or unlinked from their current
 *    list if present, and then linked into the list of the previous token;
 *  - reset tokens either reuse an already tailored token (possibly a shorter
 *    prefix of the reset string, in which case the remainder is remembered in
 *    expandNext), resolve a [before n] request, or open a new list header in
 *    src->lh via ucol_tok_initAReset().
 *
 * @param src        parser state; its lists, hash and resultLen are updated.
 * @param parseError passed to the tokenizer and to syntaxError().
 * @param status     in/out error code. Set here to U_INVALID_FORMAT_ERROR when
 *                   the rules do not start with a reset or when a relation that
 *                   follows a [before n] reset has a different strength, and to
 *                   U_MEMORY_ALLOCATION_ERROR when a token cannot be allocated.
 * @return src->resultLen (after dropping a trailing empty list), or 0 as soon
 *         as *status is a failure.
 */
1714uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
46f4442e
A
1715 UColToken *lastToken = NULL;
1716 const UChar *parseEnd = NULL;
1717 uint32_t expandNext = 0;
1718 UBool variableTop = FALSE;
1719 UBool top = FALSE;
1720 uint16_t specs = 0;
1721 UColTokListHeader *ListList = NULL;
b75a7d8f 1722
46f4442e 1723 src->parsedToken.strength = UCOL_TOK_UNSET;
b75a7d8f 1724
// NOTE(review): ListList caches src->lh once for the whole function. If any
// callee used below can reallocate src->lh (src->listCapacity suggests the
// array may grow -- confirm), this alias would go stale.
46f4442e 1725 ListList = src->lh;
b75a7d8f 1726
46f4442e
A
1727 if(U_FAILURE(*status)) {
1728 return 0;
1729 }
729e4ab9
A
1730#ifdef DEBUG_FOR_CODE_POINTS
1731 char filename[35];
1732 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
1733 dfcp_fp = fopen(filename, "a");
1734 fprintf(stdout, "Output is in the file %s.\n", filename);
1735#endif
1736
1737#ifdef DEBUG_FOR_COLL_RULES
1738 std::string s3;
1739 UnicodeString(src->source).toUTF8String(s3);
1740 std::cout << "src->source = " << s3 << std::endl;
1741#endif
b75a7d8f 1742
// Main loop: one iteration per token produced by ucol_tok_parseNextToken().
729e4ab9 1743 while(src->current < src->end || src->isStarred) {
46f4442e 1744 src->parsedToken.prefixOffset = 0;
374ca955 1745
46f4442e
A
// The second argument is TRUE only while no token has been accepted yet.
1746 parseEnd = ucol_tok_parseNextToken(src,
1747 (UBool)(lastToken == NULL),
1748 parseError,
1749 status);
b75a7d8f 1750
46f4442e 1751 specs = src->parsedToken.flags;
b75a7d8f
A
1752
1753
46f4442e
A
1754 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1755 top = ((specs & UCOL_TOK_TOP) != 0);
b75a7d8f 1756
46f4442e
A
1757 if(U_SUCCESS(*status) && parseEnd != NULL) {
1758 UColToken *sourceToken = NULL;
1759 //uint32_t key = 0;
1760 uint32_t lastStrength = UCOL_TOK_UNSET;
374ca955 1761
46f4442e
A
1762 if(lastToken != NULL ) {
1763 lastStrength = lastToken->strength;
1764 }
b75a7d8f 1765
729e4ab9
A
1766#ifdef DEBUG_FOR_CODE_POINTS
1767 UChar32 cp;
1768 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
1769 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
1770#endif
46f4442e
A
1771 //key = newCharsLen << 24 | charsOffset;
// Stack-allocated lookup key: the length goes into the top 8 bits and the
// offset into src->source into the low 24 bits, which is the layout that
// uhash_hashTokens() unpacks.
1772 UColToken key;
1773 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
729e4ab9 1774 key.rulesToParseHdl = &(src->source);
b75a7d8f 1775
46f4442e
A
1776 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
1777 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
b75a7d8f 1778
46f4442e
A
1779 if(src->parsedToken.strength != UCOL_TOK_RESET) {
1780 if(lastToken == NULL) { /* this means that rules haven't started properly */
1781 *status = U_INVALID_FORMAT_ERROR;
1782 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
729e4ab9 1783 DBG_FORMAT_ERROR
46f4442e
A
1784 return 0;
1785 }
1786 /* 6 Otherwise (when relation != reset) */
1787 if(sourceToken == NULL) {
1788 /* If sourceToken is null, create new one, */
1789 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1790 /* test for NULL */
1791 if (sourceToken == NULL) {
1792 *status = U_MEMORY_ALLOCATION_ERROR;
1793 return 0;
1794 }
729e4ab9 1795 sourceToken->rulesToParseHdl = &(src->source);
46f4442e
A
1796 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1797
1798 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1799
1800 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1801 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1802
1803 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1804 sourceToken->next = NULL;
1805 sourceToken->previous = NULL;
1806 sourceToken->noOfCEs = 0;
1807 sourceToken->noOfExpCEs = 0;
1808 // keep the flags around so that we know about before
1809 sourceToken->flags = src->parsedToken.flags;
// The token is stored as both key and value of the hash entry.
1810 uhash_put(src->tailored, sourceToken, sourceToken, status);
1811 if(U_FAILURE(*status)) {
1812 return 0;
1813 }
1814 } else {
1815 /* we could have fished out a reset here */
1816 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1817 /* otherwise remove sourceToken from where it was. */
// Unlink from the doubly linked list. A weaker successor takes over the
// removed token's strength, and the list header's first/last pointers are
// patched when the token sat at either end.
1818 if(sourceToken->next != NULL) {
1819 if(sourceToken->next->strength > sourceToken->strength) {
1820 sourceToken->next->strength = sourceToken->strength;
1821 }
1822 sourceToken->next->previous = sourceToken->previous;
1823 } else {
1824 sourceToken->listHeader->last = sourceToken->previous;
1825 }
1826
1827 if(sourceToken->previous != NULL) {
1828 sourceToken->previous->next = sourceToken->next;
1829 } else {
1830 sourceToken->listHeader->first = sourceToken->next;
1831 }
1832 sourceToken->next = NULL;
1833 sourceToken->previous = NULL;
1834 }
1835 }
b75a7d8f 1836
46f4442e
A
// From here on the token belongs to the same list as the previous token.
1837 sourceToken->strength = src->parsedToken.strength;
1838 sourceToken->listHeader = lastToken->listHeader;
1839
1840 /*
1841 1. Find the strongest strength in each list, and set strongestP and strongestN
1842 accordingly in the headers.
1843 */
1844 if(lastStrength == UCOL_TOK_RESET
1845 || sourceToken->listHeader->first == 0) {
1846 /* If LAST is a reset
1847 insert sourceToken in the list. */
1848 if(sourceToken->listHeader->first == 0) {
1849 sourceToken->listHeader->first = sourceToken;
1850 sourceToken->listHeader->last = sourceToken;
1851 } else { /* we need to find a place for us */
1852 /* and we'll get in front of the same strength */
1853 if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1854 sourceToken->next = sourceToken->listHeader->first;
1855 sourceToken->next->previous = sourceToken;
1856 sourceToken->listHeader->first = sourceToken;
1857 sourceToken->previous = NULL;
1858 } else {
1859 lastToken = sourceToken->listHeader->first;
1860 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1861 lastToken = lastToken->next;
1862 }
1863 if(lastToken->next != NULL) {
1864 lastToken->next->previous = sourceToken;
1865 } else {
1866 sourceToken->listHeader->last = sourceToken;
1867 }
1868 sourceToken->previous = lastToken;
1869 sourceToken->next = lastToken->next;
1870 lastToken->next = sourceToken;
1871 }
1872 }
1873 } else {
1874 /* Otherwise (when LAST is not a reset)
1875 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1876 otherwise insert before.
1877 when inserting after or before, search to the next position with the same
1878 strength in that direction. (This is called postpone insertion). */
1879 if(sourceToken != lastToken) {
1880 if(lastToken->polarity == sourceToken->polarity) {
1881 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1882 lastToken = lastToken->next;
1883 }
1884 sourceToken->previous = lastToken;
1885 if(lastToken->next != NULL) {
1886 lastToken->next->previous = sourceToken;
1887 } else {
1888 sourceToken->listHeader->last = sourceToken;
1889 }
1890
1891 sourceToken->next = lastToken->next;
1892 lastToken->next = sourceToken;
1893 } else {
1894 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1895 lastToken = lastToken->previous;
1896 }
1897 sourceToken->next = lastToken;
1898 if(lastToken->previous != NULL) {
1899 lastToken->previous->next = sourceToken;
1900 } else {
1901 sourceToken->listHeader->first = sourceToken;
1902 }
1903 sourceToken->previous = lastToken->previous;
1904 lastToken->previous = sourceToken;
1905 }
1906 } else { /* repeated one thing twice in rules, stay with the stronger strength */
1907 if(lastStrength < sourceToken->strength) {
1908 sourceToken->strength = lastStrength;
1909 }
1910 }
1911 }
b75a7d8f 1912
46f4442e
A
1913 /* if the token was a variable top, we're gonna put it in */
// Only the first variable-top token is recorded; later ones leave src->varTop alone.
1914 if(variableTop == TRUE && src->varTop == NULL) {
1915 variableTop = FALSE;
1916 src->varTop = sourceToken;
1917 }
1918
1919 // Treat the expansions.
1920 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1921 // (&abc * d * e <=> &ab * d / c * e / c)
1922 // if both of them are in effect for a token, they are combined.
1923
1924 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1925
1926 if(expandNext != 0) {
1927 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1928 expandNext = 0;
1929 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1930 sourceToken->expansion = expandNext;
1931 } else { /* there is both explicit and implicit expansion. We need to make a combination */
// The implicit part followed by the explicit part is appended at
// src->extraCurrent, and the token's expansion is re-pointed at that copy.
// NOTE(review): no check against src->extraEnd is visible before these
// copies -- confirm the extra space is guaranteed to be large enough.
1932 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1933 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1934 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1935 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1936 }
1937 }
1938
1939 // This is just for debugging purposes
1940 if(sourceToken->expansion != 0) {
1941 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1942 } else {
1943 sourceToken->debugExpansion = 0;
1944 }
1945 // if the previous token was a reset before, the strength of this
1946 // token must match the strength of before. Otherwise we have an
1947 // undefined situation.
1948 // In other words, we currently have a cludge which we use to
1949 // represent &a >> x. This is written as &[before 2]a << x.
1950 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1951 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1952 if(beforeStrength != sourceToken->strength) {
1953 *status = U_INVALID_FORMAT_ERROR;
1954 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
729e4ab9 1955 DBG_FORMAT_ERROR
46f4442e
A
1956 return 0;
1957 }
1958 }
b75a7d8f 1959 } else {
46f4442e
A
1960 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1961 /* if the previous token was also a reset, */
1962 /*this means that we have two consecutive resets */
1963 /* and we want to remove the previous one if empty*/
1964 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1965 src->resultLen--;
1966 }
1967 }
b75a7d8f 1968
46f4442e
A
1969 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
// Retry the lookup with successively shorter prefixes of the reset string.
// On a hit, the unmatched tail is packed (length << 24 | offset) into
// expandNext for the tokens that follow.
1970 uint32_t searchCharsLen = src->parsedToken.charsLen;
1971 while(searchCharsLen > 1 && sourceToken == NULL) {
1972 searchCharsLen--;
1973 //key = searchCharsLen << 24 | charsOffset;
1974 UColToken key;
1975 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
729e4ab9 1976 key.rulesToParseHdl = &(src->source);
46f4442e
A
1977 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1978 }
1979 if(sourceToken != NULL) {
1980 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1981 }
1982 }
b75a7d8f 1983
46f4442e
A
1984 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1985 if(top == FALSE) { /* there is no indirection */
1986 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1987 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1988 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1989 while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1990 sourceToken = sourceToken->previous;
1991 }
1992 /* here, either we hit the strength or NULL */
1993 if(sourceToken->strength == strength) {
1994 if(sourceToken->previous != NULL) {
1995 sourceToken = sourceToken->previous;
1996 } else { /* start of list */
1997 sourceToken = sourceToken->listHeader->reset;
1998 }
1999 } else { /* we hit NULL */
2000 /* we should be doing the else part */
2001 sourceToken = sourceToken->listHeader->reset;
2002 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2003 }
2004 } else {
2005 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2006 }
2007 } else { /* this is both before and indirection */
2008 top = FALSE;
2009 ListList[src->resultLen].previousCE = 0;
2010 ListList[src->resultLen].previousContCE = 0;
2011 ListList[src->resultLen].indirect = TRUE;
2012 /* we need to do slightly more work. we need to get the baseCE using the */
2013 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
2014 /* in ucol_bld */
2015 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
2016 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2017 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
2018 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2019
2020 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
729e4ab9
A
// If the lead byte of the boundary CE lies in the implicit-primary range, the
// previous primary is derived from the raw implicit value minus one;
// otherwise ucol_inv_getPrevCE() supplies the previous CE.
2021 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
2022 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
2023 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
46f4442e
A
2024 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
2025 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
729e4ab9
A
2026 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
2027 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
46f4442e
A
2028 } else {
2029 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
2030 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
2031 }
2032
2033 ListList[src->resultLen].baseCE = CE;
2034 ListList[src->resultLen].baseContCE = SecondCE;
2035 ListList[src->resultLen].nextCE = 0;
2036 ListList[src->resultLen].nextContCE = 0;
2037
2038 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2039 }
2040 }
b75a7d8f 2041
b75a7d8f 2042
46f4442e
A
2043 /* 5 If the relation is a reset:
2044 If sourceToken is null
2045 Create new list, create new sourceToken, make the baseCE from source, put
2046 the sourceToken in ListHeader of the new list */
2047 if(sourceToken == NULL) {
2048 /*
2049 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
2050 First convert all expansions into normal form. Examples:
2051 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
2052 d * ... into &x * c/y * d * ...
2053 Note: reset values can never have expansions, although they can cause the
2054 very next item to have one. They may be contractions, if they are found
2055 earlier in the list.
2056 */
2057 if(top == FALSE) {
2058 collIterate s;
2059 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2060
729e4ab9 2061 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
46f4442e
A
2062
// The first CE becomes the list's base CE. The iterator position after it is
// handed to ucol_tok_initAReset() as "expand". The second CE is stored only
// if it is a continuation.
2063 CE = ucol_getNextCE(src->UCA, &s, status);
729e4ab9 2064 const UChar *expand = s.pos;
46f4442e
A
2065 SecondCE = ucol_getNextCE(src->UCA, &s, status);
2066
2067 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
2068 if(isContinuation(SecondCE)) {
2069 ListList[src->resultLen].baseContCE = SecondCE;
2070 } else {
2071 ListList[src->resultLen].baseContCE = 0;
2072 }
2073 ListList[src->resultLen].nextCE = 0;
2074 ListList[src->resultLen].nextContCE = 0;
2075 ListList[src->resultLen].previousCE = 0;
2076 ListList[src->resultLen].previousContCE = 0;
2077 ListList[src->resultLen].indirect = FALSE;
2078 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
2079 } else { /* top == TRUE */
2080 /* just use the supplied values */
2081 top = FALSE;
2082 ListList[src->resultLen].previousCE = 0;
2083 ListList[src->resultLen].previousContCE = 0;
2084 ListList[src->resultLen].indirect = TRUE;
2085 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2086 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
2087 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
2088 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
2089
2090 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
b75a7d8f 2091
46f4442e
A
2092 }
2093 } else { /* reset to something already in rules */
2094 top = FALSE;
374ca955 2095 }
b75a7d8f 2096 }
46f4442e
A
2097 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
2098 lastToken = sourceToken;
2099 } else {
// The tokenizer produced nothing usable: stop on failure, otherwise let the
// loop condition decide whether to go on.
2100 if(U_FAILURE(*status)) {
2101 return 0;
374ca955 2102 }
b75a7d8f 2103 }
46f4442e 2104 }
729e4ab9
A
2105#ifdef DEBUG_FOR_CODE_POINTS
2106 fclose(dfcp_fp);
2107#endif
2108
b75a7d8f 2109
46f4442e
A
// A trailing list that never received a token is not counted in the result.
2110 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
2111 src->resultLen--;
2112 }
2113 return src->resultLen;
2114}
b75a7d8f 2115
729e4ab9
A
2116const UChar* ucol_tok_getRulesFromBundle(
2117 void* /*context*/,
2118 const char* locale,
2119 const char* type,
2120 int32_t* pLength,
2121 UErrorCode* status)
2122{
2123 const UChar* rules = NULL;
2124 UResourceBundle* bundle;
2125 UResourceBundle* collations;
2126 UResourceBundle* collation;
2127
2128 *pLength = 0;
2129
2130 bundle = ures_open(U_ICUDATA_COLL, locale, status);
2131 if(U_SUCCESS(*status)){
2132 collations = ures_getByKey(bundle, "collations", NULL, status);
2133 if(U_SUCCESS(*status)){
2134 collation = ures_getByKey(collations, type, NULL, status);
2135 if(U_SUCCESS(*status)){
2136 rules = ures_getStringByKey(collation, "Sequence", pLength, status);
2137 if(U_FAILURE(*status)){
2138 *pLength = 0;
2139 rules = NULL;
2140 }
2141 ures_close(collation);
2142 }
2143 ures_close(collations);
2144 }
2145 }
2146
2147 ures_close(bundle);
2148
2149 return rules;
2150}
2151
2152void ucol_tok_initTokenList(
2153 UColTokenParser *src,
2154 const UChar *rules,
2155 uint32_t rulesLength,
2156 const UCollator *UCA,
2157 GetCollationRulesFunction importFunc,
2158 void* context,
2159 UErrorCode *status) {
46f4442e 2160 U_NAMESPACE_USE
b75a7d8f 2161
46f4442e
A
2162 uint32_t nSize = 0;
2163 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
729e4ab9
A
2164
2165 bool needToDeallocRules = false;
2166
46f4442e
A
2167 if(U_FAILURE(*status)) {
2168 return;
2169 }
b75a7d8f 2170
46f4442e
A
2171 // set everything to zero, so that we can clean up gracefully
2172 uprv_memset(src, 0, sizeof(UColTokenParser));
2173
2174 // first we need to find options that don't like to be normalized,
2175 // like copy and remove...
2176 //const UChar *openBrace = rules;
2177 int32_t optionNumber = -1;
729e4ab9 2178 const UChar *setStart = NULL;
46f4442e
A
2179 uint32_t i = 0;
2180 while(i < rulesLength) {
729e4ab9
A
2181 if(rules[i] == 0x005B) { // '[': start of an option
2182 /* Gets the following:
2183 optionNumber: The index of the option.
2184 setStart: The pointer at which the option arguments start.
2185 */
46f4442e 2186 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
729e4ab9 2187
46f4442e 2188 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
729e4ab9 2189 // [optimize]
46f4442e
A
2190 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2191 if(U_SUCCESS(*status)) {
2192 if(src->copySet == NULL) {
2193 src->copySet = newSet;
2194 } else {
2195 uset_addAll(src->copySet, newSet);
2196 uset_close(newSet);
2197 }
2198 } else {
2199 return;
2200 }
2201 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
2202 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2203 if(U_SUCCESS(*status)) {
2204 if(src->removeSet == NULL) {
2205 src->removeSet = newSet;
2206 } else {
2207 uset_addAll(src->removeSet, newSet);
2208 uset_close(newSet);
2209 }
2210 } else {
2211 return;
2212 }
729e4ab9
A
2213 } else if(optionNumber == OPTION_IMPORT){
2214 // [import <collation-name>]
2215
2216 // Find the address of the closing ].
2217 UChar* import_end = u_strchr(setStart, 0x005D);
2218 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
2219 // Ignore trailing whitespace.
4388f060 2220 while(PatternProps::isWhiteSpace(*(import_end-1))) {
729e4ab9
A
2221 --import_end;
2222 }
2223
2224 int32_t optionLength = (int32_t)(import_end - setStart);
2225 char option[50];
2226 if(optionLength >= (int32_t)sizeof(option)) {
2227 *status = U_ILLEGAL_ARGUMENT_ERROR;
2228 return;
2229 }
2230 u_UCharsToChars(setStart, option, optionLength);
2231 option[optionLength] = 0;
2232
2233 *status = U_ZERO_ERROR;
2234 char locale[50];
2235 int32_t templ;
2236 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
2237 if(U_FAILURE(*status)) {
2238 *status = U_ILLEGAL_ARGUMENT_ERROR;
2239 return;
2240 }
2241
2242 char type[50];
2243 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
2244 U_FAILURE(*status)
2245 ) {
2246 *status = U_ZERO_ERROR;
2247 uprv_strcpy(type, "standard");
2248 }
2249
2250 // TODO: Use public functions when available, see ticket #8134.
2251 char *keywords = (char *)locale_getKeywordsStart(locale);
2252 if(keywords != NULL) {
2253 *keywords = 0;
2254 }
2255
2256 int32_t importRulesLength = 0;
2257 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
2258
2259#ifdef DEBUG_FOR_COLL_RULES
2260 std::string s;
2261 UnicodeString(importRules).toUTF8String(s);
2262 std::cout << "Import rules = " << s << std::endl;
2263#endif
2264
2265 // Add the length of the imported rules to length of the original rules,
2266 // and subtract the length of the import option.
2267 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
2268
2269 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
2270
2271#ifdef DEBUG_FOR_COLL_RULES
2272 std::string s1;
2273 UnicodeString(rules).toUTF8String(s1);
2274 std::cout << "Original rules = " << s1 << std::endl;
2275#endif
2276
2277
2278 // Copy the section of the original rules leading up to the import
2279 uprv_memcpy(newRules, rules, i*sizeof(UChar));
2280 // Copy the imported rules
2281 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
2282 // Copy the rest of the original rules (minus the import option itself)
2283 uprv_memcpy(newRules+i+importRulesLength,
2284 rules+optionEndOffset,
2285 (rulesLength-optionEndOffset)*sizeof(UChar));
2286
2287#ifdef DEBUG_FOR_COLL_RULES
2288 std::string s2;
2289 UnicodeString(newRules).toUTF8String(s2);
2290 std::cout << "Resulting rules = " << s2 << std::endl;
2291#endif
2292
2293 if(needToDeallocRules){
2294 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2295 uprv_free((void*)rules);
2296 }
2297 needToDeallocRules = true;
2298 rules = newRules;
2299 rulesLength = newRulesLength;
2300
2301 estimatedSize += importRulesLength*2;
2302
2303 // First character of the new rules needs to be processed
2304 i--;
46f4442e 2305 }
b75a7d8f 2306 }
46f4442e
A
2307 //openBrace++;
2308 i++;
b75a7d8f 2309 }
b75a7d8f 2310
46f4442e
A
2311 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
2312 /* test for NULL */
2313 if (src->source == NULL) {
2314 *status = U_MEMORY_ALLOCATION_ERROR;
2315 return;
2316 }
2317 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
2318 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
2319 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
2320 *status = U_ZERO_ERROR;
2321 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
2322 /* test for NULL */
2323 if (src->source == NULL) {
2324 *status = U_MEMORY_ALLOCATION_ERROR;
2325 return;
b75a7d8f 2326 }
46f4442e
A
2327 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
2328 }
729e4ab9
A
2329 if(needToDeallocRules){
2330 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2331 uprv_free((void*)rules);
2332 }
2333
2334
46f4442e
A
2335 src->current = src->source;
2336 src->end = src->source+nSize;
2337 src->sourceCurrent = src->source;
2338 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
2339 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2340 src->varTop = NULL;
2341 src->UCA = UCA;
2342 src->invUCA = ucol_initInverseUCA(status);
2343 src->parsedToken.charsLen = 0;
2344 src->parsedToken.charsOffset = 0;
2345 src->parsedToken.extensionLen = 0;
2346 src->parsedToken.extensionOffset = 0;
2347 src->parsedToken.prefixLen = 0;
2348 src->parsedToken.prefixOffset = 0;
2349 src->parsedToken.flags = 0;
2350 src->parsedToken.strength = UCOL_TOK_UNSET;
2351 src->buildCCTabFlag = FALSE;
729e4ab9
A
2352 src->isStarred = FALSE;
2353 src->inRange = FALSE;
2354 src->lastRangeCp = 0;
2355 src->previousCp = 0;
46f4442e
A
2356
2357 if(U_FAILURE(*status)) {
2358 return;
b75a7d8f 2359 }
46f4442e
A
2360 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
2361 if(U_FAILURE(*status)) {
2362 return;
2363 }
4388f060 2364 uhash_setValueDeleter(src->tailored, uprv_free);
46f4442e
A
2365
2366 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
b75a7d8f 2367 /* test for NULL */
46f4442e 2368 if (src->opts == NULL) {
b75a7d8f
A
2369 *status = U_MEMORY_ALLOCATION_ERROR;
2370 return;
2371 }
46f4442e
A
2372
2373 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
2374
46f4442e
A
2375 src->lh = 0;
2376 src->listCapacity = 1024;
2377 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
2378 //Test for NULL
2379 if (src->lh == NULL) {
2380 *status = U_MEMORY_ALLOCATION_ERROR;
2381 return;
2382 }
2383 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
2384 src->resultLen = 0;
2385
2386 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2387
2388 // UCOL_RESET_TOP_VALUE
2389 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2390 // UCOL_FIRST_PRIMARY_IGNORABLE
2391 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
2392 // UCOL_LAST_PRIMARY_IGNORABLE
2393 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
2394 // UCOL_FIRST_SECONDARY_IGNORABLE
2395 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
2396 // UCOL_LAST_SECONDARY_IGNORABLE
2397 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
2398 // UCOL_FIRST_TERTIARY_IGNORABLE
2399 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
2400 // UCOL_LAST_TERTIARY_IGNORABLE
2401 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
2402 // UCOL_FIRST_VARIABLE
2403 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
2404 // UCOL_LAST_VARIABLE
2405 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
2406 // UCOL_FIRST_NON_VARIABLE
2407 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
2408 // UCOL_LAST_NON_VARIABLE
2409 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2410 // UCOL_FIRST_IMPLICIT
2411 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
2412 // UCOL_LAST_IMPLICIT
2413 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
2414 // UCOL_FIRST_TRAILING
2415 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
2416 // UCOL_LAST_TRAILING
2417 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
2418 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
b75a7d8f
A
2419}
2420
2421
2422void ucol_tok_closeTokenList(UColTokenParser *src) {
46f4442e
A
2423 if(src->copySet != NULL) {
2424 uset_close(src->copySet);
2425 }
2426 if(src->removeSet != NULL) {
2427 uset_close(src->removeSet);
2428 }
2429 if(src->tailored != NULL) {
2430 uhash_close(src->tailored);
2431 }
2432 if(src->lh != NULL) {
2433 uprv_free(src->lh);
2434 }
2435 if(src->source != NULL) {
2436 uprv_free(src->source);
2437 }
2438 if(src->opts != NULL) {
2439 uprv_free(src->opts);
2440 }
729e4ab9
A
2441 if (src->reorderCodes != NULL) {
2442 uprv_free(src->reorderCodes);
2443 }
b75a7d8f
A
2444}
2445
2446#endif /* #if !UCONFIG_NO_COLLATION */