]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_tok.cpp
ICU-400.39.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_tok.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
46f4442e 4* Copyright (C) 2001-2008, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_tok.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created 02/22/2001
14* created by: Vladimir Weinstein
15*
374ca955 16* This module reads a tailoring rule string and produces a list of
b75a7d8f 17* tokens that will be turned into collation elements
374ca955 18*
b75a7d8f
A
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "unicode/ustring.h"
26#include "unicode/uchar.h"
27#include "unicode/uniset.h"
374ca955 28
b75a7d8f 29#include "ucol_tok.h"
46f4442e 30#include "ucol_bld.h"
b75a7d8f 31#include "cmemory.h"
374ca955 32#include "util.h"
b75a7d8f
A
33
34U_CDECL_BEGIN
73c04bcf 35static int32_t U_CALLCONV
b75a7d8f
A
36uhash_hashTokens(const UHashTok k)
37{
38 int32_t hash = 0;
39 //uint32_t key = (uint32_t)k.integer;
40 UColToken *key = (UColToken *)k.pointer;
41 if (key != 0) {
42 //int32_t len = (key & 0xFF000000)>>24;
43 int32_t len = (key->source & 0xFF000000)>>24;
44 int32_t inc = ((len - 32) / 32) + 1;
374ca955 45
b75a7d8f
A
46 //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
47 const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
374ca955 48 const UChar *limit = p + len;
b75a7d8f
A
49
50 while (p<limit) {
51 hash = (hash * 37) + *p;
52 p += inc;
53 }
54 }
55 return hash;
56}
57
73c04bcf 58static UBool U_CALLCONV
b75a7d8f
A
59uhash_compareTokens(const UHashTok key1, const UHashTok key2)
60{
61 //uint32_t p1 = (uint32_t) key1.integer;
62 //uint32_t p2 = (uint32_t) key2.integer;
63 UColToken *p1 = (UColToken *)key1.pointer;
64 UColToken *p2 = (UColToken *)key2.pointer;
65 const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
66 const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
67 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
68 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
69 const UChar *end = s1+s1L-1;
70
71 if (p1 == p2) {
72 return TRUE;
73 }
74 if (p1->source == 0 || p2->source == 0) {
75 return FALSE;
76 }
77 if(s1L != s2L) {
46f4442e 78 return FALSE;
b75a7d8f
A
79 }
80 if(p1->source == p2->source) {
46f4442e 81 return TRUE;
b75a7d8f
A
82 }
83 while((s1 < end) && *s1 == *s2) {
46f4442e
A
84 ++s1;
85 ++s2;
b75a7d8f
A
86 }
87 if(*s1 == *s2) {
46f4442e 88 return TRUE;
b75a7d8f 89 } else {
46f4442e 90 return FALSE;
b75a7d8f
A
91 }
92}
93U_CDECL_END
94
46f4442e 95/*static inline void U_CALLCONV
b75a7d8f 96uhash_freeBlockWrapper(void *obj) {
46f4442e
A
97 uhash_freeBlock(obj);
98}*/
b75a7d8f
A
99
100
101typedef struct {
46f4442e
A
102 uint32_t startCE;
103 uint32_t startContCE;
104 uint32_t limitCE;
105 uint32_t limitContCE;
b75a7d8f
A
106} indirectBoundaries;
107
108/* these values are used for finding CE values for indirect positioning. */
109/* Indirect positioning is a mechanism for allowing resets on symbolic */
110/* values. It only works for resets and you cannot tailor indirect names */
111/* An indirect name can define either an anchor point or a range. An */
112/* anchor point behaves in exactly the same way as a code point in reset */
113/* would, except that it cannot be tailored. A range (we currently only */
114/* know for the [top] range will explicitly set the upper bound for */
115/* generated CEs, thus allowing for better control over how many CEs can */
116/* be squeezed between in the range without performance penalty. */
117/* In that respect, we use [top] for tailoring of locales that use CJK */
118/* characters. Other indirect values are currently a pure convenience, */
119/* they can be used to assure that the CEs will be always positioned in */
120/* the same place relative to a point with known properties (e.g. first */
121/* primary ignorable). */
122static indirectBoundaries ucolIndirectBoundaries[15];
123/*
124static indirectBoundaries ucolIndirectBoundaries[11] = {
46f4442e
A
125{ UCOL_RESET_TOP_VALUE, 0,
126UCOL_NEXT_TOP_VALUE, 0 },
127{ UCOL_FIRST_PRIMARY_IGNORABLE, 0,
1280, 0 },
129{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
1300, 0 },
131{ UCOL_FIRST_SECONDARY_IGNORABLE, 0,
1320, 0 },
133{ UCOL_LAST_SECONDARY_IGNORABLE, 0,
1340, 0 },
135{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,
1360, 0 },
137{ UCOL_LAST_TERTIARY_IGNORABLE, 0,
1380, 0 },
139{ UCOL_FIRST_VARIABLE, 0,
1400, 0 },
141{ UCOL_LAST_VARIABLE, 0,
1420, 0 },
143{ UCOL_FIRST_NON_VARIABLE, 0,
1440, 0 },
145{ UCOL_LAST_NON_VARIABLE, 0,
1460, 0 },
b75a7d8f
A
147};
148*/
149
374ca955
A
150static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
151
46f4442e
A
152 // Set values for the top - TODO: once we have values for all the indirects, we are going
153 // to initalize here.
154 ucolIndirectBoundaries[indexR].startCE = start[0];
155 ucolIndirectBoundaries[indexR].startContCE = start[1];
156 if(end) {
157 ucolIndirectBoundaries[indexR].limitCE = end[0];
158 ucolIndirectBoundaries[indexR].limitContCE = end[1];
159 } else {
160 ucolIndirectBoundaries[indexR].limitCE = 0;
161 ucolIndirectBoundaries[indexR].limitContCE = 0;
162 }
b75a7d8f
A
163}
164
165
374ca955
A
166static inline
167void syntaxError(const UChar* rules,
b75a7d8f
A
168 int32_t pos,
169 int32_t rulesLen,
46f4442e
A
170 UParseError* parseError)
171{
b75a7d8f
A
172 parseError->offset = pos;
173 parseError->line = 0 ; /* we are not using line numbers */
374ca955 174
b75a7d8f 175 // for pre-context
46f4442e 176 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
b75a7d8f 177 int32_t stop = pos;
374ca955 178
b75a7d8f
A
179 u_memcpy(parseError->preContext,rules+start,stop-start);
180 //null terminate the buffer
181 parseError->preContext[stop-start] = 0;
374ca955 182
b75a7d8f
A
183 //for post-context
184 start = pos+1;
374ca955 185 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
46f4442e 186 rulesLen;
b75a7d8f 187
374ca955 188 if(start < stop) {
46f4442e
A
189 u_memcpy(parseError->postContext,rules+start,stop-start);
190 //null terminate the buffer
191 parseError->postContext[stop-start]= 0;
374ca955 192 } else {
46f4442e 193 parseError->postContext[0] = 0;
374ca955 194 }
b75a7d8f
A
195}
196
197static
198void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
46f4442e
A
199 switch(attrib) {
200 case UCOL_HIRAGANA_QUATERNARY_MODE:
201 opts->hiraganaQ = value;
202 break;
203 case UCOL_FRENCH_COLLATION:
204 opts->frenchCollation = value;
205 break;
206 case UCOL_ALTERNATE_HANDLING:
207 opts->alternateHandling = value;
208 break;
209 case UCOL_CASE_FIRST:
210 opts->caseFirst = value;
211 break;
212 case UCOL_CASE_LEVEL:
213 opts->caseLevel = value;
214 break;
215 case UCOL_NORMALIZATION_MODE:
216 opts->normalizationMode = value;
217 break;
218 case UCOL_STRENGTH:
219 opts->strength = value;
220 break;
221 case UCOL_NUMERIC_COLLATION:
222 opts->numericCollation = value;
223 break;
224 case UCOL_ATTRIBUTE_COUNT:
225 default:
226 break;
227 }
b75a7d8f
A
228}
229
230#define UTOK_OPTION_COUNT 20
231
232static UBool didInit = FALSE;
233/* we can be strict, or we can be lenient */
234/* I'd surely be lenient with the option arguments */
235/* maybe even with options */
236U_STRING_DECL(suboption_00, "non-ignorable", 13);
237U_STRING_DECL(suboption_01, "shifted", 7);
238
239U_STRING_DECL(suboption_02, "lower", 5);
240U_STRING_DECL(suboption_03, "upper", 5);
241U_STRING_DECL(suboption_04, "off", 3);
242U_STRING_DECL(suboption_05, "on", 2);
243U_STRING_DECL(suboption_06, "1", 1);
244U_STRING_DECL(suboption_07, "2", 1);
245U_STRING_DECL(suboption_08, "3", 1);
246U_STRING_DECL(suboption_09, "4", 1);
247U_STRING_DECL(suboption_10, "I", 1);
248
249U_STRING_DECL(suboption_11, "primary", 7);
250U_STRING_DECL(suboption_12, "secondary", 9);
251U_STRING_DECL(suboption_13, "tertiary", 8);
252U_STRING_DECL(suboption_14, "variable", 8);
253U_STRING_DECL(suboption_15, "regular", 7);
254U_STRING_DECL(suboption_16, "implicit", 8);
255U_STRING_DECL(suboption_17, "trailing", 8);
256
257
258U_STRING_DECL(option_00, "undefined", 9);
374ca955 259U_STRING_DECL(option_01, "rearrange", 9);
b75a7d8f 260U_STRING_DECL(option_02, "alternate", 9);
374ca955
A
261U_STRING_DECL(option_03, "backwards", 9);
262U_STRING_DECL(option_04, "variable top", 12);
263U_STRING_DECL(option_05, "top", 3);
264U_STRING_DECL(option_06, "normalization", 13);
265U_STRING_DECL(option_07, "caseLevel", 9);
266U_STRING_DECL(option_08, "caseFirst", 9);
267U_STRING_DECL(option_09, "scriptOrder", 11);
268U_STRING_DECL(option_10, "charsetname", 11);
269U_STRING_DECL(option_11, "charset", 7);
270U_STRING_DECL(option_12, "before", 6);
b75a7d8f
A
271U_STRING_DECL(option_13, "hiraganaQ", 9);
272U_STRING_DECL(option_14, "strength", 8);
273U_STRING_DECL(option_15, "first", 5);
274U_STRING_DECL(option_16, "last", 4);
275U_STRING_DECL(option_17, "optimize", 8);
276U_STRING_DECL(option_18, "suppressContractions", 20);
374ca955 277U_STRING_DECL(option_19, "numericOrdering", 15);
b75a7d8f
A
278
279
280/*
374ca955
A
281[last variable] last variable value
282[last primary ignorable] largest CE for primary ignorable
283[last secondary ignorable] largest CE for secondary ignorable
284[last tertiary ignorable] largest CE for tertiary ignorable
285[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
b75a7d8f
A
286*/
287
288
289static const ucolTokSuboption alternateSub[2] = {
46f4442e
A
290 {suboption_00, 13, UCOL_NON_IGNORABLE},
291 {suboption_01, 7, UCOL_SHIFTED}
b75a7d8f
A
292};
293
294static const ucolTokSuboption caseFirstSub[3] = {
46f4442e
A
295 {suboption_02, 5, UCOL_LOWER_FIRST},
296 {suboption_03, 5, UCOL_UPPER_FIRST},
297 {suboption_04, 3, UCOL_OFF},
b75a7d8f
A
298};
299
300static const ucolTokSuboption onOffSub[2] = {
46f4442e
A
301 {suboption_04, 3, UCOL_OFF},
302 {suboption_05, 2, UCOL_ON}
b75a7d8f
A
303};
304
305static const ucolTokSuboption frenchSub[1] = {
46f4442e 306 {suboption_07, 1, UCOL_ON}
b75a7d8f
A
307};
308
309static const ucolTokSuboption beforeSub[3] = {
46f4442e
A
310 {suboption_06, 1, UCOL_PRIMARY},
311 {suboption_07, 1, UCOL_SECONDARY},
312 {suboption_08, 1, UCOL_TERTIARY}
b75a7d8f
A
313};
314
315static const ucolTokSuboption strengthSub[5] = {
46f4442e
A
316 {suboption_06, 1, UCOL_PRIMARY},
317 {suboption_07, 1, UCOL_SECONDARY},
318 {suboption_08, 1, UCOL_TERTIARY},
319 {suboption_09, 1, UCOL_QUATERNARY},
320 {suboption_10, 1, UCOL_IDENTICAL},
b75a7d8f
A
321};
322
323static const ucolTokSuboption firstLastSub[7] = {
46f4442e
A
324 {suboption_11, 7, UCOL_PRIMARY},
325 {suboption_12, 9, UCOL_PRIMARY},
326 {suboption_13, 8, UCOL_PRIMARY},
327 {suboption_14, 8, UCOL_PRIMARY},
328 {suboption_15, 7, UCOL_PRIMARY},
329 {suboption_16, 8, UCOL_PRIMARY},
330 {suboption_17, 8, UCOL_PRIMARY},
b75a7d8f
A
331};
332
333enum OptionNumber {
46f4442e 334 OPTION_ALTERNATE_HANDLING = 0,
b75a7d8f
A
335 OPTION_FRENCH_COLLATION,
336 OPTION_CASE_LEVEL,
337 OPTION_CASE_FIRST,
338 OPTION_NORMALIZATION_MODE,
339 OPTION_HIRAGANA_QUATERNARY,
340 OPTION_STRENGTH,
341 OPTION_NUMERIC_COLLATION,
342 OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
343 OPTION_VARIABLE_TOP,
344 OPTION_REARRANGE,
345 OPTION_BEFORE,
346 OPTION_TOP,
347 OPTION_FIRST,
348 OPTION_LAST,
349 OPTION_OPTIMIZE,
350 OPTION_SUPPRESS_CONTRACTIONS,
351 OPTION_UNDEFINED,
352 OPTION_SCRIPT_ORDER,
353 OPTION_CHARSET_NAME,
354 OPTION_CHARSET
355} ;
356
357static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
46f4442e
A
358 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
359 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
360 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
361 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
362 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
363 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
364 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
365 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
366 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
367 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
368 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
369 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
370 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
371 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
372 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
373 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
374 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
375 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
376 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
377 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */
b75a7d8f
A
378};
379
380static
374ca955 381int32_t u_strncmpNoCase(const UChar *s1,
46f4442e
A
382 const UChar *s2,
383 int32_t n)
b75a7d8f
A
384{
385 if(n > 0) {
386 int32_t rc;
387 for(;;) {
388 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
389 if(rc != 0 || *s1 == 0 || --n == 0) {
390 return rc;
391 }
392 ++s1;
393 ++s2;
394 }
395 }
396 return 0;
397}
398
399static
400void ucol_uprv_tok_initData() {
46f4442e
A
401 if(!didInit) {
402 U_STRING_INIT(suboption_00, "non-ignorable", 13);
403 U_STRING_INIT(suboption_01, "shifted", 7);
404
405 U_STRING_INIT(suboption_02, "lower", 5);
406 U_STRING_INIT(suboption_03, "upper", 5);
407 U_STRING_INIT(suboption_04, "off", 3);
408 U_STRING_INIT(suboption_05, "on", 2);
409
410 U_STRING_INIT(suboption_06, "1", 1);
411 U_STRING_INIT(suboption_07, "2", 1);
412 U_STRING_INIT(suboption_08, "3", 1);
413 U_STRING_INIT(suboption_09, "4", 1);
414 U_STRING_INIT(suboption_10, "I", 1);
415
416 U_STRING_INIT(suboption_11, "primary", 7);
417 U_STRING_INIT(suboption_12, "secondary", 9);
418 U_STRING_INIT(suboption_13, "tertiary", 8);
419 U_STRING_INIT(suboption_14, "variable", 8);
420 U_STRING_INIT(suboption_15, "regular", 7);
421 U_STRING_INIT(suboption_16, "implicit", 8);
422 U_STRING_INIT(suboption_17, "trailing", 8);
423
424
425 U_STRING_INIT(option_00, "undefined", 9);
426 U_STRING_INIT(option_01, "rearrange", 9);
427 U_STRING_INIT(option_02, "alternate", 9);
428 U_STRING_INIT(option_03, "backwards", 9);
429 U_STRING_INIT(option_04, "variable top", 12);
430 U_STRING_INIT(option_05, "top", 3);
431 U_STRING_INIT(option_06, "normalization", 13);
432 U_STRING_INIT(option_07, "caseLevel", 9);
433 U_STRING_INIT(option_08, "caseFirst", 9);
434 U_STRING_INIT(option_09, "scriptOrder", 11);
435 U_STRING_INIT(option_10, "charsetname", 11);
436 U_STRING_INIT(option_11, "charset", 7);
437 U_STRING_INIT(option_12, "before", 6);
438 U_STRING_INIT(option_13, "hiraganaQ", 9);
439 U_STRING_INIT(option_14, "strength", 8);
440 U_STRING_INIT(option_15, "first", 5);
441 U_STRING_INIT(option_16, "last", 4);
442 U_STRING_INIT(option_17, "optimize", 8);
443 U_STRING_INIT(option_18, "suppressContractions", 20);
444 U_STRING_INIT(option_19, "numericOrdering", 15);
445 didInit = TRUE;
446 }
b75a7d8f
A
447}
448
449
450// This function reads basic options to set in the runtime collator
451// used by data driven tests. Should not support build time options
452U_CAPI const UChar * U_EXPORT2
374ca955 453ucol_tok_getNextArgument(const UChar *start, const UChar *end,
46f4442e
A
454 UColAttribute *attrib, UColAttributeValue *value,
455 UErrorCode *status)
456{
457 uint32_t i = 0;
458 int32_t j=0;
459 UBool foundOption = FALSE;
460 const UChar *optionArg = NULL;
461
462 ucol_uprv_tok_initData();
463
464 while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
465 start++;
466 }
467 if(start >= end) {
468 return NULL;
469 }
470 /* skip opening '[' */
471 if(*start == 0x005b) {
472 start++;
473 } else {
474 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
475 return NULL;
b75a7d8f 476 }
b75a7d8f 477
46f4442e
A
478 while(i < UTOK_OPTION_COUNT) {
479 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
480 foundOption = TRUE;
481 if(end - start > rulesOptions[i].optionLen) {
482 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
483 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
484 optionArg++;
485 }
486 }
487 break;
b75a7d8f 488 }
46f4442e
A
489 i++;
490 }
491
492 if(!foundOption) {
493 *status = U_ILLEGAL_ARGUMENT_ERROR;
494 return NULL;
495 }
496
497 if(optionArg) {
498 for(j = 0; j<rulesOptions[i].subSize; j++) {
499 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
500 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
501 *attrib = rulesOptions[i].attr;
502 *value = rulesOptions[i].subopts[j].attrVal;
503 optionArg += rulesOptions[i].subopts[j].subLen;
504 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
505 optionArg++;
506 }
507 if(*optionArg == 0x005d) {
508 optionArg++;
509 return optionArg;
510 } else {
511 *status = U_ILLEGAL_ARGUMENT_ERROR;
512 return NULL;
513 }
514 }
b75a7d8f 515 }
b75a7d8f 516 }
46f4442e
A
517 *status = U_ILLEGAL_ARGUMENT_ERROR;
518 return NULL;
b75a7d8f
A
519}
520
374ca955 521static
b75a7d8f 522USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
46f4442e
A
523 while(*start != 0x005b) { /* advance while we find the first '[' */
524 start++;
525 }
526 // now we need to get a balanced set of '[]'. The problem is that a set can have
527 // many, and *end point to the first closing '['
528 int32_t noOpenBraces = 1;
529 int32_t current = 1; // skip the opening brace
530 while(start+current < end && noOpenBraces != 0) {
531 if(start[current] == 0x005b) {
532 noOpenBraces++;
533 } else if(start[current] == 0x005D) { // closing brace
534 noOpenBraces--;
535 }
536 current++;
b75a7d8f 537 }
b75a7d8f 538
46f4442e
A
539 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
540 *status = U_ILLEGAL_ARGUMENT_ERROR;
541 return NULL;
542 }
543 return uset_openPattern(start, current, status);
b75a7d8f
A
544}
545
546static
547int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
46f4442e
A
548 int32_t i = 0;
549 ucol_uprv_tok_initData();
550
551 while(u_isWhitespace(*start)) { /* eat whitespace */
552 start++;
553 }
554 while(i < UTOK_OPTION_COUNT) {
555 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
556 if(end - start > rulesOptions[i].optionLen) {
557 *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
558 while(u_isWhitespace(**optionArg)) { /* eat whitespace */
559 (*optionArg)++;
560 }
561 }
562 break;
b75a7d8f 563 }
46f4442e
A
564 i++;
565 }
566 if(i == UTOK_OPTION_COUNT) {
567 i = -1; // didn't find an option
b75a7d8f 568 }
46f4442e 569 return i;
b75a7d8f
A
570}
571
572
573// reads and conforms to various options in rules
574// end is the position of the first closing ']'
575// However, some of the options take an UnicodeSet definition
576// which needs to duplicate the closing ']'
577// for example: '[copy [\uAC00-\uD7FF]]'
374ca955 578// These options will move end to the second ']' and the
b75a7d8f
A
579// caller will set the current to it.
580static
581uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
46f4442e
A
582 const UChar* start = src->current;
583 int32_t i = 0;
584 int32_t j=0;
585 const UChar *optionArg = NULL;
b75a7d8f 586
46f4442e 587 uint8_t result = 0;
b75a7d8f 588
46f4442e
A
589 start++; /*skip opening '['*/
590 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
591 if(optionArg) {
592 src->current = optionArg;
593 }
b75a7d8f 594
46f4442e
A
595 if(i < 0) {
596 *status = U_ILLEGAL_ARGUMENT_ERROR;
597 } else {
598 int32_t noOpenBraces = 1;
599 switch(i) {
b75a7d8f
A
600 case OPTION_ALTERNATE_HANDLING:
601 case OPTION_FRENCH_COLLATION:
602 case OPTION_CASE_LEVEL:
603 case OPTION_CASE_FIRST:
604 case OPTION_NORMALIZATION_MODE:
605 case OPTION_HIRAGANA_QUATERNARY:
606 case OPTION_STRENGTH:
607 case OPTION_NUMERIC_COLLATION:
46f4442e
A
608 if(optionArg) {
609 for(j = 0; j<rulesOptions[i].subSize; j++) {
610 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
611 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
612 result = UCOL_TOK_SUCCESS;
613 }
614 }
b75a7d8f 615 }
46f4442e
A
616 if(result == 0) {
617 *status = U_ILLEGAL_ARGUMENT_ERROR;
618 }
619 break;
b75a7d8f 620 case OPTION_VARIABLE_TOP:
46f4442e
A
621 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
622 break;
b75a7d8f 623 case OPTION_REARRANGE:
46f4442e
A
624 result = UCOL_TOK_SUCCESS;
625 break;
b75a7d8f 626 case OPTION_BEFORE:
46f4442e
A
627 if(optionArg) {
628 for(j = 0; j<rulesOptions[i].subSize; j++) {
629 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
630 result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
631 }
632 }
b75a7d8f 633 }
46f4442e
A
634 if(result == 0) {
635 *status = U_ILLEGAL_ARGUMENT_ERROR;
636 }
637 break;
b75a7d8f 638 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
46f4442e
A
639 /* index to this array will be src->parsedToken.indirectIndex*/
640 src->parsedToken.indirectIndex = 0;
641 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
642 break;
b75a7d8f
A
643 case OPTION_FIRST:
644 case OPTION_LAST: /* first, last */
46f4442e
A
645 for(j = 0; j<rulesOptions[i].subSize; j++) {
646 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
647 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
648 // element of indirect boundaries is reserved for top.
649 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
650 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
651 }
b75a7d8f 652 }
46f4442e
A
653 if(result == 0) {
654 *status = U_ILLEGAL_ARGUMENT_ERROR;
655 }
656 break;
b75a7d8f
A
657 case OPTION_OPTIMIZE:
658 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
46f4442e
A
659 // we need to move end here
660 src->current++; // skip opening brace
661 while(src->current < src->end && noOpenBraces != 0) {
662 if(*src->current == 0x005b) {
663 noOpenBraces++;
664 } else if(*src->current == 0x005D) { // closing brace
665 noOpenBraces--;
666 }
667 src->current++;
b75a7d8f 668 }
46f4442e
A
669 result = UCOL_TOK_SUCCESS;
670 break;
b75a7d8f 671 default:
46f4442e
A
672 *status = U_UNSUPPORTED_ERROR;
673 break;
674 }
b75a7d8f 675 }
46f4442e
A
676 src->current = u_memchr(src->current, 0x005d, src->end-src->current);
677 return result;
b75a7d8f
A
678}
679
374ca955
A
680
681inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
46f4442e 682 if(src->extraCurrent+len >= src->extraEnd) {
374ca955
A
683 /* reallocate */
684 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
685 if(newSrc != NULL) {
46f4442e
A
686 src->current = newSrc + (src->current - src->source);
687 src->extraCurrent = newSrc + (src->extraCurrent - src->source);
688 src->end = newSrc + (src->end - src->source);
689 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
690 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
691 src->source = newSrc;
374ca955 692 } else {
46f4442e 693 *status = U_MEMORY_ALLOCATION_ERROR;
374ca955 694 }
46f4442e
A
695 }
696 if(len == 1) {
697 *src->extraCurrent++ = *stuff;
698 } else {
374ca955
A
699 uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
700 src->extraCurrent += len;
46f4442e 701 }
374ca955
A
702
703
704}
705
706inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
46f4442e
A
707 /*
708 top = TRUE;
709 */
710 UChar buff[5];
711 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
712 buff[0] = 0xFFFE;
713 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
714 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
715 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
716 src->parsedToken.charsLen = 3;
717 ucol_tok_addToExtraCurrent(src, buff, 3, status);
718 } else {
719 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
720 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
721 src->parsedToken.charsLen = 5;
722 ucol_tok_addToExtraCurrent(src, buff, 5, status);
723 }
724 return TRUE;
b75a7d8f
A
725}
726
374ca955
A
727static UBool isCharNewLine(UChar c){
728 switch(c){
729 case 0x000A: /* LF */
730 case 0x000D: /* CR */
731 case 0x000C: /* FF */
732 case 0x0085: /* NEL */
733 case 0x2028: /* LS */
734 case 0x2029: /* PS */
735 return TRUE;
736 default:
737 return FALSE;
738 }
739}
740
b75a7d8f 741U_CAPI const UChar* U_EXPORT2
374ca955 742ucol_tok_parseNextToken(UColTokenParser *src,
b75a7d8f
A
743 UBool startOfRules,
744 UParseError *parseError,
46f4442e
A
745 UErrorCode *status)
746{
747 /* parsing part */
748 UBool variableTop = FALSE;
749 UBool top = FALSE;
750 UBool inChars = TRUE;
751 UBool inQuote = FALSE;
752 UBool wasInQuote = FALSE;
753 uint8_t before = 0;
754 UBool isEscaped = FALSE;
755 // TODO: replace these variables with src->parsedToken counterparts
756 // no need to use them anymore since we have src->parsedToken.
757 // Ideally, token parser would be a nice class... Once, when I have
758 // more time (around 2020 probably).
759 uint32_t newExtensionLen = 0;
760 uint32_t extensionOffset = 0;
761 uint32_t newStrength = UCOL_TOK_UNSET;
762 UChar buff[10];
763
764 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
765 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
766 src->parsedToken.indirectIndex = 0;
767
768 while (src->current < src->end) {
769 UChar ch = *(src->current);
770
771 if (inQuote) {
772 if (ch == 0x0027/*'\''*/) {
773 inQuote = FALSE;
774 } else {
775 if ((src->parsedToken.charsLen == 0) || inChars) {
776 if(src->parsedToken.charsLen == 0) {
777 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
778 }
779 src->parsedToken.charsLen++;
780 } else {
781 if(newExtensionLen == 0) {
782 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
783 }
784 newExtensionLen++;
785 }
b75a7d8f 786 }
46f4442e
A
787 }else if(isEscaped){
788 isEscaped =FALSE;
789 if (newStrength == UCOL_TOK_UNSET) {
790 *status = U_INVALID_FORMAT_ERROR;
791 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
792 return NULL;
793 // enabling rules to start with non-tokens a < b
794 // newStrength = UCOL_TOK_RESET;
b75a7d8f 795 }
46f4442e
A
796 if(ch != 0x0000 && src->current != src->end) {
797 if (inChars) {
798 if(src->parsedToken.charsLen == 0) {
799 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
800 }
801 src->parsedToken.charsLen++;
802 } else {
803 if(newExtensionLen == 0) {
804 extensionOffset = (uint32_t)(src->current - src->source);
805 }
806 newExtensionLen++;
807 }
b75a7d8f 808 }
46f4442e
A
809 }else {
810 if(!uprv_isRuleWhiteSpace(ch)) {
811 /* Sets the strength for this entry */
812 switch (ch) {
813 case 0x003D/*'='*/ :
814 if (newStrength != UCOL_TOK_UNSET) {
815 goto EndOfLoop;
816 }
b75a7d8f 817
46f4442e
A
818 /* if we start with strength, we'll reset to top */
819 if(startOfRules == TRUE) {
820 src->parsedToken.indirectIndex = 5;
821 top = ucol_tok_doSetTop(src, status);
822 newStrength = UCOL_TOK_RESET;
823 goto EndOfLoop;
824 }
825 newStrength = UCOL_IDENTICAL;
826 break;
b75a7d8f 827
46f4442e
A
828 case 0x002C/*','*/:
829 if (newStrength != UCOL_TOK_UNSET) {
830 goto EndOfLoop;
831 }
b75a7d8f 832
46f4442e
A
833 /* if we start with strength, we'll reset to top */
834 if(startOfRules == TRUE) {
835 src->parsedToken.indirectIndex = 5;
836 top = ucol_tok_doSetTop(src, status);
837 newStrength = UCOL_TOK_RESET;
838 goto EndOfLoop;
839 }
840 newStrength = UCOL_TERTIARY;
841 break;
b75a7d8f 842
46f4442e
A
843 case 0x003B/*';'*/:
844 if (newStrength != UCOL_TOK_UNSET) {
845 goto EndOfLoop;
846 }
b75a7d8f 847
46f4442e
A
848 /* if we start with strength, we'll reset to top */
849 if(startOfRules == TRUE) {
850 src->parsedToken.indirectIndex = 5;
851 top = ucol_tok_doSetTop(src, status);
852 newStrength = UCOL_TOK_RESET;
853 goto EndOfLoop;
b75a7d8f 854 }
46f4442e
A
855 newStrength = UCOL_SECONDARY;
856 break;
b75a7d8f 857
46f4442e
A
858 case 0x003C/*'<'*/:
859 if (newStrength != UCOL_TOK_UNSET) {
860 goto EndOfLoop;
861 }
b75a7d8f 862
46f4442e
A
863 /* if we start with strength, we'll reset to top */
864 if(startOfRules == TRUE) {
865 src->parsedToken.indirectIndex = 5;
866 top = ucol_tok_doSetTop(src, status);
867 newStrength = UCOL_TOK_RESET;
868 goto EndOfLoop;
869 }
870 /* before this, do a scan to verify whether this is */
871 /* another strength */
872 if(*(src->current+1) == 0x003C) {
873 src->current++;
874 if(*(src->current+1) == 0x003C) {
875 src->current++; /* three in a row! */
876 newStrength = UCOL_TERTIARY;
877 } else { /* two in a row */
878 newStrength = UCOL_SECONDARY;
879 }
880 } else { /* just one */
881 newStrength = UCOL_PRIMARY;
882 }
883 break;
b75a7d8f 884
46f4442e
A
885 case 0x0026/*'&'*/:
886 if (newStrength != UCOL_TOK_UNSET) {
887 /**/
888 goto EndOfLoop;
889 }
b75a7d8f 890
46f4442e
A
891 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
892 break;
893
894 case 0x005b/*'['*/:
895 /* options - read an option, analyze it */
896 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
897 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
898 if(U_SUCCESS(*status)) {
899 if(result & UCOL_TOK_TOP) {
900 if(newStrength == UCOL_TOK_RESET) {
901 top = ucol_tok_doSetTop(src, status);
902 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
903 src->parsedToken.charsLen+=2;
904 buff[0] = 0x002d;
905 buff[1] = before;
906 ucol_tok_addToExtraCurrent(src, buff, 2, status);
907 }
908
909 src->current++;
910 goto EndOfLoop;
911 } else {
912 *status = U_INVALID_FORMAT_ERROR;
913 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
914 }
915 } else if(result & UCOL_TOK_VARIABLE_TOP) {
916 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
917 variableTop = TRUE;
918 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
919 src->parsedToken.charsLen = 1;
920 buff[0] = 0xFFFF;
921 ucol_tok_addToExtraCurrent(src, buff, 1, status);
922 src->current++;
923 goto EndOfLoop;
924 } else {
925 *status = U_INVALID_FORMAT_ERROR;
926 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
927 }
928 } else if (result & UCOL_TOK_BEFORE){
929 if(newStrength == UCOL_TOK_RESET) {
930 before = result & UCOL_TOK_BEFORE;
931 } else {
932 *status = U_INVALID_FORMAT_ERROR;
933 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
934
935 }
936 }
937 } else {
938 *status = U_INVALID_FORMAT_ERROR;
939 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
940 return NULL;
941 }
942 }
943 break;
944 case 0x0021/*! skip java thai modifier reordering*/:
945 break;
946 case 0x002F/*'/'*/:
947 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
948 inChars = FALSE; /* we're now processing expansion */
949 break;
950 case 0x005C /* back slash for escaped chars */:
951 isEscaped = TRUE;
952 break;
953 /* found a quote, we're gonna start copying */
954 case 0x0027/*'\''*/:
955 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
956 *status = U_INVALID_FORMAT_ERROR;
957 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
958 return NULL;
959 // enabling rules to start with a non-token character a < b
960 // newStrength = UCOL_TOK_RESET;
961 }
b75a7d8f 962
46f4442e
A
963 inQuote = TRUE;
964
965 if(inChars) { /* we're doing characters */
966 if(wasInQuote == FALSE) {
967 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
968 }
969 if (src->parsedToken.charsLen != 0) {
970 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
971 }
972 src->parsedToken.charsLen++;
973 } else { /* we're doing an expansion */
974 if(wasInQuote == FALSE) {
975 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
976 }
977 if (newExtensionLen != 0) {
978 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
979 }
980 newExtensionLen++;
981 }
b75a7d8f 982
46f4442e 983 wasInQuote = TRUE;
b75a7d8f 984
46f4442e
A
985 ch = *(++(src->current));
986 if(ch == 0x0027) { /* copy the double quote */
987 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
988 inQuote = FALSE;
989 }
990 break;
991
992 /* '@' is french only if the strength is not currently set */
993 /* if it is, it's just a regular character in collation rules */
994 case 0x0040/*'@'*/:
995 if (newStrength == UCOL_TOK_UNSET) {
996 src->opts->frenchCollation = UCOL_ON;
997 break;
998 }
b75a7d8f 999
46f4442e
A
1000 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1001 // we want to store read characters to the prefix part and continue reading
1002 // the characters (proper way would be to restart reading the chars, but in
1003 // that case we would have to complicate the token hasher, which I do not
1004 // intend to play with. Instead, we will do prefixes when prefixes are due
1005 // (before adding the elements).
1006 src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1007 src->parsedToken.prefixLen = src->parsedToken.charsLen;
1008
1009 if(inChars) { /* we're doing characters */
1010 if(wasInQuote == FALSE) {
1011 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1012 }
1013 if (src->parsedToken.charsLen != 0) {
1014 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1015 }
1016 src->parsedToken.charsLen++;
1017 }
b75a7d8f 1018
46f4442e
A
1019 wasInQuote = TRUE;
1020
1021 do {
1022 ch = *(++(src->current));
1023 // skip whitespace between '|' and the character
1024 } while (uprv_isRuleWhiteSpace(ch));
1025 break;
1026
1027 //charsOffset = 0;
1028 //newCharsLen = 0;
1029 //break; // We want to store the whole prefix/character sequence. If we break
1030 // the '|' is going to get lost.
1031 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1032 do {
1033 ch = *(++(src->current));
1034 } while (!isCharNewLine(ch));
1035
1036 break;
1037 default:
1038 if (newStrength == UCOL_TOK_UNSET) {
1039 *status = U_INVALID_FORMAT_ERROR;
1040 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1041 return NULL;
1042 }
374ca955 1043
46f4442e
A
1044 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1045 *status = U_INVALID_FORMAT_ERROR;
1046 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1047 return NULL;
1048 }
374ca955 1049
46f4442e
A
1050 if(ch == 0x0000 && src->current+1 == src->end) {
1051 break;
1052 }
b75a7d8f 1053
46f4442e
A
1054 if (inChars) {
1055 if(src->parsedToken.charsLen == 0) {
1056 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1057 }
1058 src->parsedToken.charsLen++;
1059 } else {
1060 if(newExtensionLen == 0) {
1061 extensionOffset = (uint32_t)(src->current - src->source);
1062 }
1063 newExtensionLen++;
1064 }
b75a7d8f 1065
46f4442e
A
1066 break;
1067 }
b75a7d8f 1068 }
46f4442e 1069 }
b75a7d8f 1070
46f4442e
A
1071 if(wasInQuote) {
1072 if(ch != 0x27) {
1073 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1074 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1075 }
b75a7d8f 1076 }
46f4442e 1077 }
b75a7d8f 1078
46f4442e 1079 src->current++;
b75a7d8f
A
1080 }
1081
46f4442e
A
1082EndOfLoop:
1083 wasInQuote = FALSE;
1084 if (newStrength == UCOL_TOK_UNSET) {
1085 return NULL;
b75a7d8f
A
1086 }
1087
46f4442e
A
1088 if (src->parsedToken.charsLen == 0 && top == FALSE) {
1089 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1090 *status = U_INVALID_FORMAT_ERROR;
1091 return NULL;
b75a7d8f
A
1092 }
1093
46f4442e
A
1094 src->parsedToken.strength = newStrength;
1095 src->parsedToken.extensionOffset = extensionOffset;
1096 src->parsedToken.extensionLen = newExtensionLen;
1097 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
b75a7d8f 1098
46f4442e 1099 return src->current;
b75a7d8f
A
1100}
1101
1102/*
1103Processing Description
46f4442e
A
11041 Build a ListList. Each list has a header, which contains two lists (positive
1105and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1106reset may be null.
11072 As you process, you keep a LAST pointer that points to the last token you
1108handled.
b75a7d8f
A
1109*/
1110
1111static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
46f4442e
A
1112 UParseError *parseError, UErrorCode *status)
1113{
1114 if(src->resultLen == src->listCapacity) {
1115 // Unfortunately, this won't work, as we store addresses of lhs in token
1116 src->listCapacity *= 2;
1117 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1118 if(src->lh == NULL) {
1119 *status = U_MEMORY_ALLOCATION_ERROR;
1120 return NULL;
1121 }
b75a7d8f 1122 }
46f4442e
A
1123 /* do the reset thing */
1124 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1125 /* test for NULL */
1126 if (sourceToken == NULL) {
1127 *status = U_MEMORY_ALLOCATION_ERROR;
1128 return NULL;
1129 }
1130 sourceToken->rulesToParse = src->source;
1131 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1132 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1133
1134 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1135 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1136
1137 // keep the flags around so that we know about before
1138 sourceToken->flags = src->parsedToken.flags;
1139
1140 if(src->parsedToken.prefixOffset != 0) {
1141 // this is a syntax error
1142 *status = U_INVALID_FORMAT_ERROR;
1143 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1144 uprv_free(sourceToken);
1145 return 0;
b75a7d8f 1146 } else {
46f4442e 1147 sourceToken->prefix = 0;
b75a7d8f 1148 }
b75a7d8f 1149
46f4442e
A
1150 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1151 sourceToken->strength = UCOL_TOK_RESET;
1152 sourceToken->next = NULL;
1153 sourceToken->previous = NULL;
1154 sourceToken->noOfCEs = 0;
1155 sourceToken->noOfExpCEs = 0;
1156 sourceToken->listHeader = &src->lh[src->resultLen];
1157
1158 src->lh[src->resultLen].first = NULL;
1159 src->lh[src->resultLen].last = NULL;
1160 src->lh[src->resultLen].first = NULL;
1161 src->lh[src->resultLen].last = NULL;
b75a7d8f 1162
46f4442e 1163 src->lh[src->resultLen].reset = sourceToken;
b75a7d8f 1164
46f4442e
A
1165 /*
1166 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1167 First convert all expansions into normal form. Examples:
1168 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1169 d * ... into &x * c/y * d * ...
1170 Note: reset values can never have expansions, although they can cause the
1171 very next item to have one. They may be contractions, if they are found
1172 earlier in the list.
1173 */
1174 *expandNext = 0;
1175 if(expand != NULL) {
1176 /* check to see if there is an expansion */
1177 if(src->parsedToken.charsLen > 1) {
1178 uint32_t resetCharsOffset;
1179 resetCharsOffset = (uint32_t)(expand - src->source);
1180 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1181 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1182 }
1183 }
1184
1185 src->resultLen++;
1186
1187 uhash_put(src->tailored, sourceToken, sourceToken, status);
1188
1189 return sourceToken;
b75a7d8f
A
1190}
1191
1192static
1193inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
46f4442e
A
1194 if(U_FAILURE(*status)) {
1195 return NULL;
1196 }
1197 /* this is a virgin before - we need to fish the anchor from the UCA */
1198 collIterate s;
1199 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1200 uint32_t CE, SecondCE;
1201 uint32_t invPos;
1202 if(sourceToken != NULL) {
1203 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
1204 } else {
1205 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
1206 }
1207
1208 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1209 baseContCE = ucol_getNextCE(src->UCA, &s, status);
1210 if(baseContCE == UCOL_NO_MORE_CES) {
1211 baseContCE = 0;
1212 }
1213
1214
1215 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1216 uint32_t ch = 0;
1217 uint32_t expandNext = 0;
1218 UColToken key;
1219
1220 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1221 uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1222 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1223 ch = uprv_uca_getCodePointFromRaw(raw-1);
1224 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1225 CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1226 SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1227
1228 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1229 *src->extraCurrent++ = 0xFFFE;
1230 *src->extraCurrent++ = (UChar)ch;
1231 src->parsedToken.charsLen++;
1232
1233 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1234 key.rulesToParse = src->source;
1235
1236 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1237 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1238
1239 if(sourceToken == NULL) {
1240 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1241 if(isContinuation(SecondCE)) {
1242 src->lh[src->resultLen].baseContCE = SecondCE;
1243 } else {
1244 src->lh[src->resultLen].baseContCE = 0;
1245 }
1246 src->lh[src->resultLen].nextCE = 0;
1247 src->lh[src->resultLen].nextContCE = 0;
1248 src->lh[src->resultLen].previousCE = 0;
1249 src->lh[src->resultLen].previousContCE = 0;
1250
1251 src->lh[src->resultLen].indirect = FALSE;
1252
1253 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1254 }
1255
1256 } else {
1257 invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1258
1259 // we got the previous CE. Now we need to see if the difference between
1260 // the two CEs is really of the requested strength.
1261 // if it's a bigger difference (we asked for secondary and got primary), we
1262 // need to modify the CE.
1263 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1264 // adjust the strength
1265 // now we are in the situation where our baseCE should actually be modified in
1266 // order to get the CE in the right position.
374ca955 1267 if(strength == UCOL_SECONDARY) {
46f4442e 1268 CE = baseCE - 0x0200;
374ca955 1269 } else { // strength == UCOL_TERTIARY
46f4442e
A
1270 CE = baseCE - 0x02;
1271 }
1272 if(baseContCE) {
1273 if(strength == UCOL_SECONDARY) {
1274 SecondCE = baseContCE - 0x0200;
1275 } else { // strength == UCOL_TERTIARY
1276 SecondCE = baseContCE - 0x02;
1277 }
374ca955 1278 }
46f4442e 1279 }
374ca955
A
1280
1281#if 0
46f4442e
A
1282 // the code below relies on getting a code point from the inverse table, in order to be
1283 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1284 // 1. There are many code points that have the same CE
1285 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1286 // Also, in case when there is no equivalent strength before an element, we have to actually
1287 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1288 // before a is a primary difference.
374ca955 1289
46f4442e 1290 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
374ca955
A
1291
1292
46f4442e 1293 ch = CETable[3*invPos+2];
374ca955 1294
46f4442e
A
1295 if((ch & UCOL_INV_SIZEMASK) != 0) {
1296 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1297 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1298 ch = conts[offset];
1299 }
374ca955 1300
46f4442e
A
1301 *src->extraCurrent++ = (UChar)ch;
1302 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1303 src->parsedToken.charsLen = 1;
374ca955 1304
46f4442e
A
1305 // We got an UCA before. However, this might have been tailored.
1306 // example:
1307 // &\u30ca = \u306a
1308 // &[before 3]\u306a<<<\u306a|\u309d
374ca955
A
1309
1310
46f4442e
A
1311 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1312 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1313 key.rulesToParse = src->source;
374ca955 1314
46f4442e
A
1315 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1316 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
374ca955
A
1317#endif
1318
46f4442e
A
1319 // here is how it should be. The situation such as &[before 1]a < x, should be
1320 // resolved exactly as if we wrote &a > x.
1321 // therefore, I don't really care if the UCA value before a has been changed.
1322 // However, I do care if the strength between my element and the previous element
1323 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1324 // have to construct the base CE.
374ca955
A
1325
1326
1327
46f4442e
A
1328 // if we found a tailored thing, we have to use the UCA value and construct
1329 // a new reset token with constructed name
1330 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
374ca955
A
1331 // character to which we want to anchor is already tailored.
1332 // We need to construct a new token which will be the anchor
1333 // point
1334 //*(src->extraCurrent-1) = 0xFFFE;
1335 //*src->extraCurrent++ = (UChar)ch;
1336 // grab before
1337 src->parsedToken.charsOffset -= 10;
1338 src->parsedToken.charsLen += 10;
1339 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1340 if(isContinuation(SecondCE)) {
46f4442e 1341 src->lh[src->resultLen].baseContCE = SecondCE;
374ca955 1342 } else {
46f4442e 1343 src->lh[src->resultLen].baseContCE = 0;
374ca955
A
1344 }
1345 src->lh[src->resultLen].nextCE = 0;
1346 src->lh[src->resultLen].nextContCE = 0;
1347 src->lh[src->resultLen].previousCE = 0;
1348 src->lh[src->resultLen].previousContCE = 0;
b75a7d8f 1349
374ca955 1350 src->lh[src->resultLen].indirect = FALSE;
b75a7d8f 1351
374ca955 1352 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
46f4442e
A
1353 //}
1354 }
b75a7d8f 1355
46f4442e 1356 return sourceToken;
b75a7d8f
A
1357
1358}
1359
1360uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
46f4442e
A
1361 UColToken *lastToken = NULL;
1362 const UChar *parseEnd = NULL;
1363 uint32_t expandNext = 0;
1364 UBool variableTop = FALSE;
1365 UBool top = FALSE;
1366 uint16_t specs = 0;
1367 UColTokListHeader *ListList = NULL;
b75a7d8f 1368
46f4442e 1369 src->parsedToken.strength = UCOL_TOK_UNSET;
b75a7d8f 1370
46f4442e 1371 ListList = src->lh;
b75a7d8f 1372
46f4442e
A
1373 if(U_FAILURE(*status)) {
1374 return 0;
1375 }
b75a7d8f 1376
46f4442e
A
1377 while(src->current < src->end) {
1378 src->parsedToken.prefixOffset = 0;
374ca955 1379
46f4442e
A
1380 parseEnd = ucol_tok_parseNextToken(src,
1381 (UBool)(lastToken == NULL),
1382 parseError,
1383 status);
b75a7d8f 1384
46f4442e 1385 specs = src->parsedToken.flags;
b75a7d8f
A
1386
1387
46f4442e
A
1388 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1389 top = ((specs & UCOL_TOK_TOP) != 0);
b75a7d8f 1390
46f4442e
A
1391 if(U_SUCCESS(*status) && parseEnd != NULL) {
1392 UColToken *sourceToken = NULL;
1393 //uint32_t key = 0;
1394 uint32_t lastStrength = UCOL_TOK_UNSET;
374ca955 1395
46f4442e
A
1396 if(lastToken != NULL ) {
1397 lastStrength = lastToken->strength;
1398 }
b75a7d8f 1399
46f4442e
A
1400 //key = newCharsLen << 24 | charsOffset;
1401 UColToken key;
1402 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1403 key.rulesToParse = src->source;
b75a7d8f 1404
46f4442e
A
1405 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
1406 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
b75a7d8f 1407
46f4442e
A
1408 if(src->parsedToken.strength != UCOL_TOK_RESET) {
1409 if(lastToken == NULL) { /* this means that rules haven't started properly */
1410 *status = U_INVALID_FORMAT_ERROR;
1411 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1412 return 0;
1413 }
1414 /* 6 Otherwise (when relation != reset) */
1415 if(sourceToken == NULL) {
1416 /* If sourceToken is null, create new one, */
1417 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1418 /* test for NULL */
1419 if (sourceToken == NULL) {
1420 *status = U_MEMORY_ALLOCATION_ERROR;
1421 return 0;
1422 }
1423 sourceToken->rulesToParse = src->source;
1424 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1425
1426 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1427
1428 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1429 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1430
1431 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1432 sourceToken->next = NULL;
1433 sourceToken->previous = NULL;
1434 sourceToken->noOfCEs = 0;
1435 sourceToken->noOfExpCEs = 0;
1436 // keep the flags around so that we know about before
1437 sourceToken->flags = src->parsedToken.flags;
1438 uhash_put(src->tailored, sourceToken, sourceToken, status);
1439 if(U_FAILURE(*status)) {
1440 return 0;
1441 }
1442 } else {
1443 /* we could have fished out a reset here */
1444 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1445 /* otherwise remove sourceToken from where it was. */
1446 if(sourceToken->next != NULL) {
1447 if(sourceToken->next->strength > sourceToken->strength) {
1448 sourceToken->next->strength = sourceToken->strength;
1449 }
1450 sourceToken->next->previous = sourceToken->previous;
1451 } else {
1452 sourceToken->listHeader->last = sourceToken->previous;
1453 }
1454
1455 if(sourceToken->previous != NULL) {
1456 sourceToken->previous->next = sourceToken->next;
1457 } else {
1458 sourceToken->listHeader->first = sourceToken->next;
1459 }
1460 sourceToken->next = NULL;
1461 sourceToken->previous = NULL;
1462 }
1463 }
b75a7d8f 1464
46f4442e
A
1465 sourceToken->strength = src->parsedToken.strength;
1466 sourceToken->listHeader = lastToken->listHeader;
1467
1468 /*
1469 1. Find the strongest strength in each list, and set strongestP and strongestN
1470 accordingly in the headers.
1471 */
1472 if(lastStrength == UCOL_TOK_RESET
1473 || sourceToken->listHeader->first == 0) {
1474 /* If LAST is a reset
1475 insert sourceToken in the list. */
1476 if(sourceToken->listHeader->first == 0) {
1477 sourceToken->listHeader->first = sourceToken;
1478 sourceToken->listHeader->last = sourceToken;
1479 } else { /* we need to find a place for us */
1480 /* and we'll get in front of the same strength */
1481 if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1482 sourceToken->next = sourceToken->listHeader->first;
1483 sourceToken->next->previous = sourceToken;
1484 sourceToken->listHeader->first = sourceToken;
1485 sourceToken->previous = NULL;
1486 } else {
1487 lastToken = sourceToken->listHeader->first;
1488 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1489 lastToken = lastToken->next;
1490 }
1491 if(lastToken->next != NULL) {
1492 lastToken->next->previous = sourceToken;
1493 } else {
1494 sourceToken->listHeader->last = sourceToken;
1495 }
1496 sourceToken->previous = lastToken;
1497 sourceToken->next = lastToken->next;
1498 lastToken->next = sourceToken;
1499 }
1500 }
1501 } else {
1502 /* Otherwise (when LAST is not a reset)
1503 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1504 otherwise insert before.
1505 when inserting after or before, search to the next position with the same
1506 strength in that direction. (This is called postpone insertion). */
1507 if(sourceToken != lastToken) {
1508 if(lastToken->polarity == sourceToken->polarity) {
1509 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1510 lastToken = lastToken->next;
1511 }
1512 sourceToken->previous = lastToken;
1513 if(lastToken->next != NULL) {
1514 lastToken->next->previous = sourceToken;
1515 } else {
1516 sourceToken->listHeader->last = sourceToken;
1517 }
1518
1519 sourceToken->next = lastToken->next;
1520 lastToken->next = sourceToken;
1521 } else {
1522 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1523 lastToken = lastToken->previous;
1524 }
1525 sourceToken->next = lastToken;
1526 if(lastToken->previous != NULL) {
1527 lastToken->previous->next = sourceToken;
1528 } else {
1529 sourceToken->listHeader->first = sourceToken;
1530 }
1531 sourceToken->previous = lastToken->previous;
1532 lastToken->previous = sourceToken;
1533 }
1534 } else { /* repeated one thing twice in rules, stay with the stronger strength */
1535 if(lastStrength < sourceToken->strength) {
1536 sourceToken->strength = lastStrength;
1537 }
1538 }
1539 }
b75a7d8f 1540
46f4442e
A
1541 /* if the token was a variable top, we're gonna put it in */
1542 if(variableTop == TRUE && src->varTop == NULL) {
1543 variableTop = FALSE;
1544 src->varTop = sourceToken;
1545 }
1546
1547 // Treat the expansions.
1548 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1549 // (&abc * d * e <=> &ab * d / c * e / c)
1550 // if both of them are in effect for a token, they are combined.
1551
1552 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1553
1554 if(expandNext != 0) {
1555 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1556 expandNext = 0;
1557 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1558 sourceToken->expansion = expandNext;
1559 } else { /* there is both explicit and implicit expansion. We need to make a combination */
1560 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1561 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1562 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1563 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1564 }
1565 }
1566
1567 // This is just for debugging purposes
1568 if(sourceToken->expansion != 0) {
1569 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1570 } else {
1571 sourceToken->debugExpansion = 0;
1572 }
1573 // if the previous token was a reset before, the strength of this
1574 // token must match the strength of before. Otherwise we have an
1575 // undefined situation.
1576 // In other words, we currently have a cludge which we use to
1577 // represent &a >> x. This is written as &[before 2]a << x.
1578 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1579 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1580 if(beforeStrength != sourceToken->strength) {
1581 *status = U_INVALID_FORMAT_ERROR;
1582 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1583 return 0;
1584 }
1585 }
b75a7d8f 1586 } else {
46f4442e
A
1587 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1588 /* if the previous token was also a reset, */
1589 /*this means that we have two consecutive resets */
1590 /* and we want to remove the previous one if empty*/
1591 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1592 src->resultLen--;
1593 }
1594 }
b75a7d8f 1595
46f4442e
A
1596 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1597 uint32_t searchCharsLen = src->parsedToken.charsLen;
1598 while(searchCharsLen > 1 && sourceToken == NULL) {
1599 searchCharsLen--;
1600 //key = searchCharsLen << 24 | charsOffset;
1601 UColToken key;
1602 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1603 key.rulesToParse = src->source;
1604 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1605 }
1606 if(sourceToken != NULL) {
1607 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1608 }
1609 }
b75a7d8f 1610
46f4442e
A
1611 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1612 if(top == FALSE) { /* there is no indirection */
1613 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1614 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1615 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1616 while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1617 sourceToken = sourceToken->previous;
1618 }
1619 /* here, either we hit the strength or NULL */
1620 if(sourceToken->strength == strength) {
1621 if(sourceToken->previous != NULL) {
1622 sourceToken = sourceToken->previous;
1623 } else { /* start of list */
1624 sourceToken = sourceToken->listHeader->reset;
1625 }
1626 } else { /* we hit NULL */
1627 /* we should be doing the else part */
1628 sourceToken = sourceToken->listHeader->reset;
1629 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1630 }
1631 } else {
1632 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1633 }
1634 } else { /* this is both before and indirection */
1635 top = FALSE;
1636 ListList[src->resultLen].previousCE = 0;
1637 ListList[src->resultLen].previousContCE = 0;
1638 ListList[src->resultLen].indirect = TRUE;
1639 /* we need to do slightly more work. we need to get the baseCE using the */
1640 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
1641 /* in ucol_bld */
1642 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1643 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1644 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
1645 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1646
1647 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1648 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1649 uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1650 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1651 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1652 CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1653 SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1654 } else {
1655 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
1656 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1657 }
1658
1659 ListList[src->resultLen].baseCE = CE;
1660 ListList[src->resultLen].baseContCE = SecondCE;
1661 ListList[src->resultLen].nextCE = 0;
1662 ListList[src->resultLen].nextContCE = 0;
1663
1664 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1665 }
1666 }
b75a7d8f 1667
b75a7d8f 1668
46f4442e
A
1669 /* 5 If the relation is a reset:
1670 If sourceToken is null
1671 Create new list, create new sourceToken, make the baseCE from source, put
1672 the sourceToken in ListHeader of the new list */
1673 if(sourceToken == NULL) {
1674 /*
1675 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1676 First convert all expansions into normal form. Examples:
1677 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1678 d * ... into &x * c/y * d * ...
1679 Note: reset values can never have expansions, although they can cause the
1680 very next item to have one. They may be contractions, if they are found
1681 earlier in the list.
1682 */
1683 if(top == FALSE) {
1684 collIterate s;
1685 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1686
1687 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
1688
1689 CE = ucol_getNextCE(src->UCA, &s, status);
1690 UChar *expand = s.pos;
1691 SecondCE = ucol_getNextCE(src->UCA, &s, status);
1692
1693 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1694 if(isContinuation(SecondCE)) {
1695 ListList[src->resultLen].baseContCE = SecondCE;
1696 } else {
1697 ListList[src->resultLen].baseContCE = 0;
1698 }
1699 ListList[src->resultLen].nextCE = 0;
1700 ListList[src->resultLen].nextContCE = 0;
1701 ListList[src->resultLen].previousCE = 0;
1702 ListList[src->resultLen].previousContCE = 0;
1703 ListList[src->resultLen].indirect = FALSE;
1704 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
1705 } else { /* top == TRUE */
1706 /* just use the supplied values */
1707 top = FALSE;
1708 ListList[src->resultLen].previousCE = 0;
1709 ListList[src->resultLen].previousContCE = 0;
1710 ListList[src->resultLen].indirect = TRUE;
1711 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1712 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
1713 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
1714 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
1715
1716 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
b75a7d8f 1717
46f4442e
A
1718 }
1719 } else { /* reset to something already in rules */
1720 top = FALSE;
374ca955 1721 }
b75a7d8f 1722 }
46f4442e
A
1723 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
1724 lastToken = sourceToken;
1725 } else {
1726 if(U_FAILURE(*status)) {
1727 return 0;
374ca955 1728 }
b75a7d8f 1729 }
46f4442e 1730 }
b75a7d8f 1731
46f4442e
A
1732 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1733 src->resultLen--;
1734 }
1735 return src->resultLen;
1736}
b75a7d8f 1737
46f4442e
A
1738void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
1739 U_NAMESPACE_USE
b75a7d8f 1740
46f4442e
A
1741 uint32_t nSize = 0;
1742 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
1743 if(U_FAILURE(*status)) {
1744 return;
1745 }
b75a7d8f 1746
46f4442e
A
1747 // set everything to zero, so that we can clean up gracefully
1748 uprv_memset(src, 0, sizeof(UColTokenParser));
1749
1750 // first we need to find options that don't like to be normalized,
1751 // like copy and remove...
1752 //const UChar *openBrace = rules;
1753 int32_t optionNumber = -1;
1754 const UChar *setStart;
1755 uint32_t i = 0;
1756 while(i < rulesLength) {
1757 if(rules[i] == 0x005B) {
1758 // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
1759 //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
1760 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
1761 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
1762 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1763 if(U_SUCCESS(*status)) {
1764 if(src->copySet == NULL) {
1765 src->copySet = newSet;
1766 } else {
1767 uset_addAll(src->copySet, newSet);
1768 uset_close(newSet);
1769 }
1770 } else {
1771 return;
1772 }
1773 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
1774 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1775 if(U_SUCCESS(*status)) {
1776 if(src->removeSet == NULL) {
1777 src->removeSet = newSet;
1778 } else {
1779 uset_addAll(src->removeSet, newSet);
1780 uset_close(newSet);
1781 }
1782 } else {
1783 return;
1784 }
1785 }
b75a7d8f 1786 }
46f4442e
A
1787 //openBrace++;
1788 i++;
b75a7d8f 1789 }
b75a7d8f 1790
46f4442e
A
1791 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
1792 /* test for NULL */
1793 if (src->source == NULL) {
1794 *status = U_MEMORY_ALLOCATION_ERROR;
1795 return;
1796 }
1797 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
1798 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
1799 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
1800 *status = U_ZERO_ERROR;
1801 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
1802 /* test for NULL */
1803 if (src->source == NULL) {
1804 *status = U_MEMORY_ALLOCATION_ERROR;
1805 return;
b75a7d8f 1806 }
46f4442e
A
1807 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
1808 }
1809 src->current = src->source;
1810 src->end = src->source+nSize;
1811 src->sourceCurrent = src->source;
1812 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
1813 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1814 src->varTop = NULL;
1815 src->UCA = UCA;
1816 src->invUCA = ucol_initInverseUCA(status);
1817 src->parsedToken.charsLen = 0;
1818 src->parsedToken.charsOffset = 0;
1819 src->parsedToken.extensionLen = 0;
1820 src->parsedToken.extensionOffset = 0;
1821 src->parsedToken.prefixLen = 0;
1822 src->parsedToken.prefixOffset = 0;
1823 src->parsedToken.flags = 0;
1824 src->parsedToken.strength = UCOL_TOK_UNSET;
1825 src->buildCCTabFlag = FALSE;
1826
1827 if(U_FAILURE(*status)) {
1828 return;
b75a7d8f 1829 }
46f4442e
A
1830 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
1831 if(U_FAILURE(*status)) {
1832 return;
1833 }
1834 uhash_setValueDeleter(src->tailored, uhash_freeBlock);
1835
1836 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
b75a7d8f 1837 /* test for NULL */
46f4442e 1838 if (src->opts == NULL) {
b75a7d8f
A
1839 *status = U_MEMORY_ALLOCATION_ERROR;
1840 return;
1841 }
46f4442e
A
1842
1843 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
1844
1845 // rulesToParse = src->source;
1846 src->lh = 0;
1847 src->listCapacity = 1024;
1848 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
1849 //Test for NULL
1850 if (src->lh == NULL) {
1851 *status = U_MEMORY_ALLOCATION_ERROR;
1852 return;
1853 }
1854 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
1855 src->resultLen = 0;
1856
1857 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1858
1859 // UCOL_RESET_TOP_VALUE
1860 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1861 // UCOL_FIRST_PRIMARY_IGNORABLE
1862 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
1863 // UCOL_LAST_PRIMARY_IGNORABLE
1864 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
1865 // UCOL_FIRST_SECONDARY_IGNORABLE
1866 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
1867 // UCOL_LAST_SECONDARY_IGNORABLE
1868 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
1869 // UCOL_FIRST_TERTIARY_IGNORABLE
1870 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
1871 // UCOL_LAST_TERTIARY_IGNORABLE
1872 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
1873 // UCOL_FIRST_VARIABLE
1874 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
1875 // UCOL_LAST_VARIABLE
1876 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
1877 // UCOL_FIRST_NON_VARIABLE
1878 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
1879 // UCOL_LAST_NON_VARIABLE
1880 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1881 // UCOL_FIRST_IMPLICIT
1882 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
1883 // UCOL_LAST_IMPLICIT
1884 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
1885 // UCOL_FIRST_TRAILING
1886 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
1887 // UCOL_LAST_TRAILING
1888 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
1889 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
b75a7d8f
A
1890}
1891
1892
1893void ucol_tok_closeTokenList(UColTokenParser *src) {
46f4442e
A
1894 if(src->copySet != NULL) {
1895 uset_close(src->copySet);
1896 }
1897 if(src->removeSet != NULL) {
1898 uset_close(src->removeSet);
1899 }
1900 if(src->tailored != NULL) {
1901 uhash_close(src->tailored);
1902 }
1903 if(src->lh != NULL) {
1904 uprv_free(src->lh);
1905 }
1906 if(src->source != NULL) {
1907 uprv_free(src->source);
1908 }
1909 if(src->opts != NULL) {
1910 uprv_free(src->opts);
1911 }
b75a7d8f
A
1912}
1913
1914#endif /* #if !UCONFIG_NO_COLLATION */
1915