2 *******************************************************************************
4 * Copyright (C) 2001-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucol_tok.cpp
10 * tab size: 8 (not used)
14 * created by: Vladimir Weinstein
16 * This module reads a tailoring rule string and produces a list of
17 * tokens that will be turned into collation elements
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_COLLATION
25 #include "unicode/uscript.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/uniset.h"
32 #include "patternprops.h"
38 // Define this only for debugging.
39 // #define DEBUG_FOR_COLL_RULES 1
41 #ifdef DEBUG_FOR_COLL_RULES
48 static int32_t U_CALLCONV
49 uhash_hashTokens(const UHashTok k
)
52 //uint32_t key = (uint32_t)k.integer;
53 UColToken
*key
= (UColToken
*)k
.pointer
;
55 int32_t len
= (key
->source
& 0xFF000000)>>24;
56 int32_t inc
= ((len
- 32) / 32) + 1;
58 const UChar
*p
= (key
->source
& 0x00FFFFFF) + *(key
->rulesToParseHdl
);
59 const UChar
*limit
= p
+ len
;
62 hash
= (hash
* 37) + *p
;
69 static UBool U_CALLCONV
70 uhash_compareTokens(const UHashTok key1
, const UHashTok key2
)
72 //uint32_t p1 = (uint32_t) key1.integer;
73 //uint32_t p2 = (uint32_t) key2.integer;
74 UColToken
*p1
= (UColToken
*)key1
.pointer
;
75 UColToken
*p2
= (UColToken
*)key2
.pointer
;
76 const UChar
*s1
= (p1
->source
& 0x00FFFFFF) + *(p1
->rulesToParseHdl
);
77 const UChar
*s2
= (p2
->source
& 0x00FFFFFF) + *(p2
->rulesToParseHdl
);
78 uint32_t s1L
= ((p1
->source
& 0xFF000000) >> 24);
79 uint32_t s2L
= ((p2
->source
& 0xFF000000) >> 24);
80 const UChar
*end
= s1
+s1L
-1;
85 if (p1
->source
== 0 || p2
->source
== 0) {
91 if(p1
->source
== p2
->source
) {
94 while((s1
< end
) && *s1
== *s2
) {
107 * Debug messages used to pinpoint where a format error occurred.
108 * A better way is to include context-sensitive information in syntaxError() function.
110 * To turn this debugging on, either uncomment the following line or pass -DDEBUG_FOR_FORMAT_ERROR
111 * on the compile line.
113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */
115 #ifdef DEBUG_FOR_FORMAT_ERROR
116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
118 #define DBG_FORMAT_ERROR
123 * Controls debug messages so that the output can be compared before and after a
124 * big change. Prints the information of every code point that comes out of the
125 * collation parser and its strength into a file. When a big change in format
126 * happens, the files before and after the change should be identical.
128 * To turn this debugging on, either uncomment the following line or pass -DDEBUG_FOR_CODE_POINTS
129 * on the compile line.
131 // #define DEBUG_FOR_CODE_POINTS 1
133 #ifdef DEBUG_FOR_CODE_POINTS
134 FILE* dfcp_fp
= NULL
;
140 uint32_t startContCE
;
142 uint32_t limitContCE
;
143 } indirectBoundaries
;
145 /* these values are used for finding CE values for indirect positioning. */
146 /* Indirect positioning is a mechanism for allowing resets on symbolic */
147 /* values. It only works for resets and you cannot tailor indirect names */
148 /* An indirect name can define either an anchor point or a range. An */
149 /* anchor point behaves in exactly the same way as a code point in reset */
150 /* would, except that it cannot be tailored. A range (currently only the */
151 /* [top] range is known) will explicitly set the upper bound for */
152 /* generated CEs, thus allowing for better control over how many CEs can */
153 /* be squeezed into the range without performance penalty. */
154 /* In that respect, we use [top] for tailoring of locales that use CJK */
155 /* characters. Other indirect values are currently a pure convenience, */
156 /* they can be used to assure that the CEs will be always positioned in */
157 /* the same place relative to a point with known properties (e.g. first */
158 /* primary ignorable). */
159 static indirectBoundaries ucolIndirectBoundaries
[15];
161 static indirectBoundaries ucolIndirectBoundaries[11] = {
162 { UCOL_RESET_TOP_VALUE, 0,
163 UCOL_NEXT_TOP_VALUE, 0 },
164 { UCOL_FIRST_PRIMARY_IGNORABLE, 0,
166 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
168 { UCOL_FIRST_SECONDARY_IGNORABLE, 0,
170 { UCOL_LAST_SECONDARY_IGNORABLE, 0,
172 { UCOL_FIRST_TERTIARY_IGNORABLE, 0,
174 { UCOL_LAST_TERTIARY_IGNORABLE, 0,
176 { UCOL_FIRST_VARIABLE, 0,
178 { UCOL_LAST_VARIABLE, 0,
180 { UCOL_FIRST_NON_VARIABLE, 0,
182 { UCOL_LAST_NON_VARIABLE, 0,
/*
 * Records the boundary collation elements for one indirect-positioning entry.
 *
 * indexR selects the slot in ucolIndirectBoundaries. start[0] and start[1] are
 * copied into startCE and startContCE. The limit fields are assigned in two
 * places in the visible code: once from end[0]/end[1] and once with 0. The
 * condition that chooses between those two assignments is not visible in this
 * excerpt (presumably a check on whether end was supplied - TODO confirm).
 */
187 static void setIndirectBoundaries(uint32_t indexR
, uint32_t *start
, uint32_t *end
) {
189 // Set values for the top - TODO: once we have values for all the indirects, we are going
190 // to initialize here.
191 ucolIndirectBoundaries
[indexR
].startCE
= start
[0];
192 ucolIndirectBoundaries
[indexR
].startContCE
= start
[1];
// Limit taken from the caller-supplied end pair.
194 ucolIndirectBoundaries
[indexR
].limitCE
= end
[0];
195 ucolIndirectBoundaries
[indexR
].limitContCE
= end
[1];
// Alternative assignment: no limit recorded for this entry.
197 ucolIndirectBoundaries
[indexR
].limitCE
= 0;
198 ucolIndirectBoundaries
[indexR
].limitContCE
= 0;
204 void syntaxError(const UChar
* rules
,
207 UParseError
* parseError
)
209 parseError
->offset
= pos
;
210 parseError
->line
= 0 ; /* we are not using line numbers */
213 int32_t start
= (pos
< U_PARSE_CONTEXT_LEN
)? 0 : (pos
- (U_PARSE_CONTEXT_LEN
-1));
216 u_memcpy(parseError
->preContext
,rules
+start
,stop
-start
);
217 //null terminate the buffer
218 parseError
->preContext
[stop
-start
] = 0;
222 stop
= ((pos
+U_PARSE_CONTEXT_LEN
)<= rulesLen
)? (pos
+(U_PARSE_CONTEXT_LEN
-1)) :
226 u_memcpy(parseError
->postContext
,rules
+start
,stop
-start
);
227 //null terminate the buffer
228 parseError
->postContext
[stop
-start
]= 0;
230 parseError
->postContext
[0] = 0;
235 void ucol_uprv_tok_setOptionInImage(UColOptionSet
*opts
, UColAttribute attrib
, UColAttributeValue value
) {
237 case UCOL_HIRAGANA_QUATERNARY_MODE
:
238 opts
->hiraganaQ
= value
;
240 case UCOL_FRENCH_COLLATION
:
241 opts
->frenchCollation
= value
;
243 case UCOL_ALTERNATE_HANDLING
:
244 opts
->alternateHandling
= value
;
246 case UCOL_CASE_FIRST
:
247 opts
->caseFirst
= value
;
249 case UCOL_CASE_LEVEL
:
250 opts
->caseLevel
= value
;
252 case UCOL_NORMALIZATION_MODE
:
253 opts
->normalizationMode
= value
;
256 opts
->strength
= value
;
258 case UCOL_NUMERIC_COLLATION
:
259 opts
->numericCollation
= value
;
261 case UCOL_ATTRIBUTE_COUNT
:
267 #define UTOK_OPTION_COUNT 22
269 static UBool didInit
= FALSE
;
270 /* we can be strict, or we can be lenient */
271 /* I'd surely be lenient with the option arguments */
272 /* maybe even with options */
273 U_STRING_DECL(suboption_00
, "non-ignorable", 13);
274 U_STRING_DECL(suboption_01
, "shifted", 7);
276 U_STRING_DECL(suboption_02
, "lower", 5);
277 U_STRING_DECL(suboption_03
, "upper", 5);
278 U_STRING_DECL(suboption_04
, "off", 3);
279 U_STRING_DECL(suboption_05
, "on", 2);
280 U_STRING_DECL(suboption_06
, "1", 1);
281 U_STRING_DECL(suboption_07
, "2", 1);
282 U_STRING_DECL(suboption_08
, "3", 1);
283 U_STRING_DECL(suboption_09
, "4", 1);
284 U_STRING_DECL(suboption_10
, "I", 1);
286 U_STRING_DECL(suboption_11
, "primary", 7);
287 U_STRING_DECL(suboption_12
, "secondary", 9);
288 U_STRING_DECL(suboption_13
, "tertiary", 8);
289 U_STRING_DECL(suboption_14
, "variable", 8);
290 U_STRING_DECL(suboption_15
, "regular", 7);
291 U_STRING_DECL(suboption_16
, "implicit", 8);
292 U_STRING_DECL(suboption_17
, "trailing", 8);
295 U_STRING_DECL(option_00
, "undefined", 9);
296 U_STRING_DECL(option_01
, "rearrange", 9);
297 U_STRING_DECL(option_02
, "alternate", 9);
298 U_STRING_DECL(option_03
, "backwards", 9);
299 U_STRING_DECL(option_04
, "variable top", 12);
300 U_STRING_DECL(option_05
, "top", 3);
301 U_STRING_DECL(option_06
, "normalization", 13);
302 U_STRING_DECL(option_07
, "caseLevel", 9);
303 U_STRING_DECL(option_08
, "caseFirst", 9);
304 U_STRING_DECL(option_09
, "scriptOrder", 11);
305 U_STRING_DECL(option_10
, "charsetname", 11);
306 U_STRING_DECL(option_11
, "charset", 7);
307 U_STRING_DECL(option_12
, "before", 6);
308 U_STRING_DECL(option_13
, "hiraganaQ", 9);
309 U_STRING_DECL(option_14
, "strength", 8);
310 U_STRING_DECL(option_15
, "first", 5);
311 U_STRING_DECL(option_16
, "last", 4);
312 U_STRING_DECL(option_17
, "optimize", 8);
313 U_STRING_DECL(option_18
, "suppressContractions", 20);
314 U_STRING_DECL(option_19
, "numericOrdering", 15);
315 U_STRING_DECL(option_20
, "import", 6);
316 U_STRING_DECL(option_21
, "reorder", 7);
319 [last variable] last variable value
320 [last primary ignorable] largest CE for primary ignorable
321 [last secondary ignorable] largest CE for secondary ignorable
322 [last tertiary ignorable] largest CE for tertiary ignorable
323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
327 static const ucolTokSuboption alternateSub
[2] = {
328 {suboption_00
, 13, UCOL_NON_IGNORABLE
},
329 {suboption_01
, 7, UCOL_SHIFTED
}
332 static const ucolTokSuboption caseFirstSub
[3] = {
333 {suboption_02
, 5, UCOL_LOWER_FIRST
},
334 {suboption_03
, 5, UCOL_UPPER_FIRST
},
335 {suboption_04
, 3, UCOL_OFF
},
338 static const ucolTokSuboption onOffSub
[2] = {
339 {suboption_04
, 3, UCOL_OFF
},
340 {suboption_05
, 2, UCOL_ON
}
343 static const ucolTokSuboption frenchSub
[1] = {
344 {suboption_07
, 1, UCOL_ON
}
347 static const ucolTokSuboption beforeSub
[3] = {
348 {suboption_06
, 1, UCOL_PRIMARY
},
349 {suboption_07
, 1, UCOL_SECONDARY
},
350 {suboption_08
, 1, UCOL_TERTIARY
}
353 static const ucolTokSuboption strengthSub
[5] = {
354 {suboption_06
, 1, UCOL_PRIMARY
},
355 {suboption_07
, 1, UCOL_SECONDARY
},
356 {suboption_08
, 1, UCOL_TERTIARY
},
357 {suboption_09
, 1, UCOL_QUATERNARY
},
358 {suboption_10
, 1, UCOL_IDENTICAL
},
361 static const ucolTokSuboption firstLastSub
[7] = {
362 {suboption_11
, 7, UCOL_PRIMARY
},
363 {suboption_12
, 9, UCOL_PRIMARY
},
364 {suboption_13
, 8, UCOL_PRIMARY
},
365 {suboption_14
, 8, UCOL_PRIMARY
},
366 {suboption_15
, 7, UCOL_PRIMARY
},
367 {suboption_16
, 8, UCOL_PRIMARY
},
368 {suboption_17
, 8, UCOL_PRIMARY
},
372 OPTION_ALTERNATE_HANDLING
= 0,
373 OPTION_FRENCH_COLLATION
,
376 OPTION_NORMALIZATION_MODE
,
377 OPTION_HIRAGANA_QUATERNARY
,
379 OPTION_NUMERIC_COLLATION
,
380 OPTION_NORMAL_OPTIONS_LIMIT
= OPTION_NUMERIC_COLLATION
,
388 OPTION_SUPPRESS_CONTRACTIONS
,
397 static const ucolTokOption rulesOptions
[UTOK_OPTION_COUNT
] = {
398 /*00*/ {option_02
, 9, alternateSub
, 2, UCOL_ALTERNATE_HANDLING
}, /*"alternate" */
399 /*01*/ {option_03
, 9, frenchSub
, 1, UCOL_FRENCH_COLLATION
}, /*"backwards" */
400 /*02*/ {option_07
, 9, onOffSub
, 2, UCOL_CASE_LEVEL
}, /*"caseLevel" */
401 /*03*/ {option_08
, 9, caseFirstSub
, 3, UCOL_CASE_FIRST
}, /*"caseFirst" */
402 /*04*/ {option_06
, 13, onOffSub
, 2, UCOL_NORMALIZATION_MODE
}, /*"normalization" */
403 /*05*/ {option_13
, 9, onOffSub
, 2, UCOL_HIRAGANA_QUATERNARY_MODE
}, /*"hiraganaQ" */
404 /*06*/ {option_14
, 8, strengthSub
, 5, UCOL_STRENGTH
}, /*"strength" */
405 /*07*/ {option_19
, 15, onOffSub
, 2, UCOL_NUMERIC_COLLATION
}, /*"numericOrdering"*/
406 /*08*/ {option_04
, 12, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"variable top" */
407 /*09*/ {option_01
, 9, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"rearrange" */
408 /*10*/ {option_12
, 6, beforeSub
, 3, UCOL_ATTRIBUTE_COUNT
}, /*"before" */
409 /*11*/ {option_05
, 3, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"top" */
410 /*12*/ {option_15
, 5, firstLastSub
, 7, UCOL_ATTRIBUTE_COUNT
}, /*"first" */
411 /*13*/ {option_16
, 4, firstLastSub
, 7, UCOL_ATTRIBUTE_COUNT
}, /*"last" */
412 /*14*/ {option_17
, 8, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"optimize" */
413 /*15*/ {option_18
, 20, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"suppressContractions" */
414 /*16*/ {option_00
, 9, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"undefined" */
415 /*17*/ {option_09
, 11, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"scriptOrder" */
416 /*18*/ {option_10
, 11, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"charsetname" */
417 /*19*/ {option_11
, 7, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"charset" */
418 /*20*/ {option_20
, 6, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"import" */
419 /*21*/ {option_21
, 7, NULL
, 0, UCOL_ATTRIBUTE_COUNT
} /*"reorder" */
/*
 * Case-insensitive comparison of two UChar strings, bounded by n code units.
 * The visible comparison step subtracts the u_tolower() mapping of the current
 * unit of s2 from that of s1.
 */
423 int32_t u_strncmpNoCase(const UChar
*s1
,
430 rc
= (int32_t)u_tolower(*s1
) - (int32_t)u_tolower(*s2
);
// Stop on the first difference, at the NUL terminator of s1, or once n units have been compared.
431 if(rc
!= 0 || *s1
== 0 || --n
== 0) {
/*
 * Initializes the UChar buffers for the option and sub-option names that are
 * declared earlier in this file with U_STRING_DECL.
 *
 * Every (literal, length) pair passed to U_STRING_INIT here must be identical
 * to the pair used in the matching U_STRING_DECL, and the length must equal
 * the number of characters in the literal.
 */
442 void ucol_uprv_tok_initData() {
444 U_STRING_INIT(suboption_00
, "non-ignorable", 13);
445 U_STRING_INIT(suboption_01
, "shifted", 7);
447 U_STRING_INIT(suboption_02
, "lower", 5);
448 U_STRING_INIT(suboption_03
, "upper", 5);
449 U_STRING_INIT(suboption_04
, "off", 3);
450 U_STRING_INIT(suboption_05
, "on", 2);
452 U_STRING_INIT(suboption_06
, "1", 1);
453 U_STRING_INIT(suboption_07
, "2", 1);
454 U_STRING_INIT(suboption_08
, "3", 1);
455 U_STRING_INIT(suboption_09
, "4", 1);
456 U_STRING_INIT(suboption_10
, "I", 1);
458 U_STRING_INIT(suboption_11
, "primary", 7);
459 U_STRING_INIT(suboption_12
, "secondary", 9);
460 U_STRING_INIT(suboption_13
, "tertiary", 8);
461 U_STRING_INIT(suboption_14
, "variable", 8);
462 U_STRING_INIT(suboption_15
, "regular", 7);
463 U_STRING_INIT(suboption_16
, "implicit", 8);
464 U_STRING_INIT(suboption_17
, "trailing", 8);
467 U_STRING_INIT(option_00
, "undefined", 9);
468 U_STRING_INIT(option_01
, "rearrange", 9);
469 U_STRING_INIT(option_02
, "alternate", 9);
470 U_STRING_INIT(option_03
, "backwards", 9);
471 U_STRING_INIT(option_04
, "variable top", 12);
472 U_STRING_INIT(option_05
, "top", 3);
473 U_STRING_INIT(option_06
, "normalization", 13);
474 U_STRING_INIT(option_07
, "caseLevel", 9);
475 U_STRING_INIT(option_08
, "caseFirst", 9);
476 U_STRING_INIT(option_09
, "scriptOrder", 11);
477 U_STRING_INIT(option_10
, "charsetname", 11);
478 U_STRING_INIT(option_11
, "charset", 7);
479 U_STRING_INIT(option_12
, "before", 6);
480 U_STRING_INIT(option_13
, "hiraganaQ", 9);
481 U_STRING_INIT(option_14
, "strength", 8);
482 U_STRING_INIT(option_15
, "first", 5);
483 U_STRING_INIT(option_16
, "last", 4);
484 U_STRING_INIT(option_17
, "optimize", 8);
485 U_STRING_INIT(option_18
, "suppressContractions", 20);
486 U_STRING_INIT(option_19
, "numericOrdering", 15);
// The literal is exactly "import" (6 characters), matching U_STRING_DECL(option_20, "import", 6).
// A trailing space here would make the literal one character longer than the declared length.
487 U_STRING_INIT(option_20
, "import", 6);
488 U_STRING_INIT(option_21
, "reorder", 7);
494 // This function reads basic options to set in the runtime collator
495 // used by data driven tests. Should not support build time options
496 U_CAPI
const UChar
* U_EXPORT2
497 ucol_tok_getNextArgument(const UChar
*start
, const UChar
*end
,
498 UColAttribute
*attrib
, UColAttributeValue
*value
,
503 UBool foundOption
= FALSE
;
504 const UChar
*optionArg
= NULL
;
506 ucol_uprv_tok_initData();
508 while(start
< end
&& PatternProps::isWhiteSpace(*start
)) { /* eat whitespace */
514 /* skip opening '[' */
515 if(*start
== 0x005b) {
518 *status
= U_ILLEGAL_ARGUMENT_ERROR
; // no opening '['
522 while(i
< UTOK_OPTION_COUNT
) {
523 if(u_strncmpNoCase(start
, rulesOptions
[i
].optionName
, rulesOptions
[i
].optionLen
) == 0) {
525 if(end
- start
> rulesOptions
[i
].optionLen
) {
526 optionArg
= start
+rulesOptions
[i
].optionLen
+1; /* start of the options, skip space */
527 while(PatternProps::isWhiteSpace(*optionArg
)) { /* eat whitespace */
537 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
542 for(j
= 0; j
<rulesOptions
[i
].subSize
; j
++) {
543 if(u_strncmpNoCase(optionArg
, rulesOptions
[i
].subopts
[j
].subName
, rulesOptions
[i
].subopts
[j
].subLen
) == 0) {
544 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
545 *attrib
= rulesOptions
[i
].attr
;
546 *value
= rulesOptions
[i
].subopts
[j
].attrVal
;
547 optionArg
+= rulesOptions
[i
].subopts
[j
].subLen
;
548 while(PatternProps::isWhiteSpace(*optionArg
)) { /* eat whitespace */
551 if(*optionArg
== 0x005d) {
555 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
561 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
/*
 * Locates a bracketed set pattern and opens it as a USet.
 *
 * start is advanced to the first '[' (U+005B). From there the code scans
 * forward, tracking bracket nesting in noOpenBraces, while start+current stays
 * below end and the nesting count is non-zero. The pattern handed to
 * uset_openPattern() is the first `current` code units starting at that '['.
 *
 * *status is set to U_ILLEGAL_ARGUMENT_ERROR when the brackets are still
 * unbalanced after the scan, or when no ']' is found at or after start+current.
 * The value returned on that error path is not visible in this excerpt.
 */
566 USet
*ucol_uprv_tok_readAndSetUnicodeSet(const UChar
*start
, const UChar
*end
, UErrorCode
*status
) {
// NOTE(review): the visible condition dereferences *start without comparing start to end;
// confirm that callers guarantee a '[' is present.
567 while(*start
!= 0x005b) { /* advance until we find the first '[' */
570 // now we need to get a balanced set of '[]'. The problem is that a set can have
571 // many, and *end points to the first closing ']'
572 int32_t noOpenBraces
= 1;
573 int32_t current
= 1; // skip the opening brace
574 while(start
+current
< end
&& noOpenBraces
!= 0) {
575 if(start
[current
] == 0x005b) {
577 } else if(start
[current
] == 0x005D) { // closing brace
// Reject unbalanced input, and input with no further ']' at or after the scan position.
583 if(noOpenBraces
!= 0 || u_strchr(start
+current
, 0x005d /*']'*/) == NULL
) {
584 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
587 return uset_openPattern(start
, current
, status
);
591 * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
592 * @param start Pointer to the start UChar.
593 * @param end Pointer to the last valid pointer beyond which the option will not extend.
594 * @param optionArg Address of the pointer at which the options start (after the option name)
595 * @return The index of the option, or -1 if the option is not valid.
598 int32_t ucol_uprv_tok_readOption(const UChar
*start
, const UChar
*end
, const UChar
**optionArg
) {
600 ucol_uprv_tok_initData();
602 while(PatternProps::isWhiteSpace(*start
)) { /* eat whitespace */
605 while(i
< UTOK_OPTION_COUNT
) {
606 if(u_strncmpNoCase(start
, rulesOptions
[i
].optionName
, rulesOptions
[i
].optionLen
) == 0) {
607 if(end
- start
> rulesOptions
[i
].optionLen
) {
608 *optionArg
= start
+rulesOptions
[i
].optionLen
; /* End of option name; start of the options */
609 while(PatternProps::isWhiteSpace(**optionArg
)) { /* eat whitespace */
617 if(i
== UTOK_OPTION_COUNT
) {
618 i
= -1; // didn't find an option
/*
 * Parses the argument list of a script-reorder option into src->reorderCodes.
 *
 * The region scanned runs from src->current up to the first ']' (U+005D) found
 * before src->end. Tokens are delimited by U+0020; a token shorter than four
 * code units is rejected.
 *
 * Two passes are made over the same region:
 *   1. validate token lengths and establish codeCount (the statement that
 *      increments codeCount is not visible in this excerpt); an empty list
 *      sets U_INVALID_FORMAT_ERROR;
 *   2. convert each token to chars and resolve it, first with
 *      ucol_findReorderingEntry() and, if that yields USCRIPT_INVALID_CODE,
 *      with u_getPropertyValueEnum(UCHAR_SCRIPT, ...). A token that is still
 *      unresolved sets U_ILLEGAL_ARGUMENT_ERROR.
 *
 * On allocation failure *status is set to U_MEMORY_ALLOCATION_ERROR,
 * src->reorderCodesLength is reset to 0 and the function returns.
 */
625 void ucol_tok_parseScriptReorder(UColTokenParser
*src
, UErrorCode
*status
) {
626 int32_t codeCount
= 0;
627 int32_t codeIndex
= 0;
629 int32_t tokenLength
= 0;
632 const UChar
* current
= src
->current
;
// end is the first ']' in the remaining input (NULL if there is none, in which case
// the loops below do not run and the empty-list check reports the error).
633 const UChar
* end
= u_memchr(src
->current
, 0x005d, src
->end
- src
->current
);
635 // eat leading whitespace
636 while(current
< end
&& u_isWhitespace(*current
)) {
// First pass: validate each space-delimited token.
640 while(current
< end
) {
641 space
= u_memchr(current
, 0x0020, end
- current
);
642 space
= space
== 0 ? end
: space
;
643 tokenLength
= space
- current
;
644 if (tokenLength
< 4) {
645 *status
= U_INVALID_FORMAT_ERROR
;
649 current
+= tokenLength
;
650 while(current
< end
&& u_isWhitespace(*current
)) { /* eat whitespace */
655 if (codeCount
== 0) {
656 *status
= U_INVALID_FORMAT_ERROR
;
659 src
->reorderCodesLength
= codeCount
;
660 src
->reorderCodes
= (int32_t*)uprv_malloc(codeCount
* sizeof(int32_t));
// Fail cleanly instead of writing through a NULL pointer in the second pass.
if (src->reorderCodes == NULL) {
src->reorderCodesLength = 0;
*status = U_MEMORY_ALLOCATION_ERROR;
return;
}
661 current
= src
->current
;
663 // eat leading whitespace
664 while(current
< end
&& u_isWhitespace(*current
)) {
// Second pass: resolve each token to a reorder code.
668 while(current
< end
) {
669 space
= u_memchr(current
, 0x0020, end
- current
);
670 space
= space
== 0 ? end
: space
;
671 tokenLength
= space
- current
;
672 if (tokenLength
< 4) {
673 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// NOTE(review): tokenLength has no visible upper bound before it is used to fill and
// terminate conversion; the declaration of conversion is not visible in this excerpt.
// Confirm its capacity exceeds the longest accepted token, or add a bound check.
676 u_UCharsToChars(current
, conversion
, tokenLength
);
677 conversion
[tokenLength
] = '\0';
678 src
->reorderCodes
[codeIndex
] = ucol_findReorderingEntry(conversion
);
// Not a special reordering name: fall back to the script property value names.
679 if (src
->reorderCodes
[codeIndex
] == USCRIPT_INVALID_CODE
) {
680 src
->reorderCodes
[codeIndex
] = u_getPropertyValueEnum(UCHAR_SCRIPT
, conversion
);
682 if (src
->reorderCodes
[codeIndex
] == USCRIPT_INVALID_CODE
) {
683 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
687 current
+= tokenLength
;
688 while(current
< end
&& u_isWhitespace(*current
)) { /* eat whitespace */
694 // reads and conforms to various options in rules
695 // end is the position of the first closing ']'
696 // However, some of the options take an UnicodeSet definition
697 // which needs to duplicate the closing ']'
698 // for example: '[copy [\uAC00-\uD7FF]]'
699 // These options will move end to the second ']' and the
700 // caller will set the current to it.
702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser
*src
, UErrorCode
*status
) {
703 const UChar
* start
= src
->current
;
706 const UChar
*optionArg
= NULL
;
710 start
++; /*skip opening '['*/
711 i
= ucol_uprv_tok_readOption(start
, src
->end
, &optionArg
);
713 src
->current
= optionArg
;
717 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
719 int32_t noOpenBraces
= 1;
721 case OPTION_ALTERNATE_HANDLING
:
722 case OPTION_FRENCH_COLLATION
:
723 case OPTION_CASE_LEVEL
:
724 case OPTION_CASE_FIRST
:
725 case OPTION_NORMALIZATION_MODE
:
726 case OPTION_HIRAGANA_QUATERNARY
:
727 case OPTION_STRENGTH
:
728 case OPTION_NUMERIC_COLLATION
:
730 for(j
= 0; j
<rulesOptions
[i
].subSize
; j
++) {
731 if(u_strncmpNoCase(optionArg
, rulesOptions
[i
].subopts
[j
].subName
, rulesOptions
[i
].subopts
[j
].subLen
) == 0) {
732 ucol_uprv_tok_setOptionInImage(src
->opts
, rulesOptions
[i
].attr
, rulesOptions
[i
].subopts
[j
].attrVal
);
733 result
= UCOL_TOK_SUCCESS
;
738 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
741 case OPTION_VARIABLE_TOP
:
742 result
= UCOL_TOK_SUCCESS
| UCOL_TOK_VARIABLE_TOP
;
744 case OPTION_REARRANGE
:
745 result
= UCOL_TOK_SUCCESS
;
749 for(j
= 0; j
<rulesOptions
[i
].subSize
; j
++) {
750 if(u_strncmpNoCase(optionArg
, rulesOptions
[i
].subopts
[j
].subName
, rulesOptions
[i
].subopts
[j
].subLen
) == 0) {
751 result
= UCOL_TOK_SUCCESS
| (rulesOptions
[i
].subopts
[j
].attrVal
+ 1);
756 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
759 case OPTION_TOP
: /* we are going to have an array with structures of limit CEs */
760 /* index to this array will be src->parsedToken.indirectIndex*/
761 src
->parsedToken
.indirectIndex
= 0;
762 result
= UCOL_TOK_SUCCESS
| UCOL_TOK_TOP
;
765 case OPTION_LAST
: /* first, last */
766 for(j
= 0; j
<rulesOptions
[i
].subSize
; j
++) {
767 if(u_strncmpNoCase(optionArg
, rulesOptions
[i
].subopts
[j
].subName
, rulesOptions
[i
].subopts
[j
].subLen
) == 0) {
768 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
769 // element of indirect boundaries is reserved for top.
770 src
->parsedToken
.indirectIndex
= (uint16_t)(i
-OPTION_FIRST
+1+j
*2);
771 result
= UCOL_TOK_SUCCESS
| UCOL_TOK_TOP
;;
775 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
778 case OPTION_OPTIMIZE
:
779 case OPTION_SUPPRESS_CONTRACTIONS
: // copy and remove are handled before normalization
780 // we need to move end here
781 src
->current
++; // skip opening brace
782 while(src
->current
< src
->end
&& noOpenBraces
!= 0) {
783 if(*src
->current
== 0x005b) {
785 } else if(*src
->current
== 0x005D) { // closing brace
790 result
= UCOL_TOK_SUCCESS
;
792 case OPTION_SCRIPTREORDER
:
793 ucol_tok_parseScriptReorder(src
, status
);
796 *status
= U_UNSUPPORTED_ERROR
;
800 src
->current
= u_memchr(src
->current
, 0x005d, (int32_t)(src
->end
-src
->current
));
/*
 * Appends len code units from stuff at src->extraCurrent, growing the buffer
 * that starts at src->source when the append would reach src->extraEnd.
 *
 * A NULL stuff or a non-positive len is handled by the guard at the top (its
 * body is not visible in this excerpt). If the reallocation fails, *status is
 * set to U_MEMORY_ALLOCATION_ERROR.
 */
805 inline void ucol_tok_addToExtraCurrent(UColTokenParser
*src
, const UChar
*stuff
, int32_t len
, UErrorCode
*status
) {
806 if (stuff
== NULL
|| len
<= 0) {
// Wrap the input in a UnicodeString so it can be detached from the buffer before a realloc (see below).
809 UnicodeString
tempStuff(FALSE
, stuff
, len
);
// Not enough room between extraCurrent and extraEnd: grow the buffer.
// NOTE(review): the capacity is doubled once; the visible code does not re-check that the
// doubled buffer has room for len units - confirm callers only pass small len values.
810 if(src
->extraCurrent
+len
>= src
->extraEnd
) {
// stuff points into the buffer that is about to be reallocated.
812 if (stuff
>= src
->source
&& stuff
<= src
->end
) {
813 // Copy the "stuff" contents into tempStuff's own buffer.
814 // UnicodeString is copy-on-write.
816 tempStuff
.setCharAt(0, tempStuff
[0]);
// New size is twice the current capacity (extraEnd - source), in UChars.
821 UChar
*newSrc
= (UChar
*)uprv_realloc(src
->source
, (src
->extraEnd
-src
->source
)*2*sizeof(UChar
));
// Rebase every pointer into the buffer using its offset from the old src->source;
// src->source itself is updated last so the offsets stay valid.
823 src
->current
= newSrc
+ (src
->current
- src
->source
);
824 src
->extraCurrent
= newSrc
+ (src
->extraCurrent
- src
->source
);
825 src
->end
= newSrc
+ (src
->end
- src
->source
);
826 src
->extraEnd
= newSrc
+ (src
->extraEnd
-src
->source
)*2;
827 src
->sourceCurrent
= newSrc
+ (src
->sourceCurrent
-src
->source
);
828 src
->source
= newSrc
;
830 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Single-unit store; the condition selecting this over the u_memcpy below is not visible here.
835 *src
->extraCurrent
++ = tempStuff
[0];
837 u_memcpy(src
->extraCurrent
, tempStuff
.getBuffer(), len
);
838 src
->extraCurrent
+= len
;
842 inline UBool
ucol_tok_doSetTop(UColTokenParser
*src
, UErrorCode
*status
) {
847 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
849 buff
[1] = (UChar
)(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startCE
>> 16);
850 buff
[2] = (UChar
)(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startCE
& 0xFFFF);
851 if(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
== 0) {
852 src
->parsedToken
.charsLen
= 3;
853 ucol_tok_addToExtraCurrent(src
, buff
, 3, status
);
855 buff
[3] = (UChar
)(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
>> 16);
856 buff
[4] = (UChar
)(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
& 0xFFFF);
857 src
->parsedToken
.charsLen
= 5;
858 ucol_tok_addToExtraCurrent(src
, buff
, 5, status
);
/*
 * Tests c against the six line-terminator code units listed in the cases below
 * (LF, CR, FF, NEL, LS, PS). The return statements are not visible in this
 * excerpt; presumably TRUE for these values and FALSE otherwise - TODO confirm.
 */
863 static UBool
isCharNewLine(UChar c
){
865 case 0x000A: /* LF */
866 case 0x000D: /* CR */
867 case 0x000C: /* FF */
868 case 0x0085: /* NEL */
869 case 0x2028: /* LS */
870 case 0x2029: /* PS */
878 * This function is called several times when a range is processed. Each time, the next code point
880 * The following variables must be set before calling this function:
881 * src->currentRangeCp: The current code point to process.
882 * src->lastRangeCp: The last code point in the range.
883 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
886 ucol_tok_processNextCodePointInRange(UColTokenParser
*src
,
889 // Append current code point to source
890 UChar buff
[U16_MAX_LENGTH
];
893 uint32_t nChars
= U16_LENGTH(src
->currentRangeCp
);
894 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
895 src
->parsedToken
.charsLen
= nChars
;
897 U16_APPEND_UNSAFE(buff
, i
, src
->currentRangeCp
);
898 ucol_tok_addToExtraCurrent(src
, buff
, nChars
, status
);
900 ++src
->currentRangeCp
;
901 if (src
->currentRangeCp
> src
->lastRangeCp
) {
902 src
->inRange
= FALSE
;
904 if (src
->currentStarredCharIndex
> src
->lastStarredCharIndex
) {
905 src
->isStarred
= FALSE
;
908 src
->previousCp
= src
->currentRangeCp
;
914 * This function is called several times when a starred list is processed. Each time, the next code point
915 * in the list is processed.
916 * The following variables must be set before calling this function:
917 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point.
918 * src->lastStarredCharIndex: Index to the last character in the list.
919 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
922 ucol_tok_processNextTokenInStarredList(UColTokenParser
*src
)
924 // Extract the characters corresponding to the next code point.
926 src
->parsedToken
.charsOffset
= src
->currentStarredCharIndex
;
927 int32_t prev
= src
->currentStarredCharIndex
;
928 U16_NEXT(src
->source
, src
->currentStarredCharIndex
, (uint32_t)(src
->end
- src
->source
), cp
);
929 src
->parsedToken
.charsLen
= src
->currentStarredCharIndex
- prev
;
931 // When we are done parsing the starred string, turn the flag off so that
932 // the normal processing is restored.
933 if (src
->currentStarredCharIndex
> src
->lastStarredCharIndex
) {
934 src
->isStarred
= FALSE
;
936 src
->previousCp
= cp
;
941 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
943 * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
944 * # : Comment character
947 * < : Primary collation
948 * << : Secondary collation
949 * <<< : Tertiary collation
950 * ; : Secondary collation
951 * , : Tertiary collation
956 * ! : Java Thai modifier, ignored
962 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz
963 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges can also be given, so &a<*b-ex-z is equivalent to the above.
964 * This function does not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as the tokens "&a",
965 * "<*b", "-ex", "-z". The strength (< in this case), whether we are in a list, whether we are in a range, and the
966 * previous character are cached so that the calling program can do further splitting.
969 ucol_tok_parseNextTokenInternal(UColTokenParser
*src
,
971 UParseError
*parseError
,
974 UBool variableTop
= FALSE
;
976 UBool inChars
= TRUE
;
977 UBool inQuote
= FALSE
;
978 UBool wasInQuote
= FALSE
;
980 UBool isEscaped
= FALSE
;
982 // TODO: replace these variables with src->parsedToken counterparts
983 // no need to use them anymore since we have src->parsedToken.
984 // Ideally, token parser would be a nice class... Once, when I have
985 // more time (around 2020 probably).
986 uint32_t newExtensionLen
= 0;
987 uint32_t extensionOffset
= 0;
988 uint32_t newStrength
= UCOL_TOK_UNSET
;
991 src
->parsedToken
.charsOffset
= 0; src
->parsedToken
.charsLen
= 0;
992 src
->parsedToken
.prefixOffset
= 0; src
->parsedToken
.prefixLen
= 0;
993 src
->parsedToken
.indirectIndex
= 0;
995 while (src
->current
< src
->end
) {
996 UChar ch
= *(src
->current
);
999 if (ch
== 0x0027/*'\''*/) {
1002 if ((src
->parsedToken
.charsLen
== 0) || inChars
) {
1003 if(src
->parsedToken
.charsLen
== 0) {
1004 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1006 src
->parsedToken
.charsLen
++;
1008 if(newExtensionLen
== 0) {
1009 extensionOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1014 }else if(isEscaped
){
1016 if (newStrength
== UCOL_TOK_UNSET
) {
1017 *status
= U_INVALID_FORMAT_ERROR
;
1018 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1021 // enabling rules to start with non-tokens a < b
1022 // newStrength = UCOL_TOK_RESET;
1024 if(ch
!= 0x0000 && src
->current
!= src
->end
) {
1026 if(src
->parsedToken
.charsLen
== 0) {
1027 src
->parsedToken
.charsOffset
= (uint32_t)(src
->current
- src
->source
);
1029 src
->parsedToken
.charsLen
++;
1031 if(newExtensionLen
== 0) {
1032 extensionOffset
= (uint32_t)(src
->current
- src
->source
);
1038 if(!PatternProps::isWhiteSpace(ch
)) {
1039 /* Sets the strength for this entry */
1041 case 0x003D/*'='*/ :
1042 if (newStrength
!= UCOL_TOK_UNSET
) {
1046 /* if we start with strength, we'll reset to top */
1047 if(startOfRules
== TRUE
) {
1048 src
->parsedToken
.indirectIndex
= 5;
1049 top
= ucol_tok_doSetTop(src
, status
);
1050 newStrength
= UCOL_TOK_RESET
;
1053 newStrength
= UCOL_IDENTICAL
;
1054 if(*(src
->current
+1) == 0x002A) {/*'*'*/
1056 src
->isStarred
= TRUE
;
1061 if (newStrength
!= UCOL_TOK_UNSET
) {
1065 /* if we start with strength, we'll reset to top */
1066 if(startOfRules
== TRUE
) {
1067 src
->parsedToken
.indirectIndex
= 5;
1068 top
= ucol_tok_doSetTop(src
, status
);
1069 newStrength
= UCOL_TOK_RESET
;
1072 newStrength
= UCOL_TERTIARY
;
1076 if (newStrength
!= UCOL_TOK_UNSET
) {
1080 /* if we start with strength, we'll reset to top */
1081 if(startOfRules
== TRUE
) {
1082 src
->parsedToken
.indirectIndex
= 5;
1083 top
= ucol_tok_doSetTop(src
, status
);
1084 newStrength
= UCOL_TOK_RESET
;
1087 newStrength
= UCOL_SECONDARY
;
1091 if (newStrength
!= UCOL_TOK_UNSET
) {
1095 /* if we start with strength, we'll reset to top */
1096 if(startOfRules
== TRUE
) {
1097 src
->parsedToken
.indirectIndex
= 5;
1098 top
= ucol_tok_doSetTop(src
, status
);
1099 newStrength
= UCOL_TOK_RESET
;
1102 /* before this, do a scan to verify whether this is */
1103 /* another strength */
1104 if(*(src
->current
+1) == 0x003C) {
1106 if(*(src
->current
+1) == 0x003C) {
1107 src
->current
++; /* three in a row! */
1108 newStrength
= UCOL_TERTIARY
;
1109 } else { /* two in a row */
1110 newStrength
= UCOL_SECONDARY
;
1112 } else { /* just one */
1113 newStrength
= UCOL_PRIMARY
;
1115 if(*(src
->current
+1) == 0x002A) {/*'*'*/
1117 src
->isStarred
= TRUE
;
1122 if (newStrength
!= UCOL_TOK_UNSET
) {
1127 newStrength
= UCOL_TOK_RESET
; /* PatternEntry::RESET = 0 */
1131 /* options - read an option, analyze it */
1132 if(u_strchr(src
->current
, 0x005d /*']'*/) != NULL
) {
1133 uint8_t result
= ucol_uprv_tok_readAndSetOption(src
, status
);
1134 if(U_SUCCESS(*status
)) {
1135 if(result
& UCOL_TOK_TOP
) {
1136 if(newStrength
== UCOL_TOK_RESET
) {
1137 top
= ucol_tok_doSetTop(src
, status
);
1138 if(before
) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
1139 src
->parsedToken
.charsLen
+=2;
1142 ucol_tok_addToExtraCurrent(src
, buff
, 2, status
);
1148 *status
= U_INVALID_FORMAT_ERROR
;
1149 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1152 } else if(result
& UCOL_TOK_VARIABLE_TOP
) {
1153 if(newStrength
!= UCOL_TOK_RESET
&& newStrength
!= UCOL_TOK_UNSET
) {
1155 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1156 src
->parsedToken
.charsLen
= 1;
1158 ucol_tok_addToExtraCurrent(src
, buff
, 1, status
);
1162 *status
= U_INVALID_FORMAT_ERROR
;
1163 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1166 } else if (result
& UCOL_TOK_BEFORE
){
1167 if(newStrength
== UCOL_TOK_RESET
) {
1168 before
= result
& UCOL_TOK_BEFORE
;
1170 *status
= U_INVALID_FORMAT_ERROR
;
1171 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1176 *status
= U_INVALID_FORMAT_ERROR
;
1177 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1183 case 0x0021/*! skip java thai modifier reordering*/:
1186 wasInQuote
= FALSE
; /* if we were copying source characters, we want to stop now */
1187 inChars
= FALSE
; /* we're now processing expansion */
1189 case 0x005C /* back slash for escaped chars */:
1192 /* found a quote, we're gonna start copying */
1193 case 0x0027/*'\''*/:
1194 if (newStrength
== UCOL_TOK_UNSET
) { /* quote is illegal until we have a strength */
1195 *status
= U_INVALID_FORMAT_ERROR
;
1196 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1199 // enabling rules to start with a non-token character a < b
1200 // newStrength = UCOL_TOK_RESET;
1205 if(inChars
) { /* we're doing characters */
1206 if(wasInQuote
== FALSE
) {
1207 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1209 if (src
->parsedToken
.charsLen
!= 0) {
1210 ucol_tok_addToExtraCurrent(src
, src
->current
- src
->parsedToken
.charsLen
, src
->parsedToken
.charsLen
, status
);
1212 src
->parsedToken
.charsLen
++;
1213 } else { /* we're doing an expansion */
1214 if(wasInQuote
== FALSE
) {
1215 extensionOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1217 if (newExtensionLen
!= 0) {
1218 ucol_tok_addToExtraCurrent(src
, src
->current
- newExtensionLen
, newExtensionLen
, status
);
1225 ch
= *(++(src
->current
));
1226 if(ch
== 0x0027) { /* copy the double quote */
1227 ucol_tok_addToExtraCurrent(src
, &ch
, 1, status
);
1232 /* '@' is french only if the strength is not currently set */
1233 /* if it is, it's just a regular character in collation rules */
1235 if (newStrength
== UCOL_TOK_UNSET
) {
1236 src
->opts
->frenchCollation
= UCOL_ON
;
1240 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1241 // we want to store read characters to the prefix part and continue reading
1242 // the characters (proper way would be to restart reading the chars, but in
1243 // that case we would have to complicate the token hasher, which I do not
1244 // intend to play with. Instead, we will do prefixes when prefixes are due
1245 // (before adding the elements).
1246 src
->parsedToken
.prefixOffset
= src
->parsedToken
.charsOffset
;
1247 src
->parsedToken
.prefixLen
= src
->parsedToken
.charsLen
;
1249 if(inChars
) { /* we're doing characters */
1250 if(wasInQuote
== FALSE
) {
1251 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1253 if (src
->parsedToken
.charsLen
!= 0) {
1254 ucol_tok_addToExtraCurrent(src
, src
->current
- src
->parsedToken
.charsLen
, src
->parsedToken
.charsLen
, status
);
1256 src
->parsedToken
.charsLen
++;
1262 ch
= *(++(src
->current
));
1263 // skip whitespace between '|' and the character
1264 } while (PatternProps::isWhiteSpace(ch
));
1269 //break; // We want to store the whole prefix/character sequence. If we break
1270 // the '|' is going to get lost.
1272 case 0x002D /*-*/: /* A range. */
1273 if (newStrength
!= UCOL_TOK_UNSET
) {
1274 // While processing the pending token, the isStarred field
1275 // is reset, so it needs to be saved for the next
1277 src
->savedIsStarred
= src
->isStarred
;
1280 src
->isStarred
= src
->savedIsStarred
;
1282 // Ranges are valid only in starred tokens.
1283 if (!src
->isStarred
) {
1284 *status
= U_INVALID_FORMAT_ERROR
;
1285 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1289 newStrength
= src
->parsedToken
.strength
;
1290 src
->inRange
= TRUE
;
1293 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1295 ch
= *(++(src
->current
));
1296 } while (!isCharNewLine(ch
));
1300 if (newStrength
== UCOL_TOK_UNSET
) {
1301 *status
= U_INVALID_FORMAT_ERROR
;
1302 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1307 if (ucol_tok_isSpecialChar(ch
) && (inQuote
== FALSE
)) {
1308 *status
= U_INVALID_FORMAT_ERROR
;
1309 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1314 if(ch
== 0x0000 && src
->current
+1 == src
->end
) {
1319 if(src
->parsedToken
.charsLen
== 0) {
1320 src
->parsedToken
.charsOffset
= (uint32_t)(src
->current
- src
->source
);
1322 src
->parsedToken
.charsLen
++;
1324 if(newExtensionLen
== 0) {
1325 extensionOffset
= (uint32_t)(src
->current
- src
->source
);
1337 if(inQuote
|| !PatternProps::isWhiteSpace(ch
)) {
1338 ucol_tok_addToExtraCurrent(src
, &ch
, 1, status
);
1348 if (newStrength
== UCOL_TOK_UNSET
) {
1352 if (src
->parsedToken
.charsLen
== 0 && top
== FALSE
) {
1353 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1354 *status
= U_INVALID_FORMAT_ERROR
;
1359 src
->parsedToken
.strength
= newStrength
;
1360 src
->parsedToken
.extensionOffset
= extensionOffset
;
1361 src
->parsedToken
.extensionLen
= newExtensionLen
;
1362 src
->parsedToken
.flags
= (UCOL_TOK_VARIABLE_TOP
* (variableTop
?1:0)) | (UCOL_TOK_TOP
* (top
?1:0)) | before
;
1364 return src
->current
;
1368 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
1369 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
1371 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
1372 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates
1373 * it to separate tokens and returns one by one. In order to do that, the necessary states are
1374 * cached as member variables of the token parser.
1375 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
1376 * starting character as a single list token (which is separated into individual characters here)
1377 * and as another list token starting with the last character in the range. Before expanding it
1378 * as a list of tokens, this function expands the range by filling the intermediate characters and
1379 * returns them one by one as separate tokens.
1380 * Necessary checks are done for invalid combinations.
1382 U_CAPI
const UChar
* U_EXPORT2
1383 ucol_tok_parseNextToken(UColTokenParser
*src
,
1385 UParseError
*parseError
,
1388 const UChar
*nextToken
;
1391 // We are not done processing a range. Continue it.
1392 return ucol_tok_processNextCodePointInRange(src
, status
);
1393 } else if (src
->isStarred
) {
1394 // We are not done processing a starred token. Continue it.
1395 return ucol_tok_processNextTokenInStarredList(src
);
1398 // Get the next token.
1399 nextToken
= ucol_tok_parseNextTokenInternal(src
, startOfRules
, parseError
, status
);
1401 if (nextToken
== NULL
) {
1406 // A new range has started.
1407 // Check whether it is a chain of ranges with more than one hyphen.
1408 if (src
->lastRangeCp
> 0 && src
->lastRangeCp
== src
->previousCp
) {
1409 *status
= U_INVALID_FORMAT_ERROR
;
1410 syntaxError(src
->source
,src
->parsedToken
.charsOffset
-1,
1411 src
->parsedToken
.charsOffset
+src
->parsedToken
.charsLen
, parseError
);
1416 // The current token indicates the second code point of the range.
1417 // Process just that, and then proceed with the star.
1418 src
->currentStarredCharIndex
= src
->parsedToken
.charsOffset
;
1419 U16_NEXT(src
->source
, src
->currentStarredCharIndex
,
1420 (uint32_t)(src
->end
- src
->source
), src
->lastRangeCp
);
1421 if (src
->lastRangeCp
<= src
->previousCp
) {
1422 *status
= U_INVALID_FORMAT_ERROR
;
1423 syntaxError(src
->source
,src
->parsedToken
.charsOffset
-1,
1424 src
->parsedToken
.charsOffset
+src
->parsedToken
.charsLen
,parseError
);
1429 // Set current range code point to process the range loop
1430 src
->currentRangeCp
= src
->previousCp
+ 1;
1432 src
->lastStarredCharIndex
= src
->parsedToken
.charsOffset
+ src
->parsedToken
.charsLen
- 1;
1434 return ucol_tok_processNextCodePointInRange(src
, status
);
1435 } else if (src
->isStarred
) {
1436 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
1437 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
1438 // separated into several tokens and returned.
1439 src
->currentStarredCharIndex
= src
->parsedToken
.charsOffset
;
1440 src
->lastStarredCharIndex
= src
->parsedToken
.charsOffset
+ src
->parsedToken
.charsLen
- 1;
1442 return ucol_tok_processNextTokenInStarredList(src
);
1444 // Set previous codepoint
1445 U16_GET(src
->source
, 0, src
->parsedToken
.charsOffset
, (uint32_t)(src
->end
- src
->source
), src
->previousCp
);
1452 Processing Description
1453 1 Build a ListList. Each list has a header, which contains two lists (positive
1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1456 2 As you process, you keep a LAST pointer that points to the last token you
1461 static UColToken
*ucol_tok_initAReset(UColTokenParser
*src
, const UChar
*expand
, uint32_t *expandNext
,
1462 UParseError
*parseError
, UErrorCode
*status
)
1464 if(src
->resultLen
== src
->listCapacity
) {
1465 // Unfortunately, this won't work, as we store addresses of lhs in token
1466 src
->listCapacity
*= 2;
1467 src
->lh
= (UColTokListHeader
*)uprv_realloc(src
->lh
, src
->listCapacity
*sizeof(UColTokListHeader
));
1468 if(src
->lh
== NULL
) {
1469 *status
= U_MEMORY_ALLOCATION_ERROR
;
1473 /* do the reset thing */
1474 UColToken
*sourceToken
= (UColToken
*)uprv_malloc(sizeof(UColToken
));
1476 if (sourceToken
== NULL
) {
1477 *status
= U_MEMORY_ALLOCATION_ERROR
;
1480 sourceToken
->rulesToParseHdl
= &(src
->source
);
1481 sourceToken
->source
= src
->parsedToken
.charsLen
<< 24 | src
->parsedToken
.charsOffset
;
1482 sourceToken
->expansion
= src
->parsedToken
.extensionLen
<< 24 | src
->parsedToken
.extensionOffset
;
1484 sourceToken
->debugSource
= *(src
->source
+ src
->parsedToken
.charsOffset
);
1485 sourceToken
->debugExpansion
= *(src
->source
+ src
->parsedToken
.extensionOffset
);
1487 // keep the flags around so that we know about before
1488 sourceToken
->flags
= src
->parsedToken
.flags
;
1490 if(src
->parsedToken
.prefixOffset
!= 0) {
1491 // this is a syntax error
1492 *status
= U_INVALID_FORMAT_ERROR
;
1493 syntaxError(src
->source
,src
->parsedToken
.charsOffset
-1,src
->parsedToken
.charsOffset
+src
->parsedToken
.charsLen
,parseError
);
1495 uprv_free(sourceToken
);
1498 sourceToken
->prefix
= 0;
1501 sourceToken
->polarity
= UCOL_TOK_POLARITY_POSITIVE
; /* TODO: this should also handle reverse */
1502 sourceToken
->strength
= UCOL_TOK_RESET
;
1503 sourceToken
->next
= NULL
;
1504 sourceToken
->previous
= NULL
;
1505 sourceToken
->noOfCEs
= 0;
1506 sourceToken
->noOfExpCEs
= 0;
1507 sourceToken
->listHeader
= &src
->lh
[src
->resultLen
];
1509 src
->lh
[src
->resultLen
].first
= NULL
;
1510 src
->lh
[src
->resultLen
].last
= NULL
;
1511 src
->lh
[src
->resultLen
].first
= NULL
;
1512 src
->lh
[src
->resultLen
].last
= NULL
;
1514 src
->lh
[src
->resultLen
].reset
= sourceToken
;
1517 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1518 First convert all expansions into normal form. Examples:
1519 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1520 d * ... into &x * c/y * d * ...
1521 Note: reset values can never have expansions, although they can cause the
1522 very next item to have one. They may be contractions, if they are found
1523 earlier in the list.
1526 if(expand
!= NULL
) {
1527 /* check to see if there is an expansion */
1528 if(src
->parsedToken
.charsLen
> 1) {
1529 uint32_t resetCharsOffset
;
1530 resetCharsOffset
= (uint32_t)(expand
- src
->source
);
1531 sourceToken
->source
= ((resetCharsOffset
- src
->parsedToken
.charsOffset
) << 24) | src
->parsedToken
.charsOffset
;
1532 *expandNext
= ((src
->parsedToken
.charsLen
+ src
->parsedToken
.charsOffset
- resetCharsOffset
)<<24) | (resetCharsOffset
);
1538 uhash_put(src
->tailored
, sourceToken
, sourceToken
, status
);
1544 inline UColToken
*getVirginBefore(UColTokenParser
*src
, UColToken
*sourceToken
, uint8_t strength
, UParseError
*parseError
, UErrorCode
*status
) {
1545 if(U_FAILURE(*status
)) {
1548 /* this is a virgin before - we need to fish the anchor from the UCA */
1550 uint32_t baseCE
= UCOL_NOT_FOUND
, baseContCE
= UCOL_NOT_FOUND
;
1551 uint32_t CE
, SecondCE
;
1553 if(sourceToken
!= NULL
) {
1554 uprv_init_collIterate(src
->UCA
, src
->source
+((sourceToken
->source
)&0xFFFFFF), 1, &s
, status
);
1556 uprv_init_collIterate(src
->UCA
, src
->source
+src
->parsedToken
.charsOffset
/**charsOffset*/, 1, &s
, status
);
1558 if(U_FAILURE(*status
)) {
1562 baseCE
= ucol_getNextCE(src
->UCA
, &s
, status
) & 0xFFFFFF3F;
1563 baseContCE
= ucol_getNextCE(src
->UCA
, &s
, status
);
1564 if(baseContCE
== UCOL_NO_MORE_CES
) {
1569 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
1571 uint32_t expandNext
= 0;
1574 if((baseCE
& 0xFF000000) >= (consts
->UCA_PRIMARY_IMPLICIT_MIN
<<24) && (baseCE
& 0xFF000000) <= (consts
->UCA_PRIMARY_IMPLICIT_MAX
<<24) ) { /* implicits - */
1575 uint32_t primary
= (baseCE
& UCOL_PRIMARYMASK
) | ((baseContCE
& UCOL_PRIMARYMASK
) >> 16);
1576 uint32_t raw
= uprv_uca_getRawFromImplicit(primary
);
1577 ch
= uprv_uca_getCodePointFromRaw(raw
-1);
1578 uint32_t primaryCE
= uprv_uca_getImplicitFromRaw(raw
-1);
1579 CE
= (primaryCE
& UCOL_PRIMARYMASK
) | 0x0505;
1580 SecondCE
= ((primaryCE
<< 16) & UCOL_PRIMARYMASK
) | UCOL_CONTINUATION_MARKER
;
1582 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1583 *src
->extraCurrent
++ = 0xFFFE;
1584 *src
->extraCurrent
++ = (UChar
)ch
;
1585 src
->parsedToken
.charsLen
++;
1587 key
.source
= (src
->parsedToken
.charsLen
/**newCharsLen*/ << 24) | src
->parsedToken
.charsOffset
/**charsOffset*/;
1588 key
.rulesToParseHdl
= &(src
->source
);
1590 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1591 sourceToken
= (UColToken
*)uhash_get(src
->tailored
, &key
);
1593 if(sourceToken
== NULL
) {
1594 src
->lh
[src
->resultLen
].baseCE
= CE
& 0xFFFFFF3F;
1595 if(isContinuation(SecondCE
)) {
1596 src
->lh
[src
->resultLen
].baseContCE
= SecondCE
;
1598 src
->lh
[src
->resultLen
].baseContCE
= 0;
1600 src
->lh
[src
->resultLen
].nextCE
= 0;
1601 src
->lh
[src
->resultLen
].nextContCE
= 0;
1602 src
->lh
[src
->resultLen
].previousCE
= 0;
1603 src
->lh
[src
->resultLen
].previousContCE
= 0;
1605 src
->lh
[src
->resultLen
].indirect
= FALSE
;
1607 sourceToken
= ucol_tok_initAReset(src
, 0, &expandNext
, parseError
, status
);
1611 /* invPos = */ ucol_inv_getPrevCE(src
, baseCE
, baseContCE
, &CE
, &SecondCE
, strength
);
1613 // we got the previous CE. Now we need to see if the difference between
1614 // the two CEs is really of the requested strength.
1615 // if it's a bigger difference (we asked for secondary and got primary), we
1616 // need to modify the CE.
1617 if(ucol_getCEStrengthDifference(baseCE
, baseContCE
, CE
, SecondCE
) < strength
) {
1618 // adjust the strength
1619 // now we are in the situation where our baseCE should actually be modified in
1620 // order to get the CE in the right position.
1621 if(strength
== UCOL_SECONDARY
) {
1622 CE
= baseCE
- 0x0200;
1623 } else { // strength == UCOL_TERTIARY
1627 if(strength
== UCOL_SECONDARY
) {
1628 SecondCE
= baseContCE
- 0x0200;
1629 } else { // strength == UCOL_TERTIARY
1630 SecondCE
= baseContCE
- 0x02;
1636 // the code below relies on getting a code point from the inverse table, in order to be
1637 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1638 // 1. There are many code points that have the same CE
1639 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1640 // Also, in case when there is no equivalent strength before an element, we have to actually
1641 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1642 // before a is a primary difference.
1644 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1647 ch
= CETable
[3*invPos
+2];
1649 if((ch
& UCOL_INV_SIZEMASK
) != 0) {
1650 uint16_t *conts
= (uint16_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->conts
);
1651 uint32_t offset
= (ch
& UCOL_INV_OFFSETMASK
);
1655 *src
->extraCurrent
++ = (UChar
)ch
;
1656 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
- 1);
1657 src
->parsedToken
.charsLen
= 1;
1659 // We got an UCA before. However, this might have been tailored.
1662 // &[before 3]\u306a<<<\u306a|\u309d
1665 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1666 key
.source
= (src
->parsedToken
.charsLen
/**newCharsLen*/ << 24) | src
->parsedToken
.charsOffset
/**charsOffset*/;
1667 key
.rulesToParseHdl
= &(src
->source
);
1669 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1670 sourceToken
= (UColToken
*)uhash_get(src
->tailored
, &key
);
1673 // here is how it should be. The situation such as &[before 1]a < x, should be
1674 // resolved exactly as if we wrote &a > x.
1675 // therefore, I don't really care if the UCA value before a has been changed.
1676 // However, I do care if the strength between my element and the previous element
1677 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1678 // have to construct the base CE.
1682 // if we found a tailored thing, we have to use the UCA value and construct
1683 // a new reset token with constructed name
1684 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1685 // character to which we want to anchor is already tailored.
1686 // We need to construct a new token which will be the anchor
1688 //*(src->extraCurrent-1) = 0xFFFE;
1689 //*src->extraCurrent++ = (UChar)ch;
1691 src
->parsedToken
.charsOffset
-= 10;
1692 src
->parsedToken
.charsLen
+= 10;
1693 src
->lh
[src
->resultLen
].baseCE
= CE
& 0xFFFFFF3F;
1694 if(isContinuation(SecondCE
)) {
1695 src
->lh
[src
->resultLen
].baseContCE
= SecondCE
;
1697 src
->lh
[src
->resultLen
].baseContCE
= 0;
1699 src
->lh
[src
->resultLen
].nextCE
= 0;
1700 src
->lh
[src
->resultLen
].nextContCE
= 0;
1701 src
->lh
[src
->resultLen
].previousCE
= 0;
1702 src
->lh
[src
->resultLen
].previousContCE
= 0;
1704 src
->lh
[src
->resultLen
].indirect
= FALSE
;
1706 sourceToken
= ucol_tok_initAReset(src
, 0, &expandNext
, parseError
, status
);
1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser
*src
, UParseError
*parseError
, UErrorCode
*status
) {
1715 UColToken
*lastToken
= NULL
;
1716 const UChar
*parseEnd
= NULL
;
1717 uint32_t expandNext
= 0;
1718 UBool variableTop
= FALSE
;
1721 UColTokListHeader
*ListList
= NULL
;
1723 src
->parsedToken
.strength
= UCOL_TOK_UNSET
;
1727 if(U_FAILURE(*status
)) {
1730 #ifdef DEBUG_FOR_CODE_POINTS
1732 sprintf(filename
, "/tmp/debug_for_cp_%09d.txt", getpid());
1733 dfcp_fp
= fopen(filename
, "a");
1734 fprintf(stdout
, "Output is in the file %s.\n", filename
);
1737 #ifdef DEBUG_FOR_COLL_RULES
1739 UnicodeString(src
->source
).toUTF8String(s3
);
1740 std::cout
<< "src->source = " << s3
<< std::endl
;
1743 while(src
->current
< src
->end
|| src
->isStarred
) {
1744 src
->parsedToken
.prefixOffset
= 0;
1746 parseEnd
= ucol_tok_parseNextToken(src
,
1747 (UBool
)(lastToken
== NULL
),
1751 specs
= src
->parsedToken
.flags
;
1754 variableTop
= ((specs
& UCOL_TOK_VARIABLE_TOP
) != 0);
1755 top
= ((specs
& UCOL_TOK_TOP
) != 0);
1757 if(U_SUCCESS(*status
) && parseEnd
!= NULL
) {
1758 UColToken
*sourceToken
= NULL
;
1760 uint32_t lastStrength
= UCOL_TOK_UNSET
;
1762 if(lastToken
!= NULL
) {
1763 lastStrength
= lastToken
->strength
;
1766 #ifdef DEBUG_FOR_CODE_POINTS
1768 U16_GET(src
->source
, 0, src
->parsedToken
.charsOffset
, (uint32_t)(src
->extraEnd
- src
->source
), cp
);
1769 fprintf(dfcp_fp
, "Code point = %x, Strength = %x\n", cp
, src
->parsedToken
.strength
);
1771 //key = newCharsLen << 24 | charsOffset;
1773 key
.source
= src
->parsedToken
.charsLen
<< 24 | src
->parsedToken
.charsOffset
;
1774 key
.rulesToParseHdl
= &(src
->source
);
1776 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
1777 sourceToken
= (UColToken
*)uhash_get(src
->tailored
, &key
);
1779 if(src
->parsedToken
.strength
!= UCOL_TOK_RESET
) {
1780 if(lastToken
== NULL
) { /* this means that rules haven't started properly */
1781 *status
= U_INVALID_FORMAT_ERROR
;
1782 syntaxError(src
->source
,0,(int32_t)(src
->end
-src
->source
),parseError
);
1786 /* 6 Otherwise (when relation != reset) */
1787 if(sourceToken
== NULL
) {
1788 /* If sourceToken is null, create new one, */
1789 sourceToken
= (UColToken
*)uprv_malloc(sizeof(UColToken
));
1791 if (sourceToken
== NULL
) {
1792 *status
= U_MEMORY_ALLOCATION_ERROR
;
1795 sourceToken
->rulesToParseHdl
= &(src
->source
);
1796 sourceToken
->source
= src
->parsedToken
.charsLen
<< 24 | src
->parsedToken
.charsOffset
;
1798 sourceToken
->debugSource
= *(src
->source
+ src
->parsedToken
.charsOffset
);
1800 sourceToken
->prefix
= src
->parsedToken
.prefixLen
<< 24 | src
->parsedToken
.prefixOffset
;
1801 sourceToken
->debugPrefix
= *(src
->source
+ src
->parsedToken
.prefixOffset
);
1803 sourceToken
->polarity
= UCOL_TOK_POLARITY_POSITIVE
; /* TODO: this should also handle reverse */
1804 sourceToken
->next
= NULL
;
1805 sourceToken
->previous
= NULL
;
1806 sourceToken
->noOfCEs
= 0;
1807 sourceToken
->noOfExpCEs
= 0;
1808 // keep the flags around so that we know about before
1809 sourceToken
->flags
= src
->parsedToken
.flags
;
1810 uhash_put(src
->tailored
, sourceToken
, sourceToken
, status
);
1811 if(U_FAILURE(*status
)) {
1815 /* we could have fished out a reset here */
1816 if(sourceToken
->strength
!= UCOL_TOK_RESET
&& lastToken
!= sourceToken
) {
1817 /* otherwise remove sourceToken from where it was. */
1818 if(sourceToken
->next
!= NULL
) {
1819 if(sourceToken
->next
->strength
> sourceToken
->strength
) {
1820 sourceToken
->next
->strength
= sourceToken
->strength
;
1822 sourceToken
->next
->previous
= sourceToken
->previous
;
1824 sourceToken
->listHeader
->last
= sourceToken
->previous
;
1827 if(sourceToken
->previous
!= NULL
) {
1828 sourceToken
->previous
->next
= sourceToken
->next
;
1830 sourceToken
->listHeader
->first
= sourceToken
->next
;
1832 sourceToken
->next
= NULL
;
1833 sourceToken
->previous
= NULL
;
1837 sourceToken
->strength
= src
->parsedToken
.strength
;
1838 sourceToken
->listHeader
= lastToken
->listHeader
;
1841 1. Find the strongest strength in each list, and set strongestP and strongestN
1842 accordingly in the headers.
1844 if(lastStrength
== UCOL_TOK_RESET
1845 || sourceToken
->listHeader
->first
== 0) {
1846 /* If LAST is a reset
1847 insert sourceToken in the list. */
1848 if(sourceToken
->listHeader
->first
== 0) {
1849 sourceToken
->listHeader
->first
= sourceToken
;
1850 sourceToken
->listHeader
->last
= sourceToken
;
1851 } else { /* we need to find a place for us */
1852 /* and we'll get in front of the same strength */
1853 if(sourceToken
->listHeader
->first
->strength
<= sourceToken
->strength
) {
1854 sourceToken
->next
= sourceToken
->listHeader
->first
;
1855 sourceToken
->next
->previous
= sourceToken
;
1856 sourceToken
->listHeader
->first
= sourceToken
;
1857 sourceToken
->previous
= NULL
;
1859 lastToken
= sourceToken
->listHeader
->first
;
1860 while(lastToken
->next
!= NULL
&& lastToken
->next
->strength
> sourceToken
->strength
) {
1861 lastToken
= lastToken
->next
;
1863 if(lastToken
->next
!= NULL
) {
1864 lastToken
->next
->previous
= sourceToken
;
1866 sourceToken
->listHeader
->last
= sourceToken
;
1868 sourceToken
->previous
= lastToken
;
1869 sourceToken
->next
= lastToken
->next
;
1870 lastToken
->next
= sourceToken
;
1874 /* Otherwise (when LAST is not a reset)
1875 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1876 otherwise insert before.
1877 when inserting after or before, search to the next position with the same
1878 strength in that direction. (This is called postpone insertion). */
1879 if(sourceToken
!= lastToken
) {
1880 if(lastToken
->polarity
== sourceToken
->polarity
) {
1881 while(lastToken
->next
!= NULL
&& lastToken
->next
->strength
> sourceToken
->strength
) {
1882 lastToken
= lastToken
->next
;
1884 sourceToken
->previous
= lastToken
;
1885 if(lastToken
->next
!= NULL
) {
1886 lastToken
->next
->previous
= sourceToken
;
1888 sourceToken
->listHeader
->last
= sourceToken
;
1891 sourceToken
->next
= lastToken
->next
;
1892 lastToken
->next
= sourceToken
;
1894 while(lastToken
->previous
!= NULL
&& lastToken
->previous
->strength
> sourceToken
->strength
) {
1895 lastToken
= lastToken
->previous
;
1897 sourceToken
->next
= lastToken
;
1898 if(lastToken
->previous
!= NULL
) {
1899 lastToken
->previous
->next
= sourceToken
;
1901 sourceToken
->listHeader
->first
= sourceToken
;
1903 sourceToken
->previous
= lastToken
->previous
;
1904 lastToken
->previous
= sourceToken
;
1906 } else { /* repeated one thing twice in rules, stay with the stronger strength */
1907 if(lastStrength
< sourceToken
->strength
) {
1908 sourceToken
->strength
= lastStrength
;
1913 /* if the token was a variable top, we're gonna put it in */
1914 if(variableTop
== TRUE
&& src
->varTop
== NULL
) {
1915 variableTop
= FALSE
;
1916 src
->varTop
= sourceToken
;
1919 // Treat the expansions.
1920 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1921 // (&abc * d * e <=> &ab * d / c * e / c)
1922 // if both of them are in effect for a token, they are combined.
1924 sourceToken
->expansion
= src
->parsedToken
.extensionLen
<< 24 | src
->parsedToken
.extensionOffset
;
1926 if(expandNext
!= 0) {
1927 if(sourceToken
->strength
== UCOL_PRIMARY
) { /* primary strength kills off the implicit expansion */
1929 } else if(sourceToken
->expansion
== 0) { /* if there is no expansion, implicit is just added to the token */
1930 sourceToken
->expansion
= expandNext
;
1931 } else { /* there is both explicit and implicit expansion. We need to make a combination */
1932 uprv_memcpy(src
->extraCurrent
, src
->source
+ (expandNext
& 0xFFFFFF), (expandNext
>> 24)*sizeof(UChar
));
1933 uprv_memcpy(src
->extraCurrent
+(expandNext
>> 24), src
->source
+ src
->parsedToken
.extensionOffset
, src
->parsedToken
.extensionLen
*sizeof(UChar
));
1934 sourceToken
->expansion
= (uint32_t)(((expandNext
>> 24) + src
->parsedToken
.extensionLen
)<<24 | (uint32_t)(src
->extraCurrent
- src
->source
));
1935 src
->extraCurrent
+= (expandNext
>> 24) + src
->parsedToken
.extensionLen
;
1939 // This is just for debugging purposes
1940 if(sourceToken
->expansion
!= 0) {
1941 sourceToken
->debugExpansion
= *(src
->source
+ src
->parsedToken
.extensionOffset
);
1943 sourceToken
->debugExpansion
= 0;
1945 // if the previous token was a reset before, the strength of this
1946 // token must match the strength of before. Otherwise we have an
1947 // undefined situation.
1948 // In other words, we currently have a cludge which we use to
1949 // represent &a >> x. This is written as &[before 2]a << x.
1950 if((lastToken
->flags
& UCOL_TOK_BEFORE
) != 0) {
1951 uint8_t beforeStrength
= (lastToken
->flags
& UCOL_TOK_BEFORE
) - 1;
1952 if(beforeStrength
!= sourceToken
->strength
) {
1953 *status
= U_INVALID_FORMAT_ERROR
;
1954 syntaxError(src
->source
,0,(int32_t)(src
->end
-src
->source
),parseError
);
1960 if(lastToken
!= NULL
&& lastStrength
== UCOL_TOK_RESET
) {
1961 /* if the previous token was also a reset, */
1962 /*this means that we have two consecutive resets */
1963 /* and we want to remove the previous one if empty*/
1964 if(src
->resultLen
> 0 && ListList
[src
->resultLen
-1].first
== NULL
) {
1969 if(sourceToken
== NULL
) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1970 uint32_t searchCharsLen
= src
->parsedToken
.charsLen
;
1971 while(searchCharsLen
> 1 && sourceToken
== NULL
) {
1973 //key = searchCharsLen << 24 | charsOffset;
1975 key
.source
= searchCharsLen
<< 24 | src
->parsedToken
.charsOffset
;
1976 key
.rulesToParseHdl
= &(src
->source
);
1977 sourceToken
= (UColToken
*)uhash_get(src
->tailored
, &key
);
1979 if(sourceToken
!= NULL
) {
1980 expandNext
= (src
->parsedToken
.charsLen
- searchCharsLen
) << 24 | (src
->parsedToken
.charsOffset
+ searchCharsLen
);
1984 if((specs
& UCOL_TOK_BEFORE
) != 0) { /* we're doing before */
1985 if(top
== FALSE
) { /* there is no indirection */
1986 uint8_t strength
= (specs
& UCOL_TOK_BEFORE
) - 1;
1987 if(sourceToken
!= NULL
&& sourceToken
->strength
!= UCOL_TOK_RESET
) {
1988 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1989 while(sourceToken
->strength
> strength
&& sourceToken
->previous
!= NULL
) {
1990 sourceToken
= sourceToken
->previous
;
1992 /* here, either we hit the strength or NULL */
1993 if(sourceToken
->strength
== strength
) {
1994 if(sourceToken
->previous
!= NULL
) {
1995 sourceToken
= sourceToken
->previous
;
1996 } else { /* start of list */
1997 sourceToken
= sourceToken
->listHeader
->reset
;
1999 } else { /* we hit NULL */
2000 /* we should be doing the else part */
2001 sourceToken
= sourceToken
->listHeader
->reset
;
2002 sourceToken
= getVirginBefore(src
, sourceToken
, strength
, parseError
, status
);
2005 sourceToken
= getVirginBefore(src
, sourceToken
, strength
, parseError
, status
);
2007 } else { /* this is both before and indirection */
2009 ListList
[src
->resultLen
].previousCE
= 0;
2010 ListList
[src
->resultLen
].previousContCE
= 0;
2011 ListList
[src
->resultLen
].indirect
= TRUE
;
2012 /* we need to do slightly more work. we need to get the baseCE using the */
2013 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
2015 uint8_t strength
= (specs
& UCOL_TOK_BEFORE
) - 1;
2016 uint32_t baseCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startCE
;
2017 uint32_t baseContCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
;//&0xFFFFFF3F;
2018 uint32_t CE
= UCOL_NOT_FOUND
, SecondCE
= UCOL_NOT_FOUND
;
2020 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
2021 if((baseCE
& 0xFF000000) >= (consts
->UCA_PRIMARY_IMPLICIT_MIN
<<24) &&
2022 (baseCE
& 0xFF000000) <= (consts
->UCA_PRIMARY_IMPLICIT_MAX
<<24) ) { /* implicits - */
2023 uint32_t primary
= (baseCE
& UCOL_PRIMARYMASK
) | ((baseContCE
& UCOL_PRIMARYMASK
) >> 16);
2024 uint32_t raw
= uprv_uca_getRawFromImplicit(primary
);
2025 uint32_t primaryCE
= uprv_uca_getImplicitFromRaw(raw
-1);
2026 CE
= (primaryCE
& UCOL_PRIMARYMASK
) | 0x0505;
2027 SecondCE
= ((primaryCE
<< 16) & UCOL_PRIMARYMASK
) | UCOL_CONTINUATION_MARKER
;
2029 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
2030 ucol_inv_getPrevCE(src
, baseCE
, baseContCE
, &CE
, &SecondCE
, strength
);
2033 ListList
[src
->resultLen
].baseCE
= CE
;
2034 ListList
[src
->resultLen
].baseContCE
= SecondCE
;
2035 ListList
[src
->resultLen
].nextCE
= 0;
2036 ListList
[src
->resultLen
].nextContCE
= 0;
2038 sourceToken
= ucol_tok_initAReset(src
, 0, &expandNext
, parseError
, status
);
2043 /* 5 If the relation is a reset:
2044 If sourceToken is null
2045 Create new list, create new sourceToken, make the baseCE from source, put
2046 the sourceToken in ListHeader of the new list */
2047 if(sourceToken
== NULL
) {
2049 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
2050 First convert all expansions into normal form. Examples:
2051 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
2052 d * ... into &x * c/y * d * ...
2053 Note: reset values can never have expansions, although they can cause the
2054 very next item to have one. They may be contractions, if they are found
2055 earlier in the list.
2059 uint32_t CE
= UCOL_NOT_FOUND
, SecondCE
= UCOL_NOT_FOUND
;
2061 uprv_init_collIterate(src
->UCA
, src
->source
+src
->parsedToken
.charsOffset
, src
->parsedToken
.charsLen
, &s
, status
);
2063 CE
= ucol_getNextCE(src
->UCA
, &s
, status
);
2064 const UChar
*expand
= s
.pos
;
2065 SecondCE
= ucol_getNextCE(src
->UCA
, &s
, status
);
2067 ListList
[src
->resultLen
].baseCE
= CE
& 0xFFFFFF3F;
2068 if(isContinuation(SecondCE
)) {
2069 ListList
[src
->resultLen
].baseContCE
= SecondCE
;
2071 ListList
[src
->resultLen
].baseContCE
= 0;
2073 ListList
[src
->resultLen
].nextCE
= 0;
2074 ListList
[src
->resultLen
].nextContCE
= 0;
2075 ListList
[src
->resultLen
].previousCE
= 0;
2076 ListList
[src
->resultLen
].previousContCE
= 0;
2077 ListList
[src
->resultLen
].indirect
= FALSE
;
2078 sourceToken
= ucol_tok_initAReset(src
, expand
, &expandNext
, parseError
, status
);
2079 } else { /* top == TRUE */
2080 /* just use the supplied values */
2082 ListList
[src
->resultLen
].previousCE
= 0;
2083 ListList
[src
->resultLen
].previousContCE
= 0;
2084 ListList
[src
->resultLen
].indirect
= TRUE
;
2085 ListList
[src
->resultLen
].baseCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startCE
;
2086 ListList
[src
->resultLen
].baseContCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
;
2087 ListList
[src
->resultLen
].nextCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].limitCE
;
2088 ListList
[src
->resultLen
].nextContCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].limitContCE
;
2090 sourceToken
= ucol_tok_initAReset(src
, 0, &expandNext
, parseError
, status
);
2093 } else { /* reset to something already in rules */
2097 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
2098 lastToken
= sourceToken
;
2100 if(U_FAILURE(*status
)) {
2105 #ifdef DEBUG_FOR_CODE_POINTS
2110 if(src
->resultLen
> 0 && ListList
[src
->resultLen
-1].first
== NULL
) {
2113 return src
->resultLen
;
2116 const UChar
* ucol_tok_getRulesFromBundle(
2123 const UChar
* rules
= NULL
;
2124 UResourceBundle
* bundle
;
2125 UResourceBundle
* collations
;
2126 UResourceBundle
* collation
;
2130 bundle
= ures_open(U_ICUDATA_COLL
, locale
, status
);
2131 if(U_SUCCESS(*status
)){
2132 collations
= ures_getByKey(bundle
, "collations", NULL
, status
);
2133 if(U_SUCCESS(*status
)){
2134 collation
= ures_getByKey(collations
, type
, NULL
, status
);
2135 if(U_SUCCESS(*status
)){
2136 rules
= ures_getStringByKey(collation
, "Sequence", pLength
, status
);
2137 if(U_FAILURE(*status
)){
2141 ures_close(collation
);
2143 ures_close(collations
);
2152 void ucol_tok_initTokenList(
2153 UColTokenParser
*src
,
2155 uint32_t rulesLength
,
2156 const UCollator
*UCA
,
2157 GetCollationRulesFunction importFunc
,
2159 UErrorCode
*status
) {
2163 uint32_t estimatedSize
= (2*rulesLength
+UCOL_TOK_EXTRA_RULE_SPACE_SIZE
);
2165 bool needToDeallocRules
= false;
2167 if(U_FAILURE(*status
)) {
2171 // set everything to zero, so that we can clean up gracefully
2172 uprv_memset(src
, 0, sizeof(UColTokenParser
));
2174 // first we need to find options that don't like to be normalized,
2175 // like copy and remove...
2176 //const UChar *openBrace = rules;
2177 int32_t optionNumber
= -1;
2178 const UChar
*setStart
= NULL
;
2180 while(i
< rulesLength
) {
2181 if(rules
[i
] == 0x005B) { // '[': start of an option
2182 /* Gets the following:
2183 optionNumber: The index of the option.
2184 setStart: The pointer at which the option arguments start.
2186 optionNumber
= ucol_uprv_tok_readOption(rules
+i
+1, rules
+rulesLength
, &setStart
);
2188 if(optionNumber
== OPTION_OPTIMIZE
) { /* copy - parts of UCA to tailoring */
2190 USet
*newSet
= ucol_uprv_tok_readAndSetUnicodeSet(setStart
, rules
+rulesLength
, status
);
2191 if(U_SUCCESS(*status
)) {
2192 if(src
->copySet
== NULL
) {
2193 src
->copySet
= newSet
;
2195 uset_addAll(src
->copySet
, newSet
);
2201 } else if(optionNumber
== OPTION_SUPPRESS_CONTRACTIONS
) {
2202 USet
*newSet
= ucol_uprv_tok_readAndSetUnicodeSet(setStart
, rules
+rulesLength
, status
);
2203 if(U_SUCCESS(*status
)) {
2204 if(src
->removeSet
== NULL
) {
2205 src
->removeSet
= newSet
;
2207 uset_addAll(src
->removeSet
, newSet
);
2213 } else if(optionNumber
== OPTION_IMPORT
){
2214 // [import <collation-name>]
2216 // Find the address of the closing ].
2217 UChar
* import_end
= u_strchr(setStart
, 0x005D);
2218 int32_t optionEndOffset
= (int32_t)(import_end
+ 1 - rules
);
2219 // Ignore trailing whitespace.
2220 while(PatternProps::isWhiteSpace(*(import_end
-1))) {
2224 int32_t optionLength
= (int32_t)(import_end
- setStart
);
2226 if(optionLength
>= (int32_t)sizeof(option
)) {
2227 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2230 u_UCharsToChars(setStart
, option
, optionLength
);
2231 option
[optionLength
] = 0;
2233 *status
= U_ZERO_ERROR
;
2236 uloc_forLanguageTag(option
, locale
, (int32_t)sizeof(locale
), &templ
, status
);
2237 if(U_FAILURE(*status
)) {
2238 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2243 if (uloc_getKeywordValue(locale
, "collation", type
, (int32_t)sizeof(type
), status
) <= 0 ||
2246 *status
= U_ZERO_ERROR
;
2247 uprv_strcpy(type
, "standard");
2250 // TODO: Use public functions when available, see ticket #8134.
2251 char *keywords
= (char *)locale_getKeywordsStart(locale
);
2252 if(keywords
!= NULL
) {
2256 int32_t importRulesLength
= 0;
2257 const UChar
* importRules
= importFunc(context
, locale
, type
, &importRulesLength
, status
);
2259 #ifdef DEBUG_FOR_COLL_RULES
2261 UnicodeString(importRules
).toUTF8String(s
);
2262 std::cout
<< "Import rules = " << s
<< std::endl
;
2265 // Add the length of the imported rules to length of the original rules,
2266 // and subtract the length of the import option.
2267 uint32_t newRulesLength
= rulesLength
+ importRulesLength
- (optionEndOffset
- i
);
2269 UChar
* newRules
= (UChar
*)uprv_malloc(newRulesLength
*sizeof(UChar
));
2271 #ifdef DEBUG_FOR_COLL_RULES
2273 UnicodeString(rules
).toUTF8String(s1
);
2274 std::cout
<< "Original rules = " << s1
<< std::endl
;
2278 // Copy the section of the original rules leading up to the import
2279 uprv_memcpy(newRules
, rules
, i
*sizeof(UChar
));
2280 // Copy the imported rules
2281 uprv_memcpy(newRules
+i
, importRules
, importRulesLength
*sizeof(UChar
));
2282 // Copy the rest of the original rules (minus the import option itself)
2283 uprv_memcpy(newRules
+i
+importRulesLength
,
2284 rules
+optionEndOffset
,
2285 (rulesLength
-optionEndOffset
)*sizeof(UChar
));
2287 #ifdef DEBUG_FOR_COLL_RULES
2289 UnicodeString(newRules
).toUTF8String(s2
);
2290 std::cout
<< "Resulting rules = " << s2
<< std::endl
;
2293 if(needToDeallocRules
){
2294 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2295 uprv_free((void*)rules
);
2297 needToDeallocRules
= true;
2299 rulesLength
= newRulesLength
;
2301 estimatedSize
+= importRulesLength
*2;
2303 // First character of the new rules needs to be processed
2311 src
->source
= (UChar
*)uprv_malloc(estimatedSize
*sizeof(UChar
));
2313 if (src
->source
== NULL
) {
2314 *status
= U_MEMORY_ALLOCATION_ERROR
;
2317 uprv_memset(src
->source
, 0, estimatedSize
*sizeof(UChar
));
2318 nSize
= unorm_normalize(rules
, rulesLength
, UNORM_NFD
, 0, src
->source
, estimatedSize
, status
);
2319 if(nSize
> estimatedSize
|| *status
== U_BUFFER_OVERFLOW_ERROR
) {
2320 *status
= U_ZERO_ERROR
;
2321 src
->source
= (UChar
*)uprv_realloc(src
->source
, (nSize
+UCOL_TOK_EXTRA_RULE_SPACE_SIZE
)*sizeof(UChar
));
2323 if (src
->source
== NULL
) {
2324 *status
= U_MEMORY_ALLOCATION_ERROR
;
2327 nSize
= unorm_normalize(rules
, rulesLength
, UNORM_NFD
, 0, src
->source
, nSize
+UCOL_TOK_EXTRA_RULE_SPACE_SIZE
, status
);
2329 if(needToDeallocRules
){
2330 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2331 uprv_free((void*)rules
);
2335 src
->current
= src
->source
;
2336 src
->end
= src
->source
+nSize
;
2337 src
->sourceCurrent
= src
->source
;
2338 src
->extraCurrent
= src
->end
+1; // Preserve terminating zero in the rule string so that option scanning works correctly
2339 src
->extraEnd
= src
->source
+estimatedSize
; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2342 src
->invUCA
= ucol_initInverseUCA(status
);
2343 src
->parsedToken
.charsLen
= 0;
2344 src
->parsedToken
.charsOffset
= 0;
2345 src
->parsedToken
.extensionLen
= 0;
2346 src
->parsedToken
.extensionOffset
= 0;
2347 src
->parsedToken
.prefixLen
= 0;
2348 src
->parsedToken
.prefixOffset
= 0;
2349 src
->parsedToken
.flags
= 0;
2350 src
->parsedToken
.strength
= UCOL_TOK_UNSET
;
2351 src
->buildCCTabFlag
= FALSE
;
2352 src
->isStarred
= FALSE
;
2353 src
->inRange
= FALSE
;
2354 src
->lastRangeCp
= 0;
2355 src
->previousCp
= 0;
2357 if(U_FAILURE(*status
)) {
2360 src
->tailored
= uhash_open(uhash_hashTokens
, uhash_compareTokens
, NULL
, status
);
2361 if(U_FAILURE(*status
)) {
2364 uhash_setValueDeleter(src
->tailored
, uprv_free
);
2366 src
->opts
= (UColOptionSet
*)uprv_malloc(sizeof(UColOptionSet
));
2368 if (src
->opts
== NULL
) {
2369 *status
= U_MEMORY_ALLOCATION_ERROR
;
2373 uprv_memcpy(src
->opts
, UCA
->options
, sizeof(UColOptionSet
));
2376 src
->listCapacity
= 1024;
2377 src
->lh
= (UColTokListHeader
*)uprv_malloc(src
->listCapacity
*sizeof(UColTokListHeader
));
2379 if (src
->lh
== NULL
) {
2380 *status
= U_MEMORY_ALLOCATION_ERROR
;
2383 uprv_memset(src
->lh
, 0, src
->listCapacity
*sizeof(UColTokListHeader
));
2386 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
2388 // UCOL_RESET_TOP_VALUE
2389 setIndirectBoundaries(0, consts
->UCA_LAST_NON_VARIABLE
, consts
->UCA_FIRST_IMPLICIT
);
2390 // UCOL_FIRST_PRIMARY_IGNORABLE
2391 setIndirectBoundaries(1, consts
->UCA_FIRST_PRIMARY_IGNORABLE
, 0);
2392 // UCOL_LAST_PRIMARY_IGNORABLE
2393 setIndirectBoundaries(2, consts
->UCA_LAST_PRIMARY_IGNORABLE
, 0);
2394 // UCOL_FIRST_SECONDARY_IGNORABLE
2395 setIndirectBoundaries(3, consts
->UCA_FIRST_SECONDARY_IGNORABLE
, 0);
2396 // UCOL_LAST_SECONDARY_IGNORABLE
2397 setIndirectBoundaries(4, consts
->UCA_LAST_SECONDARY_IGNORABLE
, 0);
2398 // UCOL_FIRST_TERTIARY_IGNORABLE
2399 setIndirectBoundaries(5, consts
->UCA_FIRST_TERTIARY_IGNORABLE
, 0);
2400 // UCOL_LAST_TERTIARY_IGNORABLE
2401 setIndirectBoundaries(6, consts
->UCA_LAST_TERTIARY_IGNORABLE
, 0);
2402 // UCOL_FIRST_VARIABLE
2403 setIndirectBoundaries(7, consts
->UCA_FIRST_VARIABLE
, 0);
2404 // UCOL_LAST_VARIABLE
2405 setIndirectBoundaries(8, consts
->UCA_LAST_VARIABLE
, 0);
2406 // UCOL_FIRST_NON_VARIABLE
2407 setIndirectBoundaries(9, consts
->UCA_FIRST_NON_VARIABLE
, 0);
2408 // UCOL_LAST_NON_VARIABLE
2409 setIndirectBoundaries(10, consts
->UCA_LAST_NON_VARIABLE
, consts
->UCA_FIRST_IMPLICIT
);
2410 // UCOL_FIRST_IMPLICIT
2411 setIndirectBoundaries(11, consts
->UCA_FIRST_IMPLICIT
, 0);
2412 // UCOL_LAST_IMPLICIT
2413 setIndirectBoundaries(12, consts
->UCA_LAST_IMPLICIT
, consts
->UCA_FIRST_TRAILING
);
2414 // UCOL_FIRST_TRAILING
2415 setIndirectBoundaries(13, consts
->UCA_FIRST_TRAILING
, 0);
2416 // UCOL_LAST_TRAILING
2417 setIndirectBoundaries(14, consts
->UCA_LAST_TRAILING
, 0);
2418 ucolIndirectBoundaries
[14].limitCE
= (consts
->UCA_PRIMARY_SPECIAL_MIN
<<24);
2422 void ucol_tok_closeTokenList(UColTokenParser
*src
) {
2423 if(src
->copySet
!= NULL
) {
2424 uset_close(src
->copySet
);
2426 if(src
->removeSet
!= NULL
) {
2427 uset_close(src
->removeSet
);
2429 if(src
->tailored
!= NULL
) {
2430 uhash_close(src
->tailored
);
2432 if(src
->lh
!= NULL
) {
2435 if(src
->source
!= NULL
) {
2436 uprv_free(src
->source
);
2438 if(src
->opts
!= NULL
) {
2439 uprv_free(src
->opts
);
2441 if (src
->reorderCodes
!= NULL
) {
2442 uprv_free(src
->reorderCodes
);
2446 #endif /* #if !UCONFIG_NO_COLLATION */