2 *******************************************************************************
4 * Copyright (C) 2001-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucol_tok.cpp
10 * tab size: 8 (not used)
14 * created by: Vladimir Weinstein
16 * This module reads a tailoring rule string and produces a list of
17 * tokens that will be turned into collation elements
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_COLLATION
25 #include "unicode/uscript.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/uniset.h"
38 // Define this only for debugging.
39 // #define DEBUG_FOR_COLL_RULES 1
41 #ifdef DEBUG_FOR_COLL_RULES
48 static int32_t U_CALLCONV
49 uhash_hashTokens(const UHashTok k
)
52 //uint32_t key = (uint32_t)k.integer;
53 UColToken
*key
= (UColToken
*)k
.pointer
;
55 int32_t len
= (key
->source
& 0xFF000000)>>24;
56 int32_t inc
= ((len
- 32) / 32) + 1;
58 const UChar
*p
= (key
->source
& 0x00FFFFFF) + *(key
->rulesToParseHdl
);
59 const UChar
*limit
= p
+ len
;
62 hash
= (hash
* 37) + *p
;
69 static UBool U_CALLCONV
70 uhash_compareTokens(const UHashTok key1
, const UHashTok key2
)
72 //uint32_t p1 = (uint32_t) key1.integer;
73 //uint32_t p2 = (uint32_t) key2.integer;
74 UColToken
*p1
= (UColToken
*)key1
.pointer
;
75 UColToken
*p2
= (UColToken
*)key2
.pointer
;
76 const UChar
*s1
= (p1
->source
& 0x00FFFFFF) + *(p1
->rulesToParseHdl
);
77 const UChar
*s2
= (p2
->source
& 0x00FFFFFF) + *(p2
->rulesToParseHdl
);
78 uint32_t s1L
= ((p1
->source
& 0xFF000000) >> 24);
79 uint32_t s2L
= ((p2
->source
& 0xFF000000) >> 24);
80 const UChar
*end
= s1
+s1L
-1;
85 if (p1
->source
== 0 || p2
->source
== 0) {
91 if(p1
->source
== p2
->source
) {
94 while((s1
< end
) && *s1
== *s2
) {
107 * Debug messages used to pinpoint where a format error occurred.
108 * A better way is to include context-sensitive information in syntaxError() function.
110 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_FORMAT_ERROR
111 * on the compile line.
113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */
115 #ifdef DEBUG_FOR_FORMAT_ERROR
116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
118 #define DBG_FORMAT_ERROR
123 * Controls debug messages so that the output can be compared before and after a
124 * big change. Prints the information of every code point that comes out of the
125 * collation parser and its strength into a file. When a big change in format
126 * happens, the files before and after the change should be identical.
128 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_CODE_POINTS
129 * on the compile line.
131 // #define DEBUG_FOR_CODE_POINTS 1
133 #ifdef DEBUG_FOR_CODE_POINTS
134 FILE* dfcp_fp
= NULL
;
138 /*static inline void U_CALLCONV
139 uhash_freeBlockWrapper(void *obj) {
140 uhash_freeBlock(obj);
146 uint32_t startContCE
;
148 uint32_t limitContCE
;
149 } indirectBoundaries
;
151 /* these values are used for finding CE values for indirect positioning. */
152 /* Indirect positioning is a mechanism for allowing resets on symbolic */
153 /* values. It only works for resets and you cannot tailor indirect names. */
154 /* An indirect name can define either an anchor point or a range. An */
155 /* anchor point behaves in exactly the same way as a code point in reset */
156 /* would, except that it cannot be tailored. A range (we currently only */
157 /* know of the [top] range) will explicitly set the upper bound for */
158 /* generated CEs, thus allowing for better control over how many CEs can */
159 /* be squeezed into the range without a performance penalty. */
160 /* In that respect, we use [top] for tailoring of locales that use CJK */
161 /* characters. Other indirect values are currently a pure convenience, */
162 /* they can be used to assure that the CEs will be always positioned in */
163 /* the same place relative to a point with known properties (e.g. first */
164 /* primary ignorable). */
165 static indirectBoundaries ucolIndirectBoundaries
[15];
167 static indirectBoundaries ucolIndirectBoundaries[11] = {
168 { UCOL_RESET_TOP_VALUE, 0,
169 UCOL_NEXT_TOP_VALUE, 0 },
170 { UCOL_FIRST_PRIMARY_IGNORABLE, 0,
172 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
174 { UCOL_FIRST_SECONDARY_IGNORABLE, 0,
176 { UCOL_LAST_SECONDARY_IGNORABLE, 0,
178 { UCOL_FIRST_TERTIARY_IGNORABLE, 0,
180 { UCOL_LAST_TERTIARY_IGNORABLE, 0,
182 { UCOL_FIRST_VARIABLE, 0,
184 { UCOL_LAST_VARIABLE, 0,
186 { UCOL_FIRST_NON_VARIABLE, 0,
188 { UCOL_LAST_NON_VARIABLE, 0,
193 static void setIndirectBoundaries(uint32_t indexR
, uint32_t *start
, uint32_t *end
) {
195 // Set values for the top - TODO: once we have values for all the indirects, we are going
196 // to initalize here.
197 ucolIndirectBoundaries
[indexR
].startCE
= start
[0];
198 ucolIndirectBoundaries
[indexR
].startContCE
= start
[1];
200 ucolIndirectBoundaries
[indexR
].limitCE
= end
[0];
201 ucolIndirectBoundaries
[indexR
].limitContCE
= end
[1];
203 ucolIndirectBoundaries
[indexR
].limitCE
= 0;
204 ucolIndirectBoundaries
[indexR
].limitContCE
= 0;
210 void syntaxError(const UChar
* rules
,
213 UParseError
* parseError
)
215 parseError
->offset
= pos
;
216 parseError
->line
= 0 ; /* we are not using line numbers */
219 int32_t start
= (pos
< U_PARSE_CONTEXT_LEN
)? 0 : (pos
- (U_PARSE_CONTEXT_LEN
-1));
222 u_memcpy(parseError
->preContext
,rules
+start
,stop
-start
);
223 //null terminate the buffer
224 parseError
->preContext
[stop
-start
] = 0;
228 stop
= ((pos
+U_PARSE_CONTEXT_LEN
)<= rulesLen
)? (pos
+(U_PARSE_CONTEXT_LEN
-1)) :
232 u_memcpy(parseError
->postContext
,rules
+start
,stop
-start
);
233 //null terminate the buffer
234 parseError
->postContext
[stop
-start
]= 0;
236 parseError
->postContext
[0] = 0;
241 void ucol_uprv_tok_setOptionInImage(UColOptionSet
*opts
, UColAttribute attrib
, UColAttributeValue value
) {
243 case UCOL_HIRAGANA_QUATERNARY_MODE
:
244 opts
->hiraganaQ
= value
;
246 case UCOL_FRENCH_COLLATION
:
247 opts
->frenchCollation
= value
;
249 case UCOL_ALTERNATE_HANDLING
:
250 opts
->alternateHandling
= value
;
252 case UCOL_CASE_FIRST
:
253 opts
->caseFirst
= value
;
255 case UCOL_CASE_LEVEL
:
256 opts
->caseLevel
= value
;
258 case UCOL_NORMALIZATION_MODE
:
259 opts
->normalizationMode
= value
;
262 opts
->strength
= value
;
264 case UCOL_NUMERIC_COLLATION
:
265 opts
->numericCollation
= value
;
267 case UCOL_ATTRIBUTE_COUNT
:
273 #define UTOK_OPTION_COUNT 22
275 static UBool didInit
= FALSE
;
276 /* we can be strict, or we can be lenient */
277 /* I'd surely be lenient with the option arguments */
278 /* maybe even with options */
279 U_STRING_DECL(suboption_00
, "non-ignorable", 13);
280 U_STRING_DECL(suboption_01
, "shifted", 7);
282 U_STRING_DECL(suboption_02
, "lower", 5);
283 U_STRING_DECL(suboption_03
, "upper", 5);
284 U_STRING_DECL(suboption_04
, "off", 3);
285 U_STRING_DECL(suboption_05
, "on", 2);
286 U_STRING_DECL(suboption_06
, "1", 1);
287 U_STRING_DECL(suboption_07
, "2", 1);
288 U_STRING_DECL(suboption_08
, "3", 1);
289 U_STRING_DECL(suboption_09
, "4", 1);
290 U_STRING_DECL(suboption_10
, "I", 1);
292 U_STRING_DECL(suboption_11
, "primary", 7);
293 U_STRING_DECL(suboption_12
, "secondary", 9);
294 U_STRING_DECL(suboption_13
, "tertiary", 8);
295 U_STRING_DECL(suboption_14
, "variable", 8);
296 U_STRING_DECL(suboption_15
, "regular", 7);
297 U_STRING_DECL(suboption_16
, "implicit", 8);
298 U_STRING_DECL(suboption_17
, "trailing", 8);
301 U_STRING_DECL(option_00
, "undefined", 9);
302 U_STRING_DECL(option_01
, "rearrange", 9);
303 U_STRING_DECL(option_02
, "alternate", 9);
304 U_STRING_DECL(option_03
, "backwards", 9);
305 U_STRING_DECL(option_04
, "variable top", 12);
306 U_STRING_DECL(option_05
, "top", 3);
307 U_STRING_DECL(option_06
, "normalization", 13);
308 U_STRING_DECL(option_07
, "caseLevel", 9);
309 U_STRING_DECL(option_08
, "caseFirst", 9);
310 U_STRING_DECL(option_09
, "scriptOrder", 11);
311 U_STRING_DECL(option_10
, "charsetname", 11);
312 U_STRING_DECL(option_11
, "charset", 7);
313 U_STRING_DECL(option_12
, "before", 6);
314 U_STRING_DECL(option_13
, "hiraganaQ", 9);
315 U_STRING_DECL(option_14
, "strength", 8);
316 U_STRING_DECL(option_15
, "first", 5);
317 U_STRING_DECL(option_16
, "last", 4);
318 U_STRING_DECL(option_17
, "optimize", 8);
319 U_STRING_DECL(option_18
, "suppressContractions", 20);
320 U_STRING_DECL(option_19
, "numericOrdering", 15);
321 U_STRING_DECL(option_20
, "import", 6);
322 U_STRING_DECL(option_21
, "reorder", 7);
325 [last variable] last variable value
326 [last primary ignorable] largest CE for primary ignorable
327 [last secondary ignorable] largest CE for secondary ignorable
328 [last tertiary ignorable] largest CE for tertiary ignorable
329 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
333 static const ucolTokSuboption alternateSub
[2] = {
334 {suboption_00
, 13, UCOL_NON_IGNORABLE
},
335 {suboption_01
, 7, UCOL_SHIFTED
}
338 static const ucolTokSuboption caseFirstSub
[3] = {
339 {suboption_02
, 5, UCOL_LOWER_FIRST
},
340 {suboption_03
, 5, UCOL_UPPER_FIRST
},
341 {suboption_04
, 3, UCOL_OFF
},
344 static const ucolTokSuboption onOffSub
[2] = {
345 {suboption_04
, 3, UCOL_OFF
},
346 {suboption_05
, 2, UCOL_ON
}
349 static const ucolTokSuboption frenchSub
[1] = {
350 {suboption_07
, 1, UCOL_ON
}
353 static const ucolTokSuboption beforeSub
[3] = {
354 {suboption_06
, 1, UCOL_PRIMARY
},
355 {suboption_07
, 1, UCOL_SECONDARY
},
356 {suboption_08
, 1, UCOL_TERTIARY
}
359 static const ucolTokSuboption strengthSub
[5] = {
360 {suboption_06
, 1, UCOL_PRIMARY
},
361 {suboption_07
, 1, UCOL_SECONDARY
},
362 {suboption_08
, 1, UCOL_TERTIARY
},
363 {suboption_09
, 1, UCOL_QUATERNARY
},
364 {suboption_10
, 1, UCOL_IDENTICAL
},
367 static const ucolTokSuboption firstLastSub
[7] = {
368 {suboption_11
, 7, UCOL_PRIMARY
},
369 {suboption_12
, 9, UCOL_PRIMARY
},
370 {suboption_13
, 8, UCOL_PRIMARY
},
371 {suboption_14
, 8, UCOL_PRIMARY
},
372 {suboption_15
, 7, UCOL_PRIMARY
},
373 {suboption_16
, 8, UCOL_PRIMARY
},
374 {suboption_17
, 8, UCOL_PRIMARY
},
378 OPTION_ALTERNATE_HANDLING
= 0,
379 OPTION_FRENCH_COLLATION
,
382 OPTION_NORMALIZATION_MODE
,
383 OPTION_HIRAGANA_QUATERNARY
,
385 OPTION_NUMERIC_COLLATION
,
386 OPTION_NORMAL_OPTIONS_LIMIT
= OPTION_NUMERIC_COLLATION
,
394 OPTION_SUPPRESS_CONTRACTIONS
,
403 static const ucolTokOption rulesOptions
[UTOK_OPTION_COUNT
] = {
404 /*00*/ {option_02
, 9, alternateSub
, 2, UCOL_ALTERNATE_HANDLING
}, /*"alternate" */
405 /*01*/ {option_03
, 9, frenchSub
, 1, UCOL_FRENCH_COLLATION
}, /*"backwards" */
406 /*02*/ {option_07
, 9, onOffSub
, 2, UCOL_CASE_LEVEL
}, /*"caseLevel" */
407 /*03*/ {option_08
, 9, caseFirstSub
, 3, UCOL_CASE_FIRST
}, /*"caseFirst" */
408 /*04*/ {option_06
, 13, onOffSub
, 2, UCOL_NORMALIZATION_MODE
}, /*"normalization" */
409 /*05*/ {option_13
, 9, onOffSub
, 2, UCOL_HIRAGANA_QUATERNARY_MODE
}, /*"hiraganaQ" */
410 /*06*/ {option_14
, 8, strengthSub
, 5, UCOL_STRENGTH
}, /*"strength" */
411 /*07*/ {option_19
, 15, onOffSub
, 2, UCOL_NUMERIC_COLLATION
}, /*"numericOrdering"*/
412 /*08*/ {option_04
, 12, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"variable top" */
413 /*09*/ {option_01
, 9, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"rearrange" */
414 /*10*/ {option_12
, 6, beforeSub
, 3, UCOL_ATTRIBUTE_COUNT
}, /*"before" */
415 /*11*/ {option_05
, 3, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"top" */
416 /*12*/ {option_15
, 5, firstLastSub
, 7, UCOL_ATTRIBUTE_COUNT
}, /*"first" */
417 /*13*/ {option_16
, 4, firstLastSub
, 7, UCOL_ATTRIBUTE_COUNT
}, /*"last" */
418 /*14*/ {option_17
, 8, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"optimize" */
419 /*15*/ {option_18
, 20, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"suppressContractions" */
420 /*16*/ {option_00
, 9, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"undefined" */
421 /*17*/ {option_09
, 11, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"scriptOrder" */
422 /*18*/ {option_10
, 11, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"charsetname" */
423 /*19*/ {option_11
, 7, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"charset" */
424 /*20*/ {option_20
, 6, NULL
, 0, UCOL_ATTRIBUTE_COUNT
}, /*"import" */
425 /*21*/ {option_21
, 7, NULL
, 0, UCOL_ATTRIBUTE_COUNT
} /*"reorder" */
429 int32_t u_strncmpNoCase(const UChar
*s1
,
436 rc
= (int32_t)u_tolower(*s1
) - (int32_t)u_tolower(*s2
);
437 if(rc
!= 0 || *s1
== 0 || --n
== 0) {
448 void ucol_uprv_tok_initData() {
450 U_STRING_INIT(suboption_00
, "non-ignorable", 13);
451 U_STRING_INIT(suboption_01
, "shifted", 7);
453 U_STRING_INIT(suboption_02
, "lower", 5);
454 U_STRING_INIT(suboption_03
, "upper", 5);
455 U_STRING_INIT(suboption_04
, "off", 3);
456 U_STRING_INIT(suboption_05
, "on", 2);
458 U_STRING_INIT(suboption_06
, "1", 1);
459 U_STRING_INIT(suboption_07
, "2", 1);
460 U_STRING_INIT(suboption_08
, "3", 1);
461 U_STRING_INIT(suboption_09
, "4", 1);
462 U_STRING_INIT(suboption_10
, "I", 1);
464 U_STRING_INIT(suboption_11
, "primary", 7);
465 U_STRING_INIT(suboption_12
, "secondary", 9);
466 U_STRING_INIT(suboption_13
, "tertiary", 8);
467 U_STRING_INIT(suboption_14
, "variable", 8);
468 U_STRING_INIT(suboption_15
, "regular", 7);
469 U_STRING_INIT(suboption_16
, "implicit", 8);
470 U_STRING_INIT(suboption_17
, "trailing", 8);
473 U_STRING_INIT(option_00
, "undefined", 9);
474 U_STRING_INIT(option_01
, "rearrange", 9);
475 U_STRING_INIT(option_02
, "alternate", 9);
476 U_STRING_INIT(option_03
, "backwards", 9);
477 U_STRING_INIT(option_04
, "variable top", 12);
478 U_STRING_INIT(option_05
, "top", 3);
479 U_STRING_INIT(option_06
, "normalization", 13);
480 U_STRING_INIT(option_07
, "caseLevel", 9);
481 U_STRING_INIT(option_08
, "caseFirst", 9);
482 U_STRING_INIT(option_09
, "scriptOrder", 11);
483 U_STRING_INIT(option_10
, "charsetname", 11);
484 U_STRING_INIT(option_11
, "charset", 7);
485 U_STRING_INIT(option_12
, "before", 6);
486 U_STRING_INIT(option_13
, "hiraganaQ", 9);
487 U_STRING_INIT(option_14
, "strength", 8);
488 U_STRING_INIT(option_15
, "first", 5);
489 U_STRING_INIT(option_16
, "last", 4);
490 U_STRING_INIT(option_17
, "optimize", 8);
491 U_STRING_INIT(option_18
, "suppressContractions", 20);
492 U_STRING_INIT(option_19
, "numericOrdering", 15);
493 U_STRING_INIT(option_20
, "import ", 6);
494 U_STRING_INIT(option_21
, "reorder", 7);
500 // This function reads basic options to set in the runtime collator
501 // used by data driven tests. Should not support build time options
502 U_CAPI
const UChar
* U_EXPORT2
503 ucol_tok_getNextArgument(const UChar
*start
, const UChar
*end
,
504 UColAttribute
*attrib
, UColAttributeValue
*value
,
509 UBool foundOption
= FALSE
;
510 const UChar
*optionArg
= NULL
;
512 ucol_uprv_tok_initData();
514 while(start
< end
&& (u_isWhitespace(*start
) || uprv_isRuleWhiteSpace(*start
))) { /* eat whitespace */
520 /* skip opening '[' */
521 if(*start
== 0x005b) {
524 *status
= U_ILLEGAL_ARGUMENT_ERROR
; // no opening '['
528 while(i
< UTOK_OPTION_COUNT
) {
529 if(u_strncmpNoCase(start
, rulesOptions
[i
].optionName
, rulesOptions
[i
].optionLen
) == 0) {
531 if(end
- start
> rulesOptions
[i
].optionLen
) {
532 optionArg
= start
+rulesOptions
[i
].optionLen
+1; /* start of the options, skip space */
533 while(u_isWhitespace(*optionArg
) || uprv_isRuleWhiteSpace(*optionArg
)) { /* eat whitespace */
543 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
548 for(j
= 0; j
<rulesOptions
[i
].subSize
; j
++) {
549 if(u_strncmpNoCase(optionArg
, rulesOptions
[i
].subopts
[j
].subName
, rulesOptions
[i
].subopts
[j
].subLen
) == 0) {
550 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
551 *attrib
= rulesOptions
[i
].attr
;
552 *value
= rulesOptions
[i
].subopts
[j
].attrVal
;
553 optionArg
+= rulesOptions
[i
].subopts
[j
].subLen
;
554 while(u_isWhitespace(*optionArg
) || uprv_isRuleWhiteSpace(*optionArg
)) { /* eat whitespace */
557 if(*optionArg
== 0x005d) {
561 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
567 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
572 USet
*ucol_uprv_tok_readAndSetUnicodeSet(const UChar
*start
, const UChar
*end
, UErrorCode
*status
) {
573 while(*start
!= 0x005b) { /* advance while we find the first '[' */
576 // now we need to get a balanced set of '[]'. The problem is that a set can have
577 // many, and *end point to the first closing '['
578 int32_t noOpenBraces
= 1;
579 int32_t current
= 1; // skip the opening brace
580 while(start
+current
< end
&& noOpenBraces
!= 0) {
581 if(start
[current
] == 0x005b) {
583 } else if(start
[current
] == 0x005D) { // closing brace
589 if(noOpenBraces
!= 0 || u_strchr(start
+current
, 0x005d /*']'*/) == NULL
) {
590 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
593 return uset_openPattern(start
, current
, status
);
597 * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
598 * @param start Pointer to the start UChar.
599 * @param end Pointer to the last valid pointer beyond which the option will not extend.
600 * @param optionArg Address of the pointer at which the options start (after the option name)
601 * @return The index of the option, or -1 if the option is not valid.
604 int32_t ucol_uprv_tok_readOption(const UChar
*start
, const UChar
*end
, const UChar
**optionArg
) {
606 ucol_uprv_tok_initData();
608 while(u_isWhitespace(*start
) || uprv_isRuleWhiteSpace(*start
)) { /* eat whitespace */
611 while(i
< UTOK_OPTION_COUNT
) {
612 if(u_strncmpNoCase(start
, rulesOptions
[i
].optionName
, rulesOptions
[i
].optionLen
) == 0) {
613 if(end
- start
> rulesOptions
[i
].optionLen
) {
614 *optionArg
= start
+rulesOptions
[i
].optionLen
; /* End of option name; start of the options */
615 while(u_isWhitespace(**optionArg
) || uprv_isRuleWhiteSpace(**optionArg
)) { /* eat whitespace */
623 if(i
== UTOK_OPTION_COUNT
) {
624 i
= -1; // didn't find an option
631 void ucol_tok_parseScriptReorder(UColTokenParser
*src
, UErrorCode
*status
) {
632 int32_t codeCount
= 0;
633 int32_t codeIndex
= 0;
635 int32_t tokenLength
= 0;
638 const UChar
* current
= src
->current
;
639 const UChar
* end
= u_memchr(src
->current
, 0x005d, src
->end
- src
->current
);
641 // eat leading whitespace
642 while(current
< end
&& u_isWhitespace(*current
)) {
646 while(current
< end
) {
647 space
= u_memchr(current
, 0x0020, end
- current
);
648 space
= space
== 0 ? end
: space
;
649 tokenLength
= space
- current
;
650 if (tokenLength
< 4) {
651 *status
= U_INVALID_FORMAT_ERROR
;
655 current
+= tokenLength
;
656 while(current
< end
&& u_isWhitespace(*current
)) { /* eat whitespace */
661 if (codeCount
== 0) {
662 *status
= U_INVALID_FORMAT_ERROR
;
665 src
->reorderCodesLength
= codeCount
;
666 src
->reorderCodes
= (int32_t*)uprv_malloc(codeCount
* sizeof(int32_t));
667 current
= src
->current
;
669 // eat leading whitespace
670 while(current
< end
&& u_isWhitespace(*current
)) {
674 while(current
< end
) {
675 space
= u_memchr(current
, 0x0020, end
- current
);
676 space
= space
== 0 ? end
: space
;
677 tokenLength
= space
- current
;
678 if (tokenLength
< 4) {
679 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
682 u_UCharsToChars(current
, conversion
, tokenLength
);
683 conversion
[tokenLength
] = '\0';
684 src
->reorderCodes
[codeIndex
] = ucol_findReorderingEntry(conversion
);
685 if (src
->reorderCodes
[codeIndex
] == USCRIPT_INVALID_CODE
) {
686 src
->reorderCodes
[codeIndex
] = u_getPropertyValueEnum(UCHAR_SCRIPT
, conversion
);
688 if (src
->reorderCodes
[codeIndex
] == USCRIPT_INVALID_CODE
) {
689 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
693 current
+= tokenLength
;
694 while(current
< end
&& u_isWhitespace(*current
)) { /* eat whitespace */
700 // reads and conforms to various options in rules
701 // end is the position of the first closing ']'
702 // However, some of the options take a UnicodeSet definition
703 // which needs to duplicate the closing ']'
704 // for example: '[copy [\uAC00-\uD7FF]]'
705 // These options will move end to the second ']' and the
706 // caller will set the current to it.
708 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser
*src
, UErrorCode
*status
) {
709 const UChar
* start
= src
->current
;
712 const UChar
*optionArg
= NULL
;
716 start
++; /*skip opening '['*/
717 i
= ucol_uprv_tok_readOption(start
, src
->end
, &optionArg
);
719 src
->current
= optionArg
;
723 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
725 int32_t noOpenBraces
= 1;
727 case OPTION_ALTERNATE_HANDLING
:
728 case OPTION_FRENCH_COLLATION
:
729 case OPTION_CASE_LEVEL
:
730 case OPTION_CASE_FIRST
:
731 case OPTION_NORMALIZATION_MODE
:
732 case OPTION_HIRAGANA_QUATERNARY
:
733 case OPTION_STRENGTH
:
734 case OPTION_NUMERIC_COLLATION
:
736 for(j
= 0; j
<rulesOptions
[i
].subSize
; j
++) {
737 if(u_strncmpNoCase(optionArg
, rulesOptions
[i
].subopts
[j
].subName
, rulesOptions
[i
].subopts
[j
].subLen
) == 0) {
738 ucol_uprv_tok_setOptionInImage(src
->opts
, rulesOptions
[i
].attr
, rulesOptions
[i
].subopts
[j
].attrVal
);
739 result
= UCOL_TOK_SUCCESS
;
744 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
747 case OPTION_VARIABLE_TOP
:
748 result
= UCOL_TOK_SUCCESS
| UCOL_TOK_VARIABLE_TOP
;
750 case OPTION_REARRANGE
:
751 result
= UCOL_TOK_SUCCESS
;
755 for(j
= 0; j
<rulesOptions
[i
].subSize
; j
++) {
756 if(u_strncmpNoCase(optionArg
, rulesOptions
[i
].subopts
[j
].subName
, rulesOptions
[i
].subopts
[j
].subLen
) == 0) {
757 result
= UCOL_TOK_SUCCESS
| (rulesOptions
[i
].subopts
[j
].attrVal
+ 1);
762 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
765 case OPTION_TOP
: /* we are going to have an array with structures of limit CEs */
766 /* index to this array will be src->parsedToken.indirectIndex*/
767 src
->parsedToken
.indirectIndex
= 0;
768 result
= UCOL_TOK_SUCCESS
| UCOL_TOK_TOP
;
771 case OPTION_LAST
: /* first, last */
772 for(j
= 0; j
<rulesOptions
[i
].subSize
; j
++) {
773 if(u_strncmpNoCase(optionArg
, rulesOptions
[i
].subopts
[j
].subName
, rulesOptions
[i
].subopts
[j
].subLen
) == 0) {
774 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
775 // element of indirect boundaries is reserved for top.
776 src
->parsedToken
.indirectIndex
= (uint16_t)(i
-OPTION_FIRST
+1+j
*2);
777 result
= UCOL_TOK_SUCCESS
| UCOL_TOK_TOP
;;
781 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
784 case OPTION_OPTIMIZE
:
785 case OPTION_SUPPRESS_CONTRACTIONS
: // copy and remove are handled before normalization
786 // we need to move end here
787 src
->current
++; // skip opening brace
788 while(src
->current
< src
->end
&& noOpenBraces
!= 0) {
789 if(*src
->current
== 0x005b) {
791 } else if(*src
->current
== 0x005D) { // closing brace
796 result
= UCOL_TOK_SUCCESS
;
798 case OPTION_SCRIPTREORDER
:
799 ucol_tok_parseScriptReorder(src
, status
);
802 *status
= U_UNSUPPORTED_ERROR
;
806 src
->current
= u_memchr(src
->current
, 0x005d, (int32_t)(src
->end
-src
->current
));
811 inline void ucol_tok_addToExtraCurrent(UColTokenParser
*src
, const UChar
*stuff
, int32_t len
, UErrorCode
*status
) {
812 if (stuff
== NULL
|| len
<= 0) {
815 UnicodeString
tempStuff(FALSE
, stuff
, len
);
816 if(src
->extraCurrent
+len
>= src
->extraEnd
) {
818 if (stuff
>= src
->source
&& stuff
<= src
->end
) {
819 // Copy the "stuff" contents into tempStuff's own buffer.
820 // UnicodeString is copy-on-write.
822 tempStuff
.setCharAt(0, tempStuff
[0]);
827 UChar
*newSrc
= (UChar
*)uprv_realloc(src
->source
, (src
->extraEnd
-src
->source
)*2*sizeof(UChar
));
829 src
->current
= newSrc
+ (src
->current
- src
->source
);
830 src
->extraCurrent
= newSrc
+ (src
->extraCurrent
- src
->source
);
831 src
->end
= newSrc
+ (src
->end
- src
->source
);
832 src
->extraEnd
= newSrc
+ (src
->extraEnd
-src
->source
)*2;
833 src
->sourceCurrent
= newSrc
+ (src
->sourceCurrent
-src
->source
);
834 src
->source
= newSrc
;
836 *status
= U_MEMORY_ALLOCATION_ERROR
;
841 *src
->extraCurrent
++ = tempStuff
[0];
843 u_memcpy(src
->extraCurrent
, tempStuff
.getBuffer(), len
);
844 src
->extraCurrent
+= len
;
848 inline UBool
ucol_tok_doSetTop(UColTokenParser
*src
, UErrorCode
*status
) {
853 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
855 buff
[1] = (UChar
)(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startCE
>> 16);
856 buff
[2] = (UChar
)(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startCE
& 0xFFFF);
857 if(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
== 0) {
858 src
->parsedToken
.charsLen
= 3;
859 ucol_tok_addToExtraCurrent(src
, buff
, 3, status
);
861 buff
[3] = (UChar
)(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
>> 16);
862 buff
[4] = (UChar
)(ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
& 0xFFFF);
863 src
->parsedToken
.charsLen
= 5;
864 ucol_tok_addToExtraCurrent(src
, buff
, 5, status
);
869 static UBool
isCharNewLine(UChar c
){
871 case 0x000A: /* LF */
872 case 0x000D: /* CR */
873 case 0x000C: /* FF */
874 case 0x0085: /* NEL */
875 case 0x2028: /* LS */
876 case 0x2029: /* PS */
884 * This function is called several times when a range is processed. Each time, the next code point
886 * The following variables must be set before calling this function:
887 * src->currentRangeCp: The current code point to process.
888 * src->lastRangeCp: The last code point in the range.
889 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
892 ucol_tok_processNextCodePointInRange(UColTokenParser
*src
,
895 // Append current code point to source
896 UChar buff
[U16_MAX_LENGTH
];
899 uint32_t nChars
= U16_LENGTH(src
->currentRangeCp
);
900 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
901 src
->parsedToken
.charsLen
= nChars
;
903 U16_APPEND_UNSAFE(buff
, i
, src
->currentRangeCp
);
904 ucol_tok_addToExtraCurrent(src
, buff
, nChars
, status
);
906 ++src
->currentRangeCp
;
907 if (src
->currentRangeCp
> src
->lastRangeCp
) {
908 src
->inRange
= FALSE
;
910 if (src
->currentStarredCharIndex
> src
->lastStarredCharIndex
) {
911 src
->isStarred
= FALSE
;
914 src
->previousCp
= src
->currentRangeCp
;
920 * This function is called several times when a starred list is processed. Each time, the next code point
921 * in the list is processed.
922 * The following variables must be set before calling this function:
923 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point.
924 * src->lastStarredCharIndex: Index to the last character in the list.
925 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
928 ucol_tok_processNextTokenInStarredList(UColTokenParser
*src
)
930 // Extract the characters corresponding to the next code point.
932 src
->parsedToken
.charsOffset
= src
->currentStarredCharIndex
;
933 int32_t prev
= src
->currentStarredCharIndex
;
934 U16_NEXT(src
->source
, src
->currentStarredCharIndex
, (uint32_t)(src
->end
- src
->source
), cp
);
935 src
->parsedToken
.charsLen
= src
->currentStarredCharIndex
- prev
;
937 // When we are done parsing the starred string, turn the flag off so that
938 // the normal processing is restored.
939 if (src
->currentStarredCharIndex
> src
->lastStarredCharIndex
) {
940 src
->isStarred
= FALSE
;
942 src
->previousCp
= cp
;
947 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
949 * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
950 * # : Comment character
953 * < : Primary collation
954 * << : Secondary collation
955 * <<< : Tertiary collation
956 * ; : Secondary collation
957 * , : Tertiary collation
962 * ! : Java Thai modifier, ignored
968 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz
969 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges can also be given, so &a<*b-ex-z is equivalent to the above.
970 * This function does not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as four tokens - "&a",
971 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range, and the previous
972 * character are cached so that the calling program can do further splitting.
975 ucol_tok_parseNextTokenInternal(UColTokenParser
*src
,
977 UParseError
*parseError
,
980 UBool variableTop
= FALSE
;
982 UBool inChars
= TRUE
;
983 UBool inQuote
= FALSE
;
984 UBool wasInQuote
= FALSE
;
986 UBool isEscaped
= FALSE
;
988 // TODO: replace these variables with src->parsedToken counterparts
989 // no need to use them anymore since we have src->parsedToken.
990 // Ideally, token parser would be a nice class... Once, when I have
991 // more time (around 2020 probably).
992 uint32_t newExtensionLen
= 0;
993 uint32_t extensionOffset
= 0;
994 uint32_t newStrength
= UCOL_TOK_UNSET
;
997 src
->parsedToken
.charsOffset
= 0; src
->parsedToken
.charsLen
= 0;
998 src
->parsedToken
.prefixOffset
= 0; src
->parsedToken
.prefixLen
= 0;
999 src
->parsedToken
.indirectIndex
= 0;
1001 while (src
->current
< src
->end
) {
1002 UChar ch
= *(src
->current
);
1005 if (ch
== 0x0027/*'\''*/) {
1008 if ((src
->parsedToken
.charsLen
== 0) || inChars
) {
1009 if(src
->parsedToken
.charsLen
== 0) {
1010 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1012 src
->parsedToken
.charsLen
++;
1014 if(newExtensionLen
== 0) {
1015 extensionOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1020 }else if(isEscaped
){
1022 if (newStrength
== UCOL_TOK_UNSET
) {
1023 *status
= U_INVALID_FORMAT_ERROR
;
1024 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1027 // enabling rules to start with non-tokens a < b
1028 // newStrength = UCOL_TOK_RESET;
1030 if(ch
!= 0x0000 && src
->current
!= src
->end
) {
1032 if(src
->parsedToken
.charsLen
== 0) {
1033 src
->parsedToken
.charsOffset
= (uint32_t)(src
->current
- src
->source
);
1035 src
->parsedToken
.charsLen
++;
1037 if(newExtensionLen
== 0) {
1038 extensionOffset
= (uint32_t)(src
->current
- src
->source
);
1044 if(!uprv_isRuleWhiteSpace(ch
)) {
1045 /* Sets the strength for this entry */
1047 case 0x003D/*'='*/ :
1048 if (newStrength
!= UCOL_TOK_UNSET
) {
1052 /* if we start with strength, we'll reset to top */
1053 if(startOfRules
== TRUE
) {
1054 src
->parsedToken
.indirectIndex
= 5;
1055 top
= ucol_tok_doSetTop(src
, status
);
1056 newStrength
= UCOL_TOK_RESET
;
1059 newStrength
= UCOL_IDENTICAL
;
1060 if(*(src
->current
+1) == 0x002A) {/*'*'*/
1062 src
->isStarred
= TRUE
;
1067 if (newStrength
!= UCOL_TOK_UNSET
) {
1071 /* if we start with strength, we'll reset to top */
1072 if(startOfRules
== TRUE
) {
1073 src
->parsedToken
.indirectIndex
= 5;
1074 top
= ucol_tok_doSetTop(src
, status
);
1075 newStrength
= UCOL_TOK_RESET
;
1078 newStrength
= UCOL_TERTIARY
;
1082 if (newStrength
!= UCOL_TOK_UNSET
) {
1086 /* if we start with strength, we'll reset to top */
1087 if(startOfRules
== TRUE
) {
1088 src
->parsedToken
.indirectIndex
= 5;
1089 top
= ucol_tok_doSetTop(src
, status
);
1090 newStrength
= UCOL_TOK_RESET
;
1093 newStrength
= UCOL_SECONDARY
;
1097 if (newStrength
!= UCOL_TOK_UNSET
) {
1101 /* if we start with strength, we'll reset to top */
1102 if(startOfRules
== TRUE
) {
1103 src
->parsedToken
.indirectIndex
= 5;
1104 top
= ucol_tok_doSetTop(src
, status
);
1105 newStrength
= UCOL_TOK_RESET
;
1108 /* before this, do a scan to verify whether this is */
1109 /* another strength */
1110 if(*(src
->current
+1) == 0x003C) {
1112 if(*(src
->current
+1) == 0x003C) {
1113 src
->current
++; /* three in a row! */
1114 newStrength
= UCOL_TERTIARY
;
1115 } else { /* two in a row */
1116 newStrength
= UCOL_SECONDARY
;
1118 } else { /* just one */
1119 newStrength
= UCOL_PRIMARY
;
1121 if(*(src
->current
+1) == 0x002A) {/*'*'*/
1123 src
->isStarred
= TRUE
;
1128 if (newStrength
!= UCOL_TOK_UNSET
) {
1133 newStrength
= UCOL_TOK_RESET
; /* PatternEntry::RESET = 0 */
1137 /* options - read an option, analyze it */
1138 if(u_strchr(src
->current
, 0x005d /*']'*/) != NULL
) {
1139 uint8_t result
= ucol_uprv_tok_readAndSetOption(src
, status
);
1140 if(U_SUCCESS(*status
)) {
1141 if(result
& UCOL_TOK_TOP
) {
1142 if(newStrength
== UCOL_TOK_RESET
) {
1143 top
= ucol_tok_doSetTop(src
, status
);
1144 if(before
) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
1145 src
->parsedToken
.charsLen
+=2;
1148 ucol_tok_addToExtraCurrent(src
, buff
, 2, status
);
1154 *status
= U_INVALID_FORMAT_ERROR
;
1155 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1158 } else if(result
& UCOL_TOK_VARIABLE_TOP
) {
1159 if(newStrength
!= UCOL_TOK_RESET
&& newStrength
!= UCOL_TOK_UNSET
) {
1161 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1162 src
->parsedToken
.charsLen
= 1;
1164 ucol_tok_addToExtraCurrent(src
, buff
, 1, status
);
1168 *status
= U_INVALID_FORMAT_ERROR
;
1169 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1172 } else if (result
& UCOL_TOK_BEFORE
){
1173 if(newStrength
== UCOL_TOK_RESET
) {
1174 before
= result
& UCOL_TOK_BEFORE
;
1176 *status
= U_INVALID_FORMAT_ERROR
;
1177 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1182 *status
= U_INVALID_FORMAT_ERROR
;
1183 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1189 case 0x0021/*! skip java thai modifier reordering*/:
1192 wasInQuote
= FALSE
; /* if we were copying source characters, we want to stop now */
1193 inChars
= FALSE
; /* we're now processing expansion */
1195 case 0x005C /* back slash for escaped chars */:
1198 /* found a quote, we're gonna start copying */
1199 case 0x0027/*'\''*/:
1200 if (newStrength
== UCOL_TOK_UNSET
) { /* quote is illegal until we have a strength */
1201 *status
= U_INVALID_FORMAT_ERROR
;
1202 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1205 // enabling rules to start with a non-token character a < b
1206 // newStrength = UCOL_TOK_RESET;
1211 if(inChars
) { /* we're doing characters */
1212 if(wasInQuote
== FALSE
) {
1213 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1215 if (src
->parsedToken
.charsLen
!= 0) {
1216 ucol_tok_addToExtraCurrent(src
, src
->current
- src
->parsedToken
.charsLen
, src
->parsedToken
.charsLen
, status
);
1218 src
->parsedToken
.charsLen
++;
1219 } else { /* we're doing an expansion */
1220 if(wasInQuote
== FALSE
) {
1221 extensionOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1223 if (newExtensionLen
!= 0) {
1224 ucol_tok_addToExtraCurrent(src
, src
->current
- newExtensionLen
, newExtensionLen
, status
);
1231 ch
= *(++(src
->current
));
1232 if(ch
== 0x0027) { /* copy the double quote */
1233 ucol_tok_addToExtraCurrent(src
, &ch
, 1, status
);
1238 /* '@' is french only if the strength is not currently set */
1239 /* if it is, it's just a regular character in collation rules */
1241 if (newStrength
== UCOL_TOK_UNSET
) {
1242 src
->opts
->frenchCollation
= UCOL_ON
;
1246 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1247 // we want to store read characters to the prefix part and continue reading
1248 // the characters (proper way would be to restart reading the chars, but in
1249 // that case we would have to complicate the token hasher, which I do not
1250 // intend to play with. Instead, we will do prefixes when prefixes are due
1251 // (before adding the elements).
1252 src
->parsedToken
.prefixOffset
= src
->parsedToken
.charsOffset
;
1253 src
->parsedToken
.prefixLen
= src
->parsedToken
.charsLen
;
1255 if(inChars
) { /* we're doing characters */
1256 if(wasInQuote
== FALSE
) {
1257 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1259 if (src
->parsedToken
.charsLen
!= 0) {
1260 ucol_tok_addToExtraCurrent(src
, src
->current
- src
->parsedToken
.charsLen
, src
->parsedToken
.charsLen
, status
);
1262 src
->parsedToken
.charsLen
++;
1268 ch
= *(++(src
->current
));
1269 // skip whitespace between '|' and the character
1270 } while (uprv_isRuleWhiteSpace(ch
));
1275 //break; // We want to store the whole prefix/character sequence. If we break
1276 // the '|' is going to get lost.
1278 case 0x002D /*-*/: /* A range. */
1279 if (newStrength
!= UCOL_TOK_UNSET
) {
1280 // While processing the pending token, the isStarred field
1281 // is reset, so it needs to be saved for the next
1283 src
->savedIsStarred
= src
->isStarred
;
1286 src
->isStarred
= src
->savedIsStarred
;
1288 // Ranges are valid only in starred tokens.
1289 if (!src
->isStarred
) {
1290 *status
= U_INVALID_FORMAT_ERROR
;
1291 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1295 newStrength
= src
->parsedToken
.strength
;
1296 src
->inRange
= TRUE
;
1299 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1301 ch
= *(++(src
->current
));
1302 } while (!isCharNewLine(ch
));
1306 if (newStrength
== UCOL_TOK_UNSET
) {
1307 *status
= U_INVALID_FORMAT_ERROR
;
1308 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1313 if (ucol_tok_isSpecialChar(ch
) && (inQuote
== FALSE
)) {
1314 *status
= U_INVALID_FORMAT_ERROR
;
1315 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1320 if(ch
== 0x0000 && src
->current
+1 == src
->end
) {
1325 if(src
->parsedToken
.charsLen
== 0) {
1326 src
->parsedToken
.charsOffset
= (uint32_t)(src
->current
- src
->source
);
1328 src
->parsedToken
.charsLen
++;
1330 if(newExtensionLen
== 0) {
1331 extensionOffset
= (uint32_t)(src
->current
- src
->source
);
1343 if(inQuote
|| !uprv_isRuleWhiteSpace(ch
)) {
1344 ucol_tok_addToExtraCurrent(src
, &ch
, 1, status
);
1354 if (newStrength
== UCOL_TOK_UNSET
) {
1358 if (src
->parsedToken
.charsLen
== 0 && top
== FALSE
) {
1359 syntaxError(src
->source
,(int32_t)(src
->current
-src
->source
),(int32_t)(src
->end
-src
->source
),parseError
);
1360 *status
= U_INVALID_FORMAT_ERROR
;
1365 src
->parsedToken
.strength
= newStrength
;
1366 src
->parsedToken
.extensionOffset
= extensionOffset
;
1367 src
->parsedToken
.extensionLen
= newExtensionLen
;
1368 src
->parsedToken
.flags
= (UCOL_TOK_VARIABLE_TOP
* (variableTop
?1:0)) | (UCOL_TOK_TOP
* (top
?1:0)) | before
;
1370 return src
->current
;
1374 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
1375 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
1377 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
1378 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function splits
1379 * it into individual tokens and returns them one by one. To do that, the necessary state is
1380 * cached in member variables of the token parser.
1381 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
1382 * starting character as a single list token (which is separated into individual characters here)
1383 * and as another list token starting with the last character in the range. Before expanding it
1384 * as a list of tokens, this function expands the range by filling the intermediate characters and
1385 * returns them one by one as separate tokens.
1386 * Necessary checks are done for invalid combinations.
// ucol_tok_parseNextToken: public entry point that hands back one token per call.
// Visible inputs: src (parser state) and parseError (filled in by syntaxError()); the
// names startOfRules and status are used below, but their declaration lines are not
// part of this listing.
// What the visible code does:
//  - while a range or a starred list is still being expanded, it continues that
//    expansion instead of parsing new input;
//  - otherwise it parses with ucol_tok_parseNextTokenInternal();
//  - for a new range it validates the end points and starts expanding the range;
//  - for a new starred token it records the span of characters to split up;
//  - finally (in a branch whose opening line is not in this listing) it stores the
//    code point found at parsedToken.charsOffset in src->previousCp.
// Format errors are reported by setting *status to U_INVALID_FORMAT_ERROR and calling
// syntaxError().
1388 U_CAPI
const UChar
* U_EXPORT2
1389 ucol_tok_parseNextToken(UColTokenParser
*src
,
1391 UParseError
*parseError
,
1394 const UChar
*nextToken
;
1397 // We are not done processing a range. Continue it.
1398 return ucol_tok_processNextCodePointInRange(src
, status
);
1399 } else if (src
->isStarred
) {
1400 // We are not done processing a starred token. Continue it.
1401 return ucol_tok_processNextTokenInStarredList(src
);
1404 // Get the next token.
1405 nextToken
= ucol_tok_parseNextTokenInternal(src
, startOfRules
, parseError
, status
);
1407 if (nextToken
== NULL
) {
1412 // A new range has started.
1413 // Check whether it is a chain of ranges with more than one hyphen.
// A range that starts at the code point where the previous range ended is rejected.
1414 if (src
->lastRangeCp
> 0 && src
->lastRangeCp
== src
->previousCp
) {
1415 *status
= U_INVALID_FORMAT_ERROR
;
1416 syntaxError(src
->source
,src
->parsedToken
.charsOffset
-1,
1417 src
->parsedToken
.charsOffset
+src
->parsedToken
.charsLen
, parseError
);
1422 // The current token indicates the second code point of the range.
1423 // Process just that, and then proceed with the star.
1424 src
->currentStarredCharIndex
= src
->parsedToken
.charsOffset
;
// U16_NEXT reads the code point at currentStarredCharIndex into lastRangeCp and
// advances the index past it.
1425 U16_NEXT(src
->source
, src
->currentStarredCharIndex
,
1426 (uint32_t)(src
->end
- src
->source
), src
->lastRangeCp
);
// The end of a range must be strictly greater than its start.
1427 if (src
->lastRangeCp
<= src
->previousCp
) {
1428 *status
= U_INVALID_FORMAT_ERROR
;
1429 syntaxError(src
->source
,src
->parsedToken
.charsOffset
-1,
1430 src
->parsedToken
.charsOffset
+src
->parsedToken
.charsLen
,parseError
);
1435 // Set current range code point to process the range loop
// Expansion starts one past previousCp.
1436 src
->currentRangeCp
= src
->previousCp
+ 1;
// Inclusive index of the last UChar of the parsed token.
1438 src
->lastStarredCharIndex
= src
->parsedToken
.charsOffset
+ src
->parsedToken
.charsLen
- 1;
1440 return ucol_tok_processNextCodePointInRange(src
, status
);
1441 } else if (src
->isStarred
) {
1442 // The two indices src->currentStarredCharIndex and src->lastStarredCharIndex delimit
1443 // the span [currentStarredCharIndex .. lastStarredCharIndex], both inclusive, that needs
1444 // to be separated into several tokens and returned.
1445 src
->currentStarredCharIndex
= src
->parsedToken
.charsOffset
;
1446 src
->lastStarredCharIndex
= src
->parsedToken
.charsOffset
+ src
->parsedToken
.charsLen
- 1;
1448 return ucol_tok_processNextTokenInStarredList(src
);
1450 // Set previous codepoint
// U16_GET reads the code point at charsOffset without moving any index.
1451 U16_GET(src
->source
, 0, src
->parsedToken
.charsOffset
, (uint32_t)(src
->end
- src
->source
), src
->previousCp
);
1458 Processing Description
1459 1 Build a ListList. Each list has a header, which contains two lists (positive
1460 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1462 2 As you process, you keep a LAST pointer that points to the last token you
// ucol_tok_initAReset: allocates a UColToken of strength UCOL_TOK_RESET for the token
// currently described by src->parsedToken, makes it the reset of list header
// src->lh[src->resultLen], and registers it in the src->tailored hash.
//   src        - parser state; lh, listCapacity, parsedToken and tailored are read or updated
//   expand     - when non-NULL and parsedToken.charsLen > 1, the position inside src->source
//                at which the reset text is split: the token keeps the characters before it
//                and *expandNext receives the remainder, packed as (length << 24) | offset
//   expandNext - output for that packed remainder (see expand)
//   parseError - passed to syntaxError() when the reset carries a prefix
//   status     - set to U_MEMORY_ALLOCATION_ERROR when an allocation fails and to
//                U_INVALID_FORMAT_ERROR when parsedToken.prefixOffset != 0
// The return statements are not part of this listing; callers assign the result to a
// UColToken pointer, so presumably the new token is returned on success - confirm.
1467 static UColToken
*ucol_tok_initAReset(UColTokenParser
*src
, const UChar
*expand
, uint32_t *expandNext
,
1468 UParseError
*parseError
, UErrorCode
*status
)
// Grow the list-header array once it is full.
1470 if(src
->resultLen
== src
->listCapacity
) {
1471 // Unfortunately, this won't work, as we store addresses of lhs in token
// NOTE(review): the capacity is doubled and src->lh is overwritten before the result
// of uprv_realloc is checked. If uprv_realloc behaves like realloc, a failure loses the
// only pointer to the old array. Tokens also keep &src->lh[i] in listHeader (set further
// down), which would dangle if the array moves - as the comment above warns.
1472 src
->listCapacity
*= 2;
1473 src
->lh
= (UColTokListHeader
*)uprv_realloc(src
->lh
, src
->listCapacity
*sizeof(UColTokListHeader
));
1474 if(src
->lh
== NULL
) {
1475 *status
= U_MEMORY_ALLOCATION_ERROR
;
1479 /* do the reset thing */
1480 UColToken
*sourceToken
= (UColToken
*)uprv_malloc(sizeof(UColToken
));
1482 if (sourceToken
== NULL
) {
1483 *status
= U_MEMORY_ALLOCATION_ERROR
;
1486 sourceToken
->rulesToParseHdl
= &(src
->source
);
// source and expansion pack the length into the top 8 bits and the offset into
// src->source into the low 24 bits.
1487 sourceToken
->source
= src
->parsedToken
.charsLen
<< 24 | src
->parsedToken
.charsOffset
;
1488 sourceToken
->expansion
= src
->parsedToken
.extensionLen
<< 24 | src
->parsedToken
.extensionOffset
;
// First UChar of the source and of the expansion, kept in the debug fields.
1490 sourceToken
->debugSource
= *(src
->source
+ src
->parsedToken
.charsOffset
);
1491 sourceToken
->debugExpansion
= *(src
->source
+ src
->parsedToken
.extensionOffset
);
1493 // keep the flags around so that we know about before
1494 sourceToken
->flags
= src
->parsedToken
.flags
;
// A reset must not carry a prefix: report the error and release the new token.
1496 if(src
->parsedToken
.prefixOffset
!= 0) {
1497 // this is a syntax error
1498 *status
= U_INVALID_FORMAT_ERROR
;
1499 syntaxError(src
->source
,src
->parsedToken
.charsOffset
-1,src
->parsedToken
.charsOffset
+src
->parsedToken
.charsLen
,parseError
);
1501 uprv_free(sourceToken
);
1504 sourceToken
->prefix
= 0;
1507 sourceToken
->polarity
= UCOL_TOK_POLARITY_POSITIVE
; /* TODO: this should also handle reverse */
1508 sourceToken
->strength
= UCOL_TOK_RESET
;
1509 sourceToken
->next
= NULL
;
1510 sourceToken
->previous
= NULL
;
1511 sourceToken
->noOfCEs
= 0;
1512 sourceToken
->noOfExpCEs
= 0;
// The token remembers the address of its list header inside src->lh.
1513 sourceToken
->listHeader
= &src
->lh
[src
->resultLen
];
// The new list starts out empty.
1515 src
->lh
[src
->resultLen
].first
= NULL
;
1516 src
->lh
[src
->resultLen
].last
= NULL
;
// NOTE(review): first/last are cleared a second time here; the repetition has no effect.
1517 src
->lh
[src
->resultLen
].first
= NULL
;
1518 src
->lh
[src
->resultLen
].last
= NULL
;
1520 src
->lh
[src
->resultLen
].reset
= sourceToken
;
1523 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1524 First convert all expansions into normal form. Examples:
1525 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1526 d * ... into &x * c/y * d * ...
1527 Note: reset values can never have expansions, although they can cause the
1528 very next item to have one. They may be contractions, if they are found
1529 earlier in the list.
1532 if(expand
!= NULL
) {
1533 /* check to see if there is an expansion */
1534 if(src
->parsedToken
.charsLen
> 1) {
1535 uint32_t resetCharsOffset
;
1536 resetCharsOffset
= (uint32_t)(expand
- src
->source
);
1537 sourceToken
->source
= ((resetCharsOffset
- src
->parsedToken
.charsOffset
) << 24) | src
->parsedToken
.charsOffset
;
1538 *expandNext
= ((src
->parsedToken
.charsLen
+ src
->parsedToken
.charsOffset
- resetCharsOffset
)<<24) | (resetCharsOffset
);
1544 uhash_put(src
->tailored
, sourceToken
, sourceToken
, status
);
// getVirginBefore: builds the anchor (reset) token for a "[before N]" reset by fishing
// the element that precedes the target out of the UCA.
//   src         - parser state; parsedToken, extraCurrent, lh[resultLen] and tailored are
//                 read or updated
//   sourceToken - when non-NULL, its source offset (low 24 bits) names the character to
//                 look up; the other visible call uses parsedToken.charsOffset instead
//   strength    - requested strength of the "before" relation
//   parseError, status - forwarded to ucol_tok_initAReset(); the function checks *status
//                 for failure on entry and again after setting up the iterator
// Several lines of this function (declarations, else branches, returns) are not part of
// this listing; the notes below describe only what is visible.
1550 inline UColToken
*getVirginBefore(UColTokenParser
*src
, UColToken
*sourceToken
, uint8_t strength
, UParseError
*parseError
, UErrorCode
*status
) {
1551 if(U_FAILURE(*status
)) {
1554 /* this is a virgin before - we need to fish the anchor from the UCA */
1556 uint32_t baseCE
= UCOL_NOT_FOUND
, baseContCE
= UCOL_NOT_FOUND
;
1557 uint32_t CE
, SecondCE
;
// Set up a collation iterator over a single UChar of the rule string.
1559 if(sourceToken
!= NULL
) {
1560 uprv_init_collIterate(src
->UCA
, src
->source
+((sourceToken
->source
)&0xFFFFFF), 1, &s
, status
);
1562 uprv_init_collIterate(src
->UCA
, src
->source
+src
->parsedToken
.charsOffset
/**charsOffset*/, 1, &s
, status
);
1564 if(U_FAILURE(*status
)) {
// First CE of the character with the bits 0xC0 cleared (mask 0xFFFFFF3F), followed by
// the next CE from the same iterator.
1568 baseCE
= ucol_getNextCE(src
->UCA
, &s
, status
) & 0xFFFFFF3F;
1569 baseContCE
= ucol_getNextCE(src
->UCA
, &s
, status
);
1570 if(baseContCE
== UCOL_NO_MORE_CES
) {
1575 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
1577 uint32_t expandNext
= 0;
// Implicit case: the lead primary byte of baseCE lies between UCA_PRIMARY_IMPLICIT_MIN
// and UCA_PRIMARY_IMPLICIT_MAX. The predecessor is derived from the raw value minus one.
1580 if((baseCE
& 0xFF000000) >= (consts
->UCA_PRIMARY_IMPLICIT_MIN
<<24) && (baseCE
& 0xFF000000) <= (consts
->UCA_PRIMARY_IMPLICIT_MAX
<<24) ) { /* implicits - */
1581 uint32_t primary
= (baseCE
& UCOL_PRIMARYMASK
) | ((baseContCE
& UCOL_PRIMARYMASK
) >> 16);
1582 uint32_t raw
= uprv_uca_getRawFromImplicit(primary
);
1583 ch
= uprv_uca_getCodePointFromRaw(raw
-1);
1584 uint32_t primaryCE
= uprv_uca_getImplicitFromRaw(raw
-1);
1585 CE
= (primaryCE
& UCOL_PRIMARYMASK
) | 0x0505;
1586 SecondCE
= ((primaryCE
<< 16) & UCOL_PRIMARYMASK
) | UCOL_CONTINUATION_MARKER
;
// The constructed name is 0xFFFE followed by the code point, appended at extraCurrent.
// NOTE(review): these writes go straight through src->extraCurrent with no capacity
// check visible in this listing, and the cast keeps only the UChar-sized part of ch if
// ch is wider than UChar - confirm both are safe.
1588 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
);
1589 *src
->extraCurrent
++ = 0xFFFE;
1590 *src
->extraCurrent
++ = (UChar
)ch
;
1591 src
->parsedToken
.charsLen
++;
// Look the constructed name up among the tokens tailored so far.
1593 key
.source
= (src
->parsedToken
.charsLen
/**newCharsLen*/ << 24) | src
->parsedToken
.charsOffset
/**charsOffset*/;
1594 key
.rulesToParseHdl
= &(src
->source
);
1596 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1597 sourceToken
= (UColToken
*)uhash_get(src
->tailored
, &key
);
// Not tailored yet: fill in list header src->lh[resultLen] and create a new reset.
1599 if(sourceToken
== NULL
) {
1600 src
->lh
[src
->resultLen
].baseCE
= CE
& 0xFFFFFF3F;
1601 if(isContinuation(SecondCE
)) {
1602 src
->lh
[src
->resultLen
].baseContCE
= SecondCE
;
1604 src
->lh
[src
->resultLen
].baseContCE
= 0;
1606 src
->lh
[src
->resultLen
].nextCE
= 0;
1607 src
->lh
[src
->resultLen
].nextContCE
= 0;
1608 src
->lh
[src
->resultLen
].previousCE
= 0;
1609 src
->lh
[src
->resultLen
].previousContCE
= 0;
1611 src
->lh
[src
->resultLen
].indirect
= FALSE
;
1613 sourceToken
= ucol_tok_initAReset(src
, 0, &expandNext
, parseError
, status
);
// Non-implicit case, as far as this listing shows: ask the inverse UCA table for the
// CE pair that precedes (baseCE, baseContCE) at the requested strength.
1617 invPos
= ucol_inv_getPrevCE(src
, baseCE
, baseContCE
, &CE
, &SecondCE
, strength
);
1619 // we got the previous CE. Now we need to see if the difference between
1620 // the two CEs is really of the requested strength.
1621 // if it's a bigger difference (we asked for secondary and got primary), we
1622 // need to modify the CE.
1623 if(ucol_getCEStrengthDifference(baseCE
, baseContCE
, CE
, SecondCE
) < strength
) {
1624 // adjust the strength
1625 // now we are in the situation where our baseCE should actually be modified in
1626 // order to get the CE in the right position.
1627 if(strength
== UCOL_SECONDARY
) {
1628 CE
= baseCE
- 0x0200;
1629 } else { // strength == UCOL_TERTIARY
1633 if(strength
== UCOL_SECONDARY
) {
1634 SecondCE
= baseContCE
- 0x0200;
1635 } else { // strength == UCOL_TERTIARY
1636 SecondCE
= baseContCE
- 0x02;
1642 // the code below relies on getting a code point from the inverse table, in order to be
1643 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1644 // 1. There are many code points that have the same CE
1645 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1646 // Also, in case when there is no equivalent strength before an element, we have to actually
1647 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1648 // before a is a primary difference.
1650 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
// Third entry of the inverse-table row at invPos.
1653 ch
= CETable
[3*invPos
+2];
1655 if((ch
& UCOL_INV_SIZEMASK
) != 0) {
1656 uint16_t *conts
= (uint16_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->conts
);
1657 uint32_t offset
= (ch
& UCOL_INV_OFFSETMASK
);
// NOTE(review): unchecked write through extraCurrent, same concern as in the implicit case.
1661 *src
->extraCurrent
++ = (UChar
)ch
;
1662 src
->parsedToken
.charsOffset
= (uint32_t)(src
->extraCurrent
- src
->source
- 1);
1663 src
->parsedToken
.charsLen
= 1;
1665 // We got a UCA before. However, this might have been tailored.
1668 // &[before 3]\u306a<<<\u306a|\u309d
1671 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1672 key
.source
= (src
->parsedToken
.charsLen
/**newCharsLen*/ << 24) | src
->parsedToken
.charsOffset
/**charsOffset*/;
1673 key
.rulesToParseHdl
= &(src
->source
);
1675 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1676 sourceToken
= (UColToken
*)uhash_get(src
->tailored
, &key
);
1679 // here is how it should be. The situation such as &[before 1]a < x, should be
1680 // resolved exactly as if we wrote &a > x.
1681 // therefore, I don't really care if the UCA value before a has been changed.
1682 // However, I do care if the strength between my element and the previous element
1683 // is bigger than I wanted. So, if CE < baseCE and I wanted &[before 2], then I'll
1684 // have to construct the base CE.
1688 // if we found a tailored thing, we have to use the UCA value and construct
1689 // a new reset token with constructed name
1690 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1691 // character to which we want to anchor is already tailored.
1692 // We need to construct a new token which will be the anchor
1694 //*(src->extraCurrent-1) = 0xFFFE;
1695 //*src->extraCurrent++ = (UChar)ch;
// NOTE(review): offset and length are shifted by the constant 10 with no explanation in
// view; presumably this gives the constructed anchor a name that differs from any
// parsed token - confirm before touching.
1697 src
->parsedToken
.charsOffset
-= 10;
1698 src
->parsedToken
.charsLen
+= 10;
// Fill in list header src->lh[resultLen] and create the new reset token.
1699 src
->lh
[src
->resultLen
].baseCE
= CE
& 0xFFFFFF3F;
1700 if(isContinuation(SecondCE
)) {
1701 src
->lh
[src
->resultLen
].baseContCE
= SecondCE
;
1703 src
->lh
[src
->resultLen
].baseContCE
= 0;
1705 src
->lh
[src
->resultLen
].nextCE
= 0;
1706 src
->lh
[src
->resultLen
].nextContCE
= 0;
1707 src
->lh
[src
->resultLen
].previousCE
= 0;
1708 src
->lh
[src
->resultLen
].previousContCE
= 0;
1710 src
->lh
[src
->resultLen
].indirect
= FALSE
;
1712 sourceToken
= ucol_tok_initAReset(src
, 0, &expandNext
, parseError
, status
);
1720 uint32_t ucol_tok_assembleTokenList(UColTokenParser
*src
, UParseError
*parseError
, UErrorCode
*status
) {
1721 UColToken
*lastToken
= NULL
;
1722 const UChar
*parseEnd
= NULL
;
1723 uint32_t expandNext
= 0;
1724 UBool variableTop
= FALSE
;
1727 UColTokListHeader
*ListList
= NULL
;
1729 src
->parsedToken
.strength
= UCOL_TOK_UNSET
;
1733 if(U_FAILURE(*status
)) {
1736 #ifdef DEBUG_FOR_CODE_POINTS
1738 sprintf(filename
, "/tmp/debug_for_cp_%09d.txt", getpid());
1739 dfcp_fp
= fopen(filename
, "a");
1740 fprintf(stdout
, "Output is in the file %s.\n", filename
);
1743 #ifdef DEBUG_FOR_COLL_RULES
1745 UnicodeString(src
->source
).toUTF8String(s3
);
1746 std::cout
<< "src->source = " << s3
<< std::endl
;
1749 while(src
->current
< src
->end
|| src
->isStarred
) {
1750 src
->parsedToken
.prefixOffset
= 0;
1752 parseEnd
= ucol_tok_parseNextToken(src
,
1753 (UBool
)(lastToken
== NULL
),
1757 specs
= src
->parsedToken
.flags
;
1760 variableTop
= ((specs
& UCOL_TOK_VARIABLE_TOP
) != 0);
1761 top
= ((specs
& UCOL_TOK_TOP
) != 0);
1763 if(U_SUCCESS(*status
) && parseEnd
!= NULL
) {
1764 UColToken
*sourceToken
= NULL
;
1766 uint32_t lastStrength
= UCOL_TOK_UNSET
;
1768 if(lastToken
!= NULL
) {
1769 lastStrength
= lastToken
->strength
;
1772 #ifdef DEBUG_FOR_CODE_POINTS
1774 U16_GET(src
->source
, 0, src
->parsedToken
.charsOffset
, (uint32_t)(src
->extraEnd
- src
->source
), cp
);
1775 fprintf(dfcp_fp
, "Code point = %x, Strength = %x\n", cp
, src
->parsedToken
.strength
);
1777 //key = newCharsLen << 24 | charsOffset;
1779 key
.source
= src
->parsedToken
.charsLen
<< 24 | src
->parsedToken
.charsOffset
;
1780 key
.rulesToParseHdl
= &(src
->source
);
1782 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
1783 sourceToken
= (UColToken
*)uhash_get(src
->tailored
, &key
);
1785 if(src
->parsedToken
.strength
!= UCOL_TOK_RESET
) {
1786 if(lastToken
== NULL
) { /* this means that rules haven't started properly */
1787 *status
= U_INVALID_FORMAT_ERROR
;
1788 syntaxError(src
->source
,0,(int32_t)(src
->end
-src
->source
),parseError
);
1792 /* 6 Otherwise (when relation != reset) */
1793 if(sourceToken
== NULL
) {
1794 /* If sourceToken is null, create new one, */
1795 sourceToken
= (UColToken
*)uprv_malloc(sizeof(UColToken
));
1797 if (sourceToken
== NULL
) {
1798 *status
= U_MEMORY_ALLOCATION_ERROR
;
1801 sourceToken
->rulesToParseHdl
= &(src
->source
);
1802 sourceToken
->source
= src
->parsedToken
.charsLen
<< 24 | src
->parsedToken
.charsOffset
;
1804 sourceToken
->debugSource
= *(src
->source
+ src
->parsedToken
.charsOffset
);
1806 sourceToken
->prefix
= src
->parsedToken
.prefixLen
<< 24 | src
->parsedToken
.prefixOffset
;
1807 sourceToken
->debugPrefix
= *(src
->source
+ src
->parsedToken
.prefixOffset
);
1809 sourceToken
->polarity
= UCOL_TOK_POLARITY_POSITIVE
; /* TODO: this should also handle reverse */
1810 sourceToken
->next
= NULL
;
1811 sourceToken
->previous
= NULL
;
1812 sourceToken
->noOfCEs
= 0;
1813 sourceToken
->noOfExpCEs
= 0;
1814 // keep the flags around so that we know about before
1815 sourceToken
->flags
= src
->parsedToken
.flags
;
1816 uhash_put(src
->tailored
, sourceToken
, sourceToken
, status
);
1817 if(U_FAILURE(*status
)) {
1821 /* we could have fished out a reset here */
1822 if(sourceToken
->strength
!= UCOL_TOK_RESET
&& lastToken
!= sourceToken
) {
1823 /* otherwise remove sourceToken from where it was. */
1824 if(sourceToken
->next
!= NULL
) {
1825 if(sourceToken
->next
->strength
> sourceToken
->strength
) {
1826 sourceToken
->next
->strength
= sourceToken
->strength
;
1828 sourceToken
->next
->previous
= sourceToken
->previous
;
1830 sourceToken
->listHeader
->last
= sourceToken
->previous
;
1833 if(sourceToken
->previous
!= NULL
) {
1834 sourceToken
->previous
->next
= sourceToken
->next
;
1836 sourceToken
->listHeader
->first
= sourceToken
->next
;
1838 sourceToken
->next
= NULL
;
1839 sourceToken
->previous
= NULL
;
1843 sourceToken
->strength
= src
->parsedToken
.strength
;
1844 sourceToken
->listHeader
= lastToken
->listHeader
;
1847 1. Find the strongest strength in each list, and set strongestP and strongestN
1848 accordingly in the headers.
1850 if(lastStrength
== UCOL_TOK_RESET
1851 || sourceToken
->listHeader
->first
== 0) {
1852 /* If LAST is a reset
1853 insert sourceToken in the list. */
1854 if(sourceToken
->listHeader
->first
== 0) {
1855 sourceToken
->listHeader
->first
= sourceToken
;
1856 sourceToken
->listHeader
->last
= sourceToken
;
1857 } else { /* we need to find a place for us */
1858 /* and we'll get in front of the same strength */
1859 if(sourceToken
->listHeader
->first
->strength
<= sourceToken
->strength
) {
1860 sourceToken
->next
= sourceToken
->listHeader
->first
;
1861 sourceToken
->next
->previous
= sourceToken
;
1862 sourceToken
->listHeader
->first
= sourceToken
;
1863 sourceToken
->previous
= NULL
;
1865 lastToken
= sourceToken
->listHeader
->first
;
1866 while(lastToken
->next
!= NULL
&& lastToken
->next
->strength
> sourceToken
->strength
) {
1867 lastToken
= lastToken
->next
;
1869 if(lastToken
->next
!= NULL
) {
1870 lastToken
->next
->previous
= sourceToken
;
1872 sourceToken
->listHeader
->last
= sourceToken
;
1874 sourceToken
->previous
= lastToken
;
1875 sourceToken
->next
= lastToken
->next
;
1876 lastToken
->next
= sourceToken
;
1880 /* Otherwise (when LAST is not a reset)
1881 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1882 otherwise insert before.
1883 when inserting after or before, search to the next position with the same
1884 strength in that direction. (This is called postpone insertion). */
1885 if(sourceToken
!= lastToken
) {
1886 if(lastToken
->polarity
== sourceToken
->polarity
) {
1887 while(lastToken
->next
!= NULL
&& lastToken
->next
->strength
> sourceToken
->strength
) {
1888 lastToken
= lastToken
->next
;
1890 sourceToken
->previous
= lastToken
;
1891 if(lastToken
->next
!= NULL
) {
1892 lastToken
->next
->previous
= sourceToken
;
1894 sourceToken
->listHeader
->last
= sourceToken
;
1897 sourceToken
->next
= lastToken
->next
;
1898 lastToken
->next
= sourceToken
;
1900 while(lastToken
->previous
!= NULL
&& lastToken
->previous
->strength
> sourceToken
->strength
) {
1901 lastToken
= lastToken
->previous
;
1903 sourceToken
->next
= lastToken
;
1904 if(lastToken
->previous
!= NULL
) {
1905 lastToken
->previous
->next
= sourceToken
;
1907 sourceToken
->listHeader
->first
= sourceToken
;
1909 sourceToken
->previous
= lastToken
->previous
;
1910 lastToken
->previous
= sourceToken
;
1912 } else { /* repeated one thing twice in rules, stay with the stronger strength */
1913 if(lastStrength
< sourceToken
->strength
) {
1914 sourceToken
->strength
= lastStrength
;
1919 /* if the token was a variable top, we're gonna put it in */
1920 if(variableTop
== TRUE
&& src
->varTop
== NULL
) {
1921 variableTop
= FALSE
;
1922 src
->varTop
= sourceToken
;
1925 // Treat the expansions.
1926 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1927 // (&abc * d * e <=> &ab * d / c * e / c)
1928 // if both of them are in effect for a token, they are combined.
1930 sourceToken
->expansion
= src
->parsedToken
.extensionLen
<< 24 | src
->parsedToken
.extensionOffset
;
1932 if(expandNext
!= 0) {
1933 if(sourceToken
->strength
== UCOL_PRIMARY
) { /* primary strength kills off the implicit expansion */
1935 } else if(sourceToken
->expansion
== 0) { /* if there is no expansion, implicit is just added to the token */
1936 sourceToken
->expansion
= expandNext
;
1937 } else { /* there is both explicit and implicit expansion. We need to make a combination */
1938 uprv_memcpy(src
->extraCurrent
, src
->source
+ (expandNext
& 0xFFFFFF), (expandNext
>> 24)*sizeof(UChar
));
1939 uprv_memcpy(src
->extraCurrent
+(expandNext
>> 24), src
->source
+ src
->parsedToken
.extensionOffset
, src
->parsedToken
.extensionLen
*sizeof(UChar
));
1940 sourceToken
->expansion
= (uint32_t)(((expandNext
>> 24) + src
->parsedToken
.extensionLen
)<<24 | (uint32_t)(src
->extraCurrent
- src
->source
));
1941 src
->extraCurrent
+= (expandNext
>> 24) + src
->parsedToken
.extensionLen
;
1945 // This is just for debugging purposes
1946 if(sourceToken
->expansion
!= 0) {
1947 sourceToken
->debugExpansion
= *(src
->source
+ src
->parsedToken
.extensionOffset
);
1949 sourceToken
->debugExpansion
= 0;
1951 // if the previous token was a reset before, the strength of this
1952 // token must match the strength of before. Otherwise we have an
1953 // undefined situation.
1954 // In other words, we currently have a cludge which we use to
1955 // represent &a >> x. This is written as &[before 2]a << x.
1956 if((lastToken
->flags
& UCOL_TOK_BEFORE
) != 0) {
1957 uint8_t beforeStrength
= (lastToken
->flags
& UCOL_TOK_BEFORE
) - 1;
1958 if(beforeStrength
!= sourceToken
->strength
) {
1959 *status
= U_INVALID_FORMAT_ERROR
;
1960 syntaxError(src
->source
,0,(int32_t)(src
->end
-src
->source
),parseError
);
1966 if(lastToken
!= NULL
&& lastStrength
== UCOL_TOK_RESET
) {
1967 /* if the previous token was also a reset, */
1968 /*this means that we have two consecutive resets */
1969 /* and we want to remove the previous one if empty*/
1970 if(src
->resultLen
> 0 && ListList
[src
->resultLen
-1].first
== NULL
) {
1975 if(sourceToken
== NULL
) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1976 uint32_t searchCharsLen
= src
->parsedToken
.charsLen
;
1977 while(searchCharsLen
> 1 && sourceToken
== NULL
) {
1979 //key = searchCharsLen << 24 | charsOffset;
1981 key
.source
= searchCharsLen
<< 24 | src
->parsedToken
.charsOffset
;
1982 key
.rulesToParseHdl
= &(src
->source
);
1983 sourceToken
= (UColToken
*)uhash_get(src
->tailored
, &key
);
1985 if(sourceToken
!= NULL
) {
1986 expandNext
= (src
->parsedToken
.charsLen
- searchCharsLen
) << 24 | (src
->parsedToken
.charsOffset
+ searchCharsLen
);
1990 if((specs
& UCOL_TOK_BEFORE
) != 0) { /* we're doing before */
1991 if(top
== FALSE
) { /* there is no indirection */
1992 uint8_t strength
= (specs
& UCOL_TOK_BEFORE
) - 1;
1993 if(sourceToken
!= NULL
&& sourceToken
->strength
!= UCOL_TOK_RESET
) {
1994 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1995 while(sourceToken
->strength
> strength
&& sourceToken
->previous
!= NULL
) {
1996 sourceToken
= sourceToken
->previous
;
1998 /* here, either we hit the strength or NULL */
1999 if(sourceToken
->strength
== strength
) {
2000 if(sourceToken
->previous
!= NULL
) {
2001 sourceToken
= sourceToken
->previous
;
2002 } else { /* start of list */
2003 sourceToken
= sourceToken
->listHeader
->reset
;
2005 } else { /* we hit NULL */
2006 /* we should be doing the else part */
2007 sourceToken
= sourceToken
->listHeader
->reset
;
2008 sourceToken
= getVirginBefore(src
, sourceToken
, strength
, parseError
, status
);
2011 sourceToken
= getVirginBefore(src
, sourceToken
, strength
, parseError
, status
);
2013 } else { /* this is both before and indirection */
2015 ListList
[src
->resultLen
].previousCE
= 0;
2016 ListList
[src
->resultLen
].previousContCE
= 0;
2017 ListList
[src
->resultLen
].indirect
= TRUE
;
2018 /* we need to do slightly more work. we need to get the baseCE using the */
2019 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
2021 uint8_t strength
= (specs
& UCOL_TOK_BEFORE
) - 1;
2022 uint32_t baseCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startCE
;
2023 uint32_t baseContCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
;//&0xFFFFFF3F;
2024 uint32_t CE
= UCOL_NOT_FOUND
, SecondCE
= UCOL_NOT_FOUND
;
2026 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
2027 if((baseCE
& 0xFF000000) >= (consts
->UCA_PRIMARY_IMPLICIT_MIN
<<24) &&
2028 (baseCE
& 0xFF000000) <= (consts
->UCA_PRIMARY_IMPLICIT_MAX
<<24) ) { /* implicits - */
2029 uint32_t primary
= (baseCE
& UCOL_PRIMARYMASK
) | ((baseContCE
& UCOL_PRIMARYMASK
) >> 16);
2030 uint32_t raw
= uprv_uca_getRawFromImplicit(primary
);
2031 uint32_t primaryCE
= uprv_uca_getImplicitFromRaw(raw
-1);
2032 CE
= (primaryCE
& UCOL_PRIMARYMASK
) | 0x0505;
2033 SecondCE
= ((primaryCE
<< 16) & UCOL_PRIMARYMASK
) | UCOL_CONTINUATION_MARKER
;
2035 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
2036 ucol_inv_getPrevCE(src
, baseCE
, baseContCE
, &CE
, &SecondCE
, strength
);
2039 ListList
[src
->resultLen
].baseCE
= CE
;
2040 ListList
[src
->resultLen
].baseContCE
= SecondCE
;
2041 ListList
[src
->resultLen
].nextCE
= 0;
2042 ListList
[src
->resultLen
].nextContCE
= 0;
2044 sourceToken
= ucol_tok_initAReset(src
, 0, &expandNext
, parseError
, status
);
2049 /* 5 If the relation is a reset:
2050 If sourceToken is null
2051 Create new list, create new sourceToken, make the baseCE from source, put
2052 the sourceToken in ListHeader of the new list */
2053 if(sourceToken
== NULL
) {
2055 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
2056 First convert all expansions into normal form. Examples:
2057 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
2058 d * ... into &x * c/y * d * ...
2059 Note: reset values can never have expansions, although they can cause the
2060 very next item to have one. They may be contractions, if they are found
2061 earlier in the list.
2065 uint32_t CE
= UCOL_NOT_FOUND
, SecondCE
= UCOL_NOT_FOUND
;
2067 uprv_init_collIterate(src
->UCA
, src
->source
+src
->parsedToken
.charsOffset
, src
->parsedToken
.charsLen
, &s
, status
);
2069 CE
= ucol_getNextCE(src
->UCA
, &s
, status
);
2070 const UChar
*expand
= s
.pos
;
2071 SecondCE
= ucol_getNextCE(src
->UCA
, &s
, status
);
2073 ListList
[src
->resultLen
].baseCE
= CE
& 0xFFFFFF3F;
2074 if(isContinuation(SecondCE
)) {
2075 ListList
[src
->resultLen
].baseContCE
= SecondCE
;
2077 ListList
[src
->resultLen
].baseContCE
= 0;
2079 ListList
[src
->resultLen
].nextCE
= 0;
2080 ListList
[src
->resultLen
].nextContCE
= 0;
2081 ListList
[src
->resultLen
].previousCE
= 0;
2082 ListList
[src
->resultLen
].previousContCE
= 0;
2083 ListList
[src
->resultLen
].indirect
= FALSE
;
2084 sourceToken
= ucol_tok_initAReset(src
, expand
, &expandNext
, parseError
, status
);
2085 } else { /* top == TRUE */
2086 /* just use the supplied values */
2088 ListList
[src
->resultLen
].previousCE
= 0;
2089 ListList
[src
->resultLen
].previousContCE
= 0;
2090 ListList
[src
->resultLen
].indirect
= TRUE
;
2091 ListList
[src
->resultLen
].baseCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startCE
;
2092 ListList
[src
->resultLen
].baseContCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].startContCE
;
2093 ListList
[src
->resultLen
].nextCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].limitCE
;
2094 ListList
[src
->resultLen
].nextContCE
= ucolIndirectBoundaries
[src
->parsedToken
.indirectIndex
].limitContCE
;
2096 sourceToken
= ucol_tok_initAReset(src
, 0, &expandNext
, parseError
, status
);
2099 } else { /* reset to something already in rules */
2103 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
2104 lastToken
= sourceToken
;
2106 if(U_FAILURE(*status
)) {
2111 #ifdef DEBUG_FOR_CODE_POINTS
2116 if(src
->resultLen
> 0 && ListList
[src
->resultLen
-1].first
== NULL
) {
2119 return src
->resultLen
;
2122 const UChar
* ucol_tok_getRulesFromBundle(
2129 const UChar
* rules
= NULL
;
2130 UResourceBundle
* bundle
;
2131 UResourceBundle
* collations
;
2132 UResourceBundle
* collation
;
2136 bundle
= ures_open(U_ICUDATA_COLL
, locale
, status
);
2137 if(U_SUCCESS(*status
)){
2138 collations
= ures_getByKey(bundle
, "collations", NULL
, status
);
2139 if(U_SUCCESS(*status
)){
2140 collation
= ures_getByKey(collations
, type
, NULL
, status
);
2141 if(U_SUCCESS(*status
)){
2142 rules
= ures_getStringByKey(collation
, "Sequence", pLength
, status
);
2143 if(U_FAILURE(*status
)){
2147 ures_close(collation
);
2149 ures_close(collations
);
2158 void ucol_tok_initTokenList(
2159 UColTokenParser
*src
,
2161 uint32_t rulesLength
,
2162 const UCollator
*UCA
,
2163 GetCollationRulesFunction importFunc
,
2165 UErrorCode
*status
) {
2169 uint32_t estimatedSize
= (2*rulesLength
+UCOL_TOK_EXTRA_RULE_SPACE_SIZE
);
2171 bool needToDeallocRules
= false;
2173 if(U_FAILURE(*status
)) {
2177 // set everything to zero, so that we can clean up gracefully
2178 uprv_memset(src
, 0, sizeof(UColTokenParser
));
2180 // first we need to find options that don't like to be normalized,
2181 // like copy and remove...
2182 //const UChar *openBrace = rules;
2183 int32_t optionNumber
= -1;
2184 const UChar
*setStart
= NULL
;
2186 while(i
< rulesLength
) {
2187 if(rules
[i
] == 0x005B) { // '[': start of an option
2188 /* Gets the following:
2189 optionNumber: The index of the option.
2190 setStart: The pointer at which the option arguments start.
2192 optionNumber
= ucol_uprv_tok_readOption(rules
+i
+1, rules
+rulesLength
, &setStart
);
2194 if(optionNumber
== OPTION_OPTIMIZE
) { /* copy - parts of UCA to tailoring */
2196 USet
*newSet
= ucol_uprv_tok_readAndSetUnicodeSet(setStart
, rules
+rulesLength
, status
);
2197 if(U_SUCCESS(*status
)) {
2198 if(src
->copySet
== NULL
) {
2199 src
->copySet
= newSet
;
2201 uset_addAll(src
->copySet
, newSet
);
2207 } else if(optionNumber
== OPTION_SUPPRESS_CONTRACTIONS
) {
2208 USet
*newSet
= ucol_uprv_tok_readAndSetUnicodeSet(setStart
, rules
+rulesLength
, status
);
2209 if(U_SUCCESS(*status
)) {
2210 if(src
->removeSet
== NULL
) {
2211 src
->removeSet
= newSet
;
2213 uset_addAll(src
->removeSet
, newSet
);
2219 } else if(optionNumber
== OPTION_IMPORT
){
2220 // [import <collation-name>]
2222 // Find the address of the closing ].
2223 UChar
* import_end
= u_strchr(setStart
, 0x005D);
2224 int32_t optionEndOffset
= (int32_t)(import_end
+ 1 - rules
);
2225 // Ignore trailing whitespace.
2226 while(uprv_isRuleWhiteSpace(*(import_end
-1))) {
2230 int32_t optionLength
= (int32_t)(import_end
- setStart
);
2232 if(optionLength
>= (int32_t)sizeof(option
)) {
2233 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2236 u_UCharsToChars(setStart
, option
, optionLength
);
2237 option
[optionLength
] = 0;
2239 *status
= U_ZERO_ERROR
;
2242 uloc_forLanguageTag(option
, locale
, (int32_t)sizeof(locale
), &templ
, status
);
2243 if(U_FAILURE(*status
)) {
2244 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2249 if (uloc_getKeywordValue(locale
, "collation", type
, (int32_t)sizeof(type
), status
) <= 0 ||
2252 *status
= U_ZERO_ERROR
;
2253 uprv_strcpy(type
, "standard");
2256 // TODO: Use public functions when available, see ticket #8134.
2257 char *keywords
= (char *)locale_getKeywordsStart(locale
);
2258 if(keywords
!= NULL
) {
2262 int32_t importRulesLength
= 0;
2263 const UChar
* importRules
= importFunc(context
, locale
, type
, &importRulesLength
, status
);
2265 #ifdef DEBUG_FOR_COLL_RULES
2267 UnicodeString(importRules
).toUTF8String(s
);
2268 std::cout
<< "Import rules = " << s
<< std::endl
;
2271 // Add the length of the imported rules to length of the original rules,
2272 // and subtract the length of the import option.
2273 uint32_t newRulesLength
= rulesLength
+ importRulesLength
- (optionEndOffset
- i
);
2275 UChar
* newRules
= (UChar
*)uprv_malloc(newRulesLength
*sizeof(UChar
));
2277 #ifdef DEBUG_FOR_COLL_RULES
2279 UnicodeString(rules
).toUTF8String(s1
);
2280 std::cout
<< "Original rules = " << s1
<< std::endl
;
2284 // Copy the section of the original rules leading up to the import
2285 uprv_memcpy(newRules
, rules
, i
*sizeof(UChar
));
2286 // Copy the imported rules
2287 uprv_memcpy(newRules
+i
, importRules
, importRulesLength
*sizeof(UChar
));
2288 // Copy the rest of the original rules (minus the import option itself)
2289 uprv_memcpy(newRules
+i
+importRulesLength
,
2290 rules
+optionEndOffset
,
2291 (rulesLength
-optionEndOffset
)*sizeof(UChar
));
2293 #ifdef DEBUG_FOR_COLL_RULES
2295 UnicodeString(newRules
).toUTF8String(s2
);
2296 std::cout
<< "Resulting rules = " << s2
<< std::endl
;
2299 if(needToDeallocRules
){
2300 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2301 uprv_free((void*)rules
);
2303 needToDeallocRules
= true;
2305 rulesLength
= newRulesLength
;
2307 estimatedSize
+= importRulesLength
*2;
2309 // First character of the new rules needs to be processed
2317 src
->source
= (UChar
*)uprv_malloc(estimatedSize
*sizeof(UChar
));
2319 if (src
->source
== NULL
) {
2320 *status
= U_MEMORY_ALLOCATION_ERROR
;
2323 uprv_memset(src
->source
, 0, estimatedSize
*sizeof(UChar
));
2324 nSize
= unorm_normalize(rules
, rulesLength
, UNORM_NFD
, 0, src
->source
, estimatedSize
, status
);
2325 if(nSize
> estimatedSize
|| *status
== U_BUFFER_OVERFLOW_ERROR
) {
2326 *status
= U_ZERO_ERROR
;
2327 src
->source
= (UChar
*)uprv_realloc(src
->source
, (nSize
+UCOL_TOK_EXTRA_RULE_SPACE_SIZE
)*sizeof(UChar
));
2329 if (src
->source
== NULL
) {
2330 *status
= U_MEMORY_ALLOCATION_ERROR
;
2333 nSize
= unorm_normalize(rules
, rulesLength
, UNORM_NFD
, 0, src
->source
, nSize
+UCOL_TOK_EXTRA_RULE_SPACE_SIZE
, status
);
2335 if(needToDeallocRules
){
2336 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2337 uprv_free((void*)rules
);
2341 src
->current
= src
->source
;
2342 src
->end
= src
->source
+nSize
;
2343 src
->sourceCurrent
= src
->source
;
2344 src
->extraCurrent
= src
->end
+1; // Preserve terminating zero in the rule string so that option scanning works correctly
2345 src
->extraEnd
= src
->source
+estimatedSize
; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2348 src
->invUCA
= ucol_initInverseUCA(status
);
2349 src
->parsedToken
.charsLen
= 0;
2350 src
->parsedToken
.charsOffset
= 0;
2351 src
->parsedToken
.extensionLen
= 0;
2352 src
->parsedToken
.extensionOffset
= 0;
2353 src
->parsedToken
.prefixLen
= 0;
2354 src
->parsedToken
.prefixOffset
= 0;
2355 src
->parsedToken
.flags
= 0;
2356 src
->parsedToken
.strength
= UCOL_TOK_UNSET
;
2357 src
->buildCCTabFlag
= FALSE
;
2358 src
->isStarred
= FALSE
;
2359 src
->inRange
= FALSE
;
2360 src
->lastRangeCp
= 0;
2361 src
->previousCp
= 0;
2363 if(U_FAILURE(*status
)) {
2366 src
->tailored
= uhash_open(uhash_hashTokens
, uhash_compareTokens
, NULL
, status
);
2367 if(U_FAILURE(*status
)) {
2370 uhash_setValueDeleter(src
->tailored
, uhash_freeBlock
);
2372 src
->opts
= (UColOptionSet
*)uprv_malloc(sizeof(UColOptionSet
));
2374 if (src
->opts
== NULL
) {
2375 *status
= U_MEMORY_ALLOCATION_ERROR
;
2379 uprv_memcpy(src
->opts
, UCA
->options
, sizeof(UColOptionSet
));
2382 src
->listCapacity
= 1024;
2383 src
->lh
= (UColTokListHeader
*)uprv_malloc(src
->listCapacity
*sizeof(UColTokListHeader
));
2385 if (src
->lh
== NULL
) {
2386 *status
= U_MEMORY_ALLOCATION_ERROR
;
2389 uprv_memset(src
->lh
, 0, src
->listCapacity
*sizeof(UColTokListHeader
));
2392 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
2394 // UCOL_RESET_TOP_VALUE
2395 setIndirectBoundaries(0, consts
->UCA_LAST_NON_VARIABLE
, consts
->UCA_FIRST_IMPLICIT
);
2396 // UCOL_FIRST_PRIMARY_IGNORABLE
2397 setIndirectBoundaries(1, consts
->UCA_FIRST_PRIMARY_IGNORABLE
, 0);
2398 // UCOL_LAST_PRIMARY_IGNORABLE
2399 setIndirectBoundaries(2, consts
->UCA_LAST_PRIMARY_IGNORABLE
, 0);
2400 // UCOL_FIRST_SECONDARY_IGNORABLE
2401 setIndirectBoundaries(3, consts
->UCA_FIRST_SECONDARY_IGNORABLE
, 0);
2402 // UCOL_LAST_SECONDARY_IGNORABLE
2403 setIndirectBoundaries(4, consts
->UCA_LAST_SECONDARY_IGNORABLE
, 0);
2404 // UCOL_FIRST_TERTIARY_IGNORABLE
2405 setIndirectBoundaries(5, consts
->UCA_FIRST_TERTIARY_IGNORABLE
, 0);
2406 // UCOL_LAST_TERTIARY_IGNORABLE
2407 setIndirectBoundaries(6, consts
->UCA_LAST_TERTIARY_IGNORABLE
, 0);
2408 // UCOL_FIRST_VARIABLE
2409 setIndirectBoundaries(7, consts
->UCA_FIRST_VARIABLE
, 0);
2410 // UCOL_LAST_VARIABLE
2411 setIndirectBoundaries(8, consts
->UCA_LAST_VARIABLE
, 0);
2412 // UCOL_FIRST_NON_VARIABLE
2413 setIndirectBoundaries(9, consts
->UCA_FIRST_NON_VARIABLE
, 0);
2414 // UCOL_LAST_NON_VARIABLE
2415 setIndirectBoundaries(10, consts
->UCA_LAST_NON_VARIABLE
, consts
->UCA_FIRST_IMPLICIT
);
2416 // UCOL_FIRST_IMPLICIT
2417 setIndirectBoundaries(11, consts
->UCA_FIRST_IMPLICIT
, 0);
2418 // UCOL_LAST_IMPLICIT
2419 setIndirectBoundaries(12, consts
->UCA_LAST_IMPLICIT
, consts
->UCA_FIRST_TRAILING
);
2420 // UCOL_FIRST_TRAILING
2421 setIndirectBoundaries(13, consts
->UCA_FIRST_TRAILING
, 0);
2422 // UCOL_LAST_TRAILING
2423 setIndirectBoundaries(14, consts
->UCA_LAST_TRAILING
, 0);
2424 ucolIndirectBoundaries
[14].limitCE
= (consts
->UCA_PRIMARY_SPECIAL_MIN
<<24);
2428 void ucol_tok_closeTokenList(UColTokenParser
*src
) {
2429 if(src
->copySet
!= NULL
) {
2430 uset_close(src
->copySet
);
2432 if(src
->removeSet
!= NULL
) {
2433 uset_close(src
->removeSet
);
2435 if(src
->tailored
!= NULL
) {
2436 uhash_close(src
->tailored
);
2438 if(src
->lh
!= NULL
) {
2441 if(src
->source
!= NULL
) {
2442 uprv_free(src
->source
);
2444 if(src
->opts
!= NULL
) {
2445 uprv_free(src
->opts
);
2447 if (src
->reorderCodes
!= NULL
) {
2448 uprv_free(src
->reorderCodes
);
2452 #endif /* #if !UCONFIG_NO_COLLATION */