2 **********************************************************************
3 * Copyright (C) 1999-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/17/99 aliu Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_TRANSLITERATION
15 #include "unicode/uobject.h"
16 #include "unicode/parseerr.h"
17 #include "unicode/parsepos.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uniset.h"
22 #include "unicode/utf16.h"
33 #include "unicode/symtable.h"
37 #include "patternprops.h"
// Rule operator characters, each expressed as a single UTF-16 code unit.
44 #define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/
45 #define FORWARD_RULE_OP ((UChar)0x003E) /*>*/
46 #define REVERSE_RULE_OP ((UChar)0x003C) /*<*/
47 #define FWDREV_RULE_OP ((UChar)0x007E) /*~*/ // internal rep of <> op
49 // Other special characters
// Quoting, escaping, rule termination and comment introducer.
50 #define QUOTE ((UChar)0x0027) /*'*/
51 #define ESCAPE ((UChar)0x005C) /*\*/
52 #define END_OF_RULE ((UChar)0x003B) /*;*/
53 #define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/
// Segment delimiters, context markers, cursor, anchor and quantifier characters.
55 #define SEGMENT_OPEN ((UChar)0x0028) /*(*/
56 #define SEGMENT_CLOSE ((UChar)0x0029) /*)*/
57 #define CONTEXT_ANTE ((UChar)0x007B) /*{*/
58 #define CONTEXT_POST ((UChar)0x007D) /*}*/
59 #define CURSOR_POS ((UChar)0x007C) /*|*/
60 #define CURSOR_OFFSET ((UChar)0x0040) /*@*/
61 #define ANCHOR_START ((UChar)0x005E) /*^*/
62 #define KLEENE_STAR ((UChar)0x002A) /***/
63 #define ONE_OR_MORE ((UChar)0x002B) /*+*/
64 #define ZERO_OR_ONE ((UChar)0x003F) /*?*/
// Written in decimal, unlike the macros above: 46 == 0x002E, the '.' character.
66 #define DOT ((UChar)46) /*.*/
68 static const UChar DOT_SET
[] = { // "[^[:Zp:][:Zl:]\r\n$]";
69 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,
70 108, 58, 93, 92, 114, 92, 110, 36, 93, 0
73 // A function is denoted &Source-Target/Variant(text)
74 #define FUNCTION ((UChar)38) /*&*/
76 // Aliases for some of the syntax characters. These are provided so
77 // transliteration rules can be expressed in XML without clashing with
78 // XML syntax characters '<', '>', and '&'.
79 #define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow
80 #define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow
81 #define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow
82 #define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta)
84 // Special characters disallowed at the top level
85 static const UChar ILLEGAL_TOP
[] = {41,0}; // ")"
87 // Special characters disallowed within a segment
88 static const UChar ILLEGAL_SEG
[] = {123,125,124,64,0}; // "{}|@"
90 // Special characters disallowed within a function argument
91 static const UChar ILLEGAL_FUNC
[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@"
93 // By definition, the ANCHOR_END special character is a
94 // trailing SymbolTable.SYMBOL_REF character.
95 // private static final char ANCHOR_END = '$';
97 static const UChar gOPERATORS
[] = { // "=><"
98 VARIABLE_DEF_OP
, FORWARD_RULE_OP
, REVERSE_RULE_OP
,
99 ALT_FORWARD_RULE_OP
, ALT_REVERSE_RULE_OP
, ALT_FWDREV_RULE_OP
,
103 static const UChar HALF_ENDERS
[] = { // "=><;"
104 VARIABLE_DEF_OP
, FORWARD_RULE_OP
, REVERSE_RULE_OP
,
105 ALT_FORWARD_RULE_OP
, ALT_REVERSE_RULE_OP
, ALT_FWDREV_RULE_OP
,
110 // These are also used in Transliterator::toRules()
111 static const int32_t ID_TOKEN_LEN
= 2;
112 static const UChar ID_TOKEN
[] = { 0x3A, 0x3A }; // ':', ':'
115 commented out until we do real ::BEGIN/::END functionality
116 static const int32_t BEGIN_TOKEN_LEN = 5;
117 static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'
119 static const int32_t END_TOKEN_LEN = 3;
120 static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
125 //----------------------------------------------------------------------
127 //----------------------------------------------------------------------
130 * This class implements the SymbolTable interface. It is used
131 * during parsing to give UnicodeSet access to variables that
132 * have been defined so far. Note that it uses variablesVector,
133 * _not_ data.setVariables.
135 class ParseData
: public UMemory
, public SymbolTable
{
137 const TransliterationRuleData
* data
; // alias
139 const UVector
* variablesVector
; // alias
141 const Hashtable
* variableNames
; // alias
143 ParseData(const TransliterationRuleData
* data
= 0,
144 const UVector
* variablesVector
= 0,
145 const Hashtable
* variableNames
= 0);
147 virtual ~ParseData();
149 virtual const UnicodeString
* lookup(const UnicodeString
& s
) const;
151 virtual const UnicodeFunctor
* lookupMatcher(UChar32 ch
) const;
153 virtual UnicodeString
parseReference(const UnicodeString
& text
,
154 ParsePosition
& pos
, int32_t limit
) const;
156 * Return true if the given character is a matcher standin or a plain
157 * character (non standin).
159 UBool
isMatcher(UChar32 ch
);
162 * Return true if the given character is a replacer standin or a plain
163 * character (non standin).
165 UBool
isReplacer(UChar32 ch
);
168 ParseData(const ParseData
&other
); // forbid copying of this class
169 ParseData
&operator=(const ParseData
&other
); // forbid copying of this class
172 ParseData::ParseData(const TransliterationRuleData
* d
,
174 const Hashtable
* vNames
) :
175 data(d
), variablesVector(sets
), variableNames(vNames
) {}
177 ParseData::~ParseData() {}
180 * Implement SymbolTable API.
182 const UnicodeString
* ParseData::lookup(const UnicodeString
& name
) const {
183 return (const UnicodeString
*) variableNames
->get(name
);
187 * Implement SymbolTable API.
189 const UnicodeFunctor
* ParseData::lookupMatcher(UChar32 ch
) const {
190 // Note that we cannot use data.lookupSet() because the
191 // set array has not been constructed yet.
192 const UnicodeFunctor
* set
= NULL
;
193 int32_t i
= ch
- data
->variablesBase
;
194 if (i
>= 0 && i
< variablesVector
->size()) {
195 int32_t i
= ch
- data
->variablesBase
;
196 set
= (i
< variablesVector
->size()) ?
197 (UnicodeFunctor
*) variablesVector
->elementAt(i
) : 0;
203 * Implement SymbolTable API. Parse out a symbol reference
206 UnicodeString
ParseData::parseReference(const UnicodeString
& text
,
207 ParsePosition
& pos
, int32_t limit
) const {
208 int32_t start
= pos
.getIndex();
210 UnicodeString result
;
212 UChar c
= text
.charAt(i
);
213 if ((i
==start
&& !u_isIDStart(c
)) || !u_isIDPart(c
)) {
218 if (i
== start
) { // No valid name chars
219 return result
; // Indicate failure with empty string
222 text
.extractBetween(start
, i
, result
);
226 UBool
ParseData::isMatcher(UChar32 ch
) {
227 // Note that we cannot use data.lookup() because the
228 // set array has not been constructed yet.
229 int32_t i
= ch
- data
->variablesBase
;
230 if (i
>= 0 && i
< variablesVector
->size()) {
231 UnicodeFunctor
*f
= (UnicodeFunctor
*) variablesVector
->elementAt(i
);
232 return f
!= NULL
&& f
->toMatcher() != NULL
;
238 * Return true if the given character is a replacer standin or a plain
239 * character (non standin).
241 UBool
ParseData::isReplacer(UChar32 ch
) {
242 // Note that we cannot use data.lookup() because the
243 // set array has not been constructed yet.
244 int i
= ch
- data
->variablesBase
;
245 if (i
>= 0 && i
< variablesVector
->size()) {
246 UnicodeFunctor
*f
= (UnicodeFunctor
*) variablesVector
->elementAt(i
);
247 return f
!= NULL
&& f
->toReplacer() != NULL
;
252 //----------------------------------------------------------------------
254 //----------------------------------------------------------------------
257 * A class representing one side of a rule. This class knows how to
258 * parse half of a rule. It is tightly coupled to the method
259 * RuleBasedTransliterator.Parser.parseRule().
261 class RuleHalf
: public UMemory
{
267 int32_t cursor
; // position of cursor in text
268 int32_t ante
; // position of ante context marker '{' in text
269 int32_t post
; // position of post context marker '}' in text
271 // Record the offset to the cursor either to the left or to the
272 // right of the key. This is indicated by characters on the output
273 // side that allow the cursor to be positioned arbitrarily within
274 // the matching text. For example, abc{def} > | @@@ xyz; changes
275 // def to xyz and moves the cursor to before abc. Offset characters
276 // must be at the start or end, and they cannot move the cursor past
277 // the ante- or postcontext text. Placeholders are only valid in
278 // output text. The length of the ante and post context is
279 // determined at runtime, because of supplementals and quantifiers.
280 int32_t cursorOffset
; // only nonzero on output side
282 // Position of first CURSOR_OFFSET on _right_. This will be -1
283 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
284 int32_t cursorOffsetPos
;
290 * The segment number from 1..n of the next '(' we see
291 * during parsing; 1-based.
293 int32_t nextSegmentNumber
;
295 TransliteratorParser
& parser
;
297 //--------------------------------------------------
300 RuleHalf(TransliteratorParser
& parser
);
303 int32_t parse(const UnicodeString
& rule
, int32_t pos
, int32_t limit
, UErrorCode
& status
);
305 int32_t parseSection(const UnicodeString
& rule
, int32_t pos
, int32_t limit
,
307 const UnicodeString
& illegal
,
314 void removeContext();
317 * Return true if this half looks like valid output, that is, does not
318 * contain quantifiers or other special input-only elements.
320 UBool
isValidOutput(TransliteratorParser
& parser
);
323 * Return true if this half looks like valid input, that is, does not
324 * contain functions or other special output-only elements.
326 UBool
isValidInput(TransliteratorParser
& parser
);
328 int syntaxError(UErrorCode code
,
329 const UnicodeString
& rule
,
331 UErrorCode
& status
) {
332 return parser
.syntaxError(code
, rule
, start
, status
);
336 // Disallowed methods; no impl.
337 RuleHalf(const RuleHalf
&);
338 RuleHalf
& operator=(const RuleHalf
&);
341 RuleHalf::RuleHalf(TransliteratorParser
& p
) :
349 anchorStart
= anchorEnd
= FALSE
;
350 nextSegmentNumber
= 1;
353 RuleHalf::~RuleHalf() {
357 * Parse one side of a rule, stopping at either the limit,
358 * the END_OF_RULE character, or an operator.
359 * @return the index after the terminating character, or
360 * if limit was reached, limit
362 int32_t RuleHalf::parse(const UnicodeString
& rule
, int32_t pos
, int32_t limit
, UErrorCode
& status
) {
365 pos
= parseSection(rule
, pos
, limit
, text
, UnicodeString(TRUE
, ILLEGAL_TOP
, -1), FALSE
, status
);
367 if (cursorOffset
> 0 && cursor
!= cursorOffsetPos
) {
368 return syntaxError(U_MISPLACED_CURSOR_OFFSET
, rule
, start
, status
);
375 * Parse a section of one side of a rule, stopping at either
376 * the limit, the END_OF_RULE character, an operator, or a
377 * segment close character. This method parses both a
378 * top-level rule half and a segment within such a rule half.
379 * It calls itself recursively to parse segments and nested
381 * @param buf buffer into which to accumulate the rule pattern
382 * characters, either literal characters from the rule or
383 * standins for UnicodeMatcher objects including segments.
384 * @param illegal the set of special characters that is illegal during
386 * @param isSegment if true, then we've already seen a '(' and
387 * pos on entry points right after it. Accumulate everything
388 * up to the closing ')', put it in a segment matcher object,
389 * generate a standin for it, and add the standin to buf. As
390 * a side effect, update the segments vector with a reference
391 * to the segment matcher. This works recursively for nested
392 * segments. If isSegment is false, just accumulate
393 * characters into buf.
394 * @return the index after the terminating character, or
395 * if limit was reached, limit
397 int32_t RuleHalf::parseSection(const UnicodeString
& rule
, int32_t pos
, int32_t limit
,
399 const UnicodeString
& illegal
,
400 UBool isSegment
, UErrorCode
& status
) {
403 UnicodeString scratch
;
405 int32_t quoteStart
= -1; // Most recent 'single quoted string'
406 int32_t quoteLimit
= -1;
407 int32_t varStart
= -1; // Most recent $variableReference
408 int32_t varLimit
= -1;
409 int32_t bufStart
= buf
.length();
411 while (pos
< limit
&& !done
) {
412 // Since all syntax characters are in the BMP, fetching
413 // 16-bit code units suffices here.
414 UChar c
= rule
.charAt(pos
++);
415 if (PatternProps::isWhiteSpace(c
)) {
416 // Ignore whitespace. Note that this is not Unicode
417 // spaces, but Java spaces -- a subset, representing
418 // whitespace likely to be seen in code.
421 if (u_strchr(HALF_ENDERS
, c
) != NULL
) {
424 return syntaxError(U_UNCLOSED_SEGMENT
, rule
, start
, status
);
429 // Text after a presumed end anchor is a syntax err
430 return syntaxError(U_MALFORMED_VARIABLE_REFERENCE
, rule
, start
, status
);
432 if (UnicodeSet::resemblesPattern(rule
, pos
-1)) {
433 pp
.setIndex(pos
-1); // Backup to opening '['
434 buf
.append(parser
.parseSet(rule
, pp
, status
));
435 if (U_FAILURE(status
)) {
436 return syntaxError(U_MALFORMED_SET
, rule
, start
, status
);
444 return syntaxError(U_TRAILING_BACKSLASH
, rule
, start
, status
);
446 UChar32 escaped
= rule
.unescapeAt(pos
); // pos is already past '\\'
447 if (escaped
== (UChar32
) -1) {
448 return syntaxError(U_MALFORMED_UNICODE_ESCAPE
, rule
, start
, status
);
450 if (!parser
.checkVariableRange(escaped
)) {
451 return syntaxError(U_VARIABLE_RANGE_OVERLAP
, rule
, start
, status
);
456 // Handle quoted matter
458 int32_t iq
= rule
.indexOf(QUOTE
, pos
);
460 buf
.append(c
); // Parse [''] outside quotes as [']
463 /* This loop picks up a run of quoted text of the
464 * form 'aaaa' each time through. If this run
465 * hasn't really ended ('aaaa''bbbb') then it keeps
466 * looping, each time adding on a new run. When it
467 * reaches the final quote it breaks.
469 quoteStart
= buf
.length();
472 return syntaxError(U_UNTERMINATED_QUOTE
, rule
, start
, status
);
475 rule
.extractBetween(pos
, iq
, scratch
);
478 if (pos
< limit
&& rule
.charAt(pos
) == QUOTE
) {
479 // Parse [''] inside quotes as [']
480 iq
= rule
.indexOf(QUOTE
, pos
+1);
486 quoteLimit
= buf
.length();
488 for (iq
=quoteStart
; iq
<quoteLimit
; ++iq
) {
489 if (!parser
.checkVariableRange(buf
.charAt(iq
))) {
490 return syntaxError(U_VARIABLE_RANGE_OVERLAP
, rule
, start
, status
);
497 if (!parser
.checkVariableRange(c
)) {
498 return syntaxError(U_VARIABLE_RANGE_OVERLAP
, rule
, start
, status
);
501 if (illegal
.indexOf(c
) >= 0) {
502 syntaxError(U_ILLEGAL_CHARACTER
, rule
, start
, status
);
507 //------------------------------------------------------
508 // Elements allowed within and out of segments
509 //------------------------------------------------------
511 if (buf
.length() == 0 && !anchorStart
) {
514 return syntaxError(U_MISPLACED_ANCHOR_START
,
515 rule
, start
, status
);
520 // bufSegStart is the offset in buf to the first
521 // character of the segment we are parsing.
522 int32_t bufSegStart
= buf
.length();
524 // Record segment number now, since nextSegmentNumber
525 // will be incremented during the call to parseSection
526 // if there are nested segments.
527 int32_t segmentNumber
= nextSegmentNumber
++; // 1-based
530 pos
= parseSection(rule
, pos
, limit
, buf
, UnicodeString(TRUE
, ILLEGAL_SEG
, -1), TRUE
, status
);
532 // After parsing a segment, the relevant characters are
533 // in buf, starting at offset bufSegStart. Extract them
534 // into a string matcher, and replace them with a
535 // standin for that matcher.
537 new StringMatcher(buf
, bufSegStart
, buf
.length(),
538 segmentNumber
, *parser
.curData
);
540 return syntaxError(U_MEMORY_ALLOCATION_ERROR
, rule
, start
, status
);
543 // Record and associate object and segment number
544 parser
.setSegmentObject(segmentNumber
, m
, status
);
545 buf
.truncate(bufSegStart
);
546 buf
.append(parser
.getSegmentStandin(segmentNumber
, status
));
553 TransliteratorIDParser::SingleID
* single
=
554 TransliteratorIDParser::parseFilterID(rule
, iref
);
555 // The next character MUST be a segment open
556 if (single
== NULL
||
557 !ICU_Utility::parseChar(rule
, iref
, SEGMENT_OPEN
)) {
558 return syntaxError(U_INVALID_FUNCTION
, rule
, start
, status
);
561 Transliterator
*t
= single
->createInstance();
564 return syntaxError(U_INVALID_FUNCTION
, rule
, start
, status
);
567 // bufSegStart is the offset in buf to the first
568 // character of the segment we are parsing.
569 int32_t bufSegStart
= buf
.length();
572 pos
= parseSection(rule
, iref
, limit
, buf
, UnicodeString(TRUE
, ILLEGAL_FUNC
, -1), TRUE
, status
);
574 // After parsing a segment, the relevant characters are
575 // in buf, starting at offset bufSegStart.
576 UnicodeString output
;
577 buf
.extractBetween(bufSegStart
, buf
.length(), output
);
578 FunctionReplacer
*r
=
579 new FunctionReplacer(t
, new StringReplacer(output
, parser
.curData
));
581 return syntaxError(U_MEMORY_ALLOCATION_ERROR
, rule
, start
, status
);
584 // Replace the buffer contents with a stand-in
585 buf
.truncate(bufSegStart
);
586 buf
.append(parser
.generateStandInFor(r
, status
));
589 case SymbolTable::SYMBOL_REF
:
590 // Handle variable references and segment references "$1" .. "$9"
592 // A variable reference must be followed immediately
593 // by a Unicode identifier start and zero or more
594 // Unicode identifier part characters, or by a digit
595 // 1..9 if it is a segment reference.
597 // A variable ref character at the end acts as
598 // an anchor to the context limit, as in perl.
602 // Parse "$1" "$2" .. "$9" .. (no upper limit)
603 c
= rule
.charAt(pos
);
604 int32_t r
= u_digit(c
, 10);
605 if (r
>= 1 && r
<= 9) {
606 r
= ICU_Utility::parseNumber(rule
, pos
, 10);
608 return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE
,
609 rule
, start
, status
);
611 buf
.append(parser
.getSegmentStandin(r
, status
));
614 UnicodeString name
= parser
.parseData
->
615 parseReference(rule
, pp
, limit
);
616 if (name
.length() == 0) {
617 // This means the '$' was not followed by a
618 // valid name. Try to interpret it as an
619 // end anchor then. If this also doesn't work
620 // (if we see a following character) then signal
626 // If this is a variable definition statement,
627 // then the LHS variable will be undefined. In
628 // that case appendVariableDef() will append the
629 // special placeholder char variableLimit-1.
630 varStart
= buf
.length();
631 parser
.appendVariableDef(name
, buf
, status
);
632 varLimit
= buf
.length();
637 buf
.append(parser
.getDotStandIn(status
));
642 // Quantifiers. We handle single characters, quoted strings,
643 // variable references, and segments.
645 // 'foo'+ matches foofoofoo
646 // $v+ matches xyxyxy if $v == xy
647 // (seg)+ matches segsegseg
649 if (isSegment
&& buf
.length() == bufStart
) {
650 // The */+ immediately follows '('
651 return syntaxError(U_MISPLACED_QUANTIFIER
, rule
, start
, status
);
654 int32_t qstart
, qlimit
;
655 // The */+ follows an isolated character or quote
656 // or variable reference
657 if (buf
.length() == quoteLimit
) {
658 // The */+ follows a 'quoted string'
661 } else if (buf
.length() == varLimit
) {
662 // The */+ follows a $variableReference
666 // The */+ follows a single character, possibly
668 qstart
= buf
.length() - 1;
673 new StringMatcher(buf
, qstart
, qlimit
, 0, *parser
.curData
);
675 return syntaxError(U_MEMORY_ALLOCATION_ERROR
, rule
, start
, status
);
678 int32_t max
= Quantifier::MAX
;
688 // do nothing -- min, max already set
690 m
= new Quantifier(m
, min
, max
);
692 return syntaxError(U_MEMORY_ALLOCATION_ERROR
, rule
, start
, status
);
694 buf
.truncate(qstart
);
695 buf
.append(parser
.generateStandInFor(m
, status
));
699 //------------------------------------------------------
700 // Elements allowed ONLY WITHIN segments
701 //------------------------------------------------------
703 // assert(isSegment);
704 // We're done parsing a segment.
708 //------------------------------------------------------
709 // Elements allowed ONLY OUTSIDE segments
710 //------------------------------------------------------
713 return syntaxError(U_MULTIPLE_ANTE_CONTEXTS
, rule
, start
, status
);
719 return syntaxError(U_MULTIPLE_POST_CONTEXTS
, rule
, start
, status
);
725 return syntaxError(U_MULTIPLE_CURSORS
, rule
, start
, status
);
727 cursor
= buf
.length();
730 if (cursorOffset
< 0) {
731 if (buf
.length() > 0) {
732 return syntaxError(U_MISPLACED_CURSOR_OFFSET
, rule
, start
, status
);
735 } else if (cursorOffset
> 0) {
736 if (buf
.length() != cursorOffsetPos
|| cursor
>= 0) {
737 return syntaxError(U_MISPLACED_CURSOR_OFFSET
, rule
, start
, status
);
741 if (cursor
== 0 && buf
.length() == 0) {
743 } else if (cursor
< 0) {
744 cursorOffsetPos
= buf
.length();
747 return syntaxError(U_MISPLACED_CURSOR_OFFSET
, rule
, start
, status
);
753 //------------------------------------------------------
754 // Non-special characters
755 //------------------------------------------------------
757 // Disallow unquoted characters other than [0-9A-Za-z]
758 // in the printable ASCII range. These characters are
759 // reserved for possible future use.
760 if (c
>= 0x0021 && c
<= 0x007E &&
761 !((c
>= 0x0030/*'0'*/ && c
<= 0x0039/*'9'*/) ||
762 (c
>= 0x0041/*'A'*/ && c
<= 0x005A/*'Z'*/) ||
763 (c
>= 0x0061/*'a'*/ && c
<= 0x007A/*'z'*/))) {
764 return syntaxError(U_UNQUOTED_SPECIAL
, rule
, start
, status
);
777 void RuleHalf::removeContext() {
778 //text = text.substring(ante < 0 ? 0 : ante,
779 // post < 0 ? text.length() : post);
784 text
.removeBetween(0, ante
);
787 anchorStart
= anchorEnd
= FALSE
;
791 * Return true if this half looks like valid output, that is, does not
792 * contain quantifiers or other special input-only elements.
794 UBool
RuleHalf::isValidOutput(TransliteratorParser
& transParser
) {
795 for (int32_t i
=0; i
<text
.length(); ) {
796 UChar32 c
= text
.char32At(i
);
798 if (!transParser
.parseData
->isReplacer(c
)) {
806 * Return true if this half looks like valid input, that is, does not
807 * contain functions or other special output-only elements.
809 UBool
RuleHalf::isValidInput(TransliteratorParser
& transParser
) {
810 for (int32_t i
=0; i
<text
.length(); ) {
811 UChar32 c
= text
.char32At(i
);
813 if (!transParser
.parseData
->isMatcher(c
)) {
820 //----------------------------------------------------------------------
822 //----------------------------------------------------------------------
827 TransliteratorParser::TransliteratorParser(UErrorCode
&statusReturn
) :
828 dataVector(statusReturn
),
829 idBlockVector(statusReturn
),
830 variablesVector(statusReturn
),
831 segmentObjects(statusReturn
)
833 idBlockVector
.setDeleter(uprv_deleteUObject
);
835 compoundFilter
= NULL
;
837 variableNames
.setValueDeleter(uprv_deleteUObject
);
843 TransliteratorParser::~TransliteratorParser() {
844 while (!dataVector
.isEmpty())
845 delete (TransliterationRuleData
*)(dataVector
.orphanElementAt(0));
846 delete compoundFilter
;
848 while (!variablesVector
.isEmpty())
849 delete (UnicodeFunctor
*)variablesVector
.orphanElementAt(0);
853 TransliteratorParser::parse(const UnicodeString
& rules
,
854 UTransDirection transDirection
,
858 parseRules(rules
, transDirection
, ec
);
864 * Return the compound filter parsed by parse(). Caller owns result.
866 UnicodeSet
* TransliteratorParser::orphanCompoundFilter() {
867 UnicodeSet
* f
= compoundFilter
;
868 compoundFilter
= NULL
;
872 //----------------------------------------------------------------------
873 // Private implementation
874 //----------------------------------------------------------------------
877 * Parse the given string as a sequence of rules, separated by newline
878 * characters ('\n'), and cause this object to implement those rules. Any
879 * previous rules are discarded. Typically this method is called exactly
880 * once, during construction.
881 * @exception IllegalArgumentException if there is a syntax error in the
884 void TransliteratorParser::parseRules(const UnicodeString
& rule
,
885 UTransDirection theDirection
,
888 // Clear error struct
889 uprv_memset(&parseError
, 0, sizeof(parseError
));
890 parseError
.line
= parseError
.offset
= -1;
892 UBool parsingIDs
= TRUE
;
893 int32_t ruleCount
= 0;
895 while (!dataVector
.isEmpty()) {
896 delete (TransliterationRuleData
*)(dataVector
.orphanElementAt(0));
898 if (U_FAILURE(status
)) {
902 idBlockVector
.removeAllElements();
904 direction
= theDirection
;
907 delete compoundFilter
;
908 compoundFilter
= NULL
;
910 while (!variablesVector
.isEmpty()) {
911 delete (UnicodeFunctor
*)variablesVector
.orphanElementAt(0);
913 variableNames
.removeAll();
914 parseData
= new ParseData(0, &variablesVector
, &variableNames
);
915 if (parseData
== NULL
) {
916 status
= U_MEMORY_ALLOCATION_ERROR
;
920 dotStandIn
= (UChar
) -1;
922 UnicodeString
*tempstr
= NULL
; // used for memory allocation error checking
923 UnicodeString str
; // scratch
924 UnicodeString idBlockResult
;
926 int32_t limit
= rule
.length();
928 // The compound filter offset is an index into idBlockResult.
929 // If it is 0, then the compound filter occurred at the start,
930 // and it is the offset to the _start_ of the compound filter
931 // pattern. Otherwise it is the offset to the _limit_ of the
932 // compound filter pattern within idBlockResult.
933 compoundFilter
= NULL
;
934 int32_t compoundFilterOffset
= -1;
936 while (pos
< limit
&& U_SUCCESS(status
)) {
937 UChar c
= rule
.charAt(pos
++);
938 if (PatternProps::isWhiteSpace(c
)) {
939 // Ignore leading whitespace.
942 // Skip lines starting with the comment character
943 if (c
== RULE_COMMENT_CHAR
) {
944 pos
= rule
.indexOf((UChar
)0x000A /*\n*/, pos
) + 1;
946 break; // No "\n" found; rest of rule is a comment
948 continue; // Either fall out or restart with next line
952 if (c
== END_OF_RULE
)
955 // keep track of how many rules we've seen
958 // We've found the start of a rule or ID. c is its first
959 // character, and pos points past c.
961 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1
963 if ((pos
+ ID_TOKEN_LEN
+ 1) <= limit
&&
964 rule
.compare(pos
, ID_TOKEN_LEN
, ID_TOKEN
) == 0) {
966 c
= rule
.charAt(pos
);
967 while (PatternProps::isWhiteSpace(c
) && pos
< limit
) {
969 c
= rule
.charAt(pos
);
975 if (curData
!= NULL
) {
976 if (direction
== UTRANS_FORWARD
)
977 dataVector
.addElement(curData
, status
);
979 dataVector
.insertElementAt(curData
, 0, status
);
985 TransliteratorIDParser::SingleID
* id
=
986 TransliteratorIDParser::parseSingleID(rule
, p
, direction
, status
);
987 if (p
!= pos
&& ICU_Utility::parseChar(rule
, p
, END_OF_RULE
)) {
988 // Successful ::ID parse.
990 if (direction
== UTRANS_FORWARD
) {
991 idBlockResult
.append(id
->canonID
).append(END_OF_RULE
);
993 idBlockResult
.insert(0, END_OF_RULE
);
994 idBlockResult
.insert(0, id
->canonID
);
998 // Couldn't parse an ID. Try to parse a global filter
999 int32_t withParens
= -1;
1000 UnicodeSet
* f
= TransliteratorIDParser::parseGlobalFilter(rule
, p
, direction
, withParens
, NULL
);
1002 if (ICU_Utility::parseChar(rule
, p
, END_OF_RULE
)
1003 && (direction
== UTRANS_FORWARD
) == (withParens
== 0))
1005 if (compoundFilter
!= NULL
) {
1006 // Multiple compound filters
1007 syntaxError(U_MULTIPLE_COMPOUND_FILTERS
, rule
, pos
, status
);
1011 compoundFilterOffset
= ruleCount
;
1018 // Can be parsed as neither an ID nor a global filter
1019 syntaxError(U_INVALID_ID
, rule
, pos
, status
);
1026 tempstr
= new UnicodeString(idBlockResult
);
1027 // NULL pointer check
1028 if (tempstr
== NULL
) {
1029 status
= U_MEMORY_ALLOCATION_ERROR
;
1032 if (direction
== UTRANS_FORWARD
)
1033 idBlockVector
.addElement(tempstr
, status
);
1035 idBlockVector
.insertElementAt(tempstr
, 0, status
);
1036 idBlockResult
.remove();
1038 curData
= new TransliterationRuleData(status
);
1039 // NULL pointer check
1040 if (curData
== NULL
) {
1041 status
= U_MEMORY_ALLOCATION_ERROR
;
1044 parseData
->data
= curData
;
1046 // By default, rules use part of the private use area
1047 // E000..F8FF for variables and other stand-ins. Currently
1048 // the range F000..F8FF is typically sufficient. The 'use
1049 // variable range' pragma allows rule sets to modify this.
1050 setVariableRange(0xF000, 0xF8FF, status
);
1053 if (resemblesPragma(rule
, pos
, limit
)) {
1054 int32_t ppp
= parsePragma(rule
, pos
, limit
, status
);
1056 syntaxError(U_MALFORMED_PRAGMA
, rule
, pos
, status
);
1061 pos
= parseRule(rule
, pos
, limit
, status
);
1066 if (parsingIDs
&& idBlockResult
.length() > 0) {
1067 tempstr
= new UnicodeString(idBlockResult
);
1068 // NULL pointer check
1069 if (tempstr
== NULL
) {
1070 status
= U_MEMORY_ALLOCATION_ERROR
;
1073 if (direction
== UTRANS_FORWARD
)
1074 idBlockVector
.addElement(tempstr
, status
);
1076 idBlockVector
.insertElementAt(tempstr
, 0, status
);
1078 else if (!parsingIDs
&& curData
!= NULL
) {
1079 if (direction
== UTRANS_FORWARD
)
1080 dataVector
.addElement(curData
, status
);
1082 dataVector
.insertElementAt(curData
, 0, status
);
1085 if (U_SUCCESS(status
)) {
1086 // Convert the set vector to an array
1087 int32_t i
, dataVectorSize
= dataVector
.size();
1088 for (i
= 0; i
< dataVectorSize
; i
++) {
1089 TransliterationRuleData
* data
= (TransliterationRuleData
*)dataVector
.elementAt(i
);
1090 data
->variablesLength
= variablesVector
.size();
1091 if (data
->variablesLength
== 0) {
1092 data
->variables
= 0;
1094 data
->variables
= (UnicodeFunctor
**)uprv_malloc(data
->variablesLength
* sizeof(UnicodeFunctor
*));
1095 // NULL pointer check
1096 if (data
->variables
== NULL
) {
1097 status
= U_MEMORY_ALLOCATION_ERROR
;
1100 data
->variablesAreOwned
= (i
== 0);
1103 for (int32_t j
= 0; j
< data
->variablesLength
; j
++) {
1104 data
->variables
[j
] =
1105 ((UnicodeSet
*)variablesVector
.elementAt(j
));
1108 data
->variableNames
.removeAll();
1110 const UHashElement
* he
= variableNames
.nextElement(pos
);
1111 while (he
!= NULL
) {
1112 UnicodeString
* tempus
= (UnicodeString
*)(((UnicodeString
*)(he
->value
.pointer
))->clone());
1113 if (tempus
== NULL
) {
1114 status
= U_MEMORY_ALLOCATION_ERROR
;
1117 data
->variableNames
.put(*((UnicodeString
*)(he
->key
.pointer
)),
1119 he
= variableNames
.nextElement(pos
);
1122 variablesVector
.removeAllElements(); // keeps them from getting deleted when we succeed
1125 if (compoundFilter
!= NULL
) {
1126 if ((direction
== UTRANS_FORWARD
&& compoundFilterOffset
!= 1) ||
1127 (direction
== UTRANS_REVERSE
&& compoundFilterOffset
!= ruleCount
)) {
1128 status
= U_MISPLACED_COMPOUND_FILTER
;
1132 for (i
= 0; i
< dataVectorSize
; i
++) {
1133 TransliterationRuleData
* data
= (TransliterationRuleData
*)dataVector
.elementAt(i
);
1134 data
->ruleSet
.freeze(parseError
, status
);
1136 if (idBlockVector
.size() == 1 && ((UnicodeString
*)idBlockVector
.elementAt(0))->isEmpty()) {
1137 idBlockVector
.removeElementAt(0);
1143 * Set the variable range to [start, end] (inclusive).
// Checks the requested [start, end] range and records it on the parser:
// curData->variablesBase gets start, and, only while dataVector is still
// empty, variableNext/variableLimit are set as well.
1145 void TransliteratorParser::setVariableRange(int32_t start
, int32_t end
, UErrorCode
& status
) {
// Reject an inverted range and bounds outside 0..0xFFFF.
1146 if (start
> end
|| start
< 0 || end
> 0xFFFF) {
1147 status
= U_MALFORMED_PRAGMA
;
// NOTE(review): the lines that followed the assignment above are not visible
// in this view -- presumably an early return; confirm before relying on it.
1151 curData
->variablesBase
= (UChar
) start
;
1152 if (dataVector
.size() == 0) {
1153 variableNext
= (UChar
) start
;
// The limit is exclusive (one past end); checkVariableRange() tests ch < variableLimit.
// NOTE(review): assuming UChar is 16 bits wide, end == 0xFFFF makes this cast yield 0 -- confirm intended.
1154 variableLimit
= (UChar
) (end
+ 1);
1159 * Assert that the given character is NOT within the variable range.
1160 * If it is, return FALSE. This is neccesary to ensure that the
1161 * variable range does not overlap characters used in a rule.
1163 UBool
TransliteratorParser::checkVariableRange(UChar32 ch
) const {
1164 return !(ch
>= curData
->variablesBase
&& ch
< variableLimit
);
1168 * Set the maximum backup to 'backup', in response to a pragma
// Hook for the "maximum backup" pragma. The int32_t argument is left unnamed
// (its name is commented out), so it cannot be used by the body.
// NOTE(review): the body is not visible in this view -- presumably a stub; confirm.
1171 void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) {
1176 * Begin normalizing all rules using the given mode, in response
1177 * to a pragma statement.
// Hook for the rule-normalization pragmas. The UNormalizationMode argument is
// left unnamed (its name is commented out), so it cannot be used by the body.
// NOTE(review): the body is not visible in this view -- presumably a stub; confirm.
1179 void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode
/*mode*/) {
1183 static const UChar PRAGMA_USE
[] = {0x75,0x73,0x65,0x20,0}; // "use "
1185 static const UChar PRAGMA_VARIABLE_RANGE
[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;"
1187 static const UChar PRAGMA_MAXIMUM_BACKUP
[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;"
1189 static const UChar PRAGMA_NFD_RULES
[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;"
1191 static const UChar PRAGMA_NFC_RULES
[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;"
1194 * Return true if the given rule looks like a pragma.
1195 * @param pos offset to the first non-whitespace character
1197 * @param limit pointer past the last character of the rule.
1199 UBool
TransliteratorParser::resemblesPragma(const UnicodeString
& rule
, int32_t pos
, int32_t limit
) {
1200 // Must start with /use\s/i
1201 return ICU_Utility::parsePattern(rule
, pos
, limit
, UnicodeString(TRUE
, PRAGMA_USE
, 4), NULL
) >= 0;
1205 * Parse a pragma. This method assumes resemblesPragma() has
1206 * already returned true.
1207 * @param pos offset to the first non-whitespace character
1209 * @param limit pointer past the last character of the rule.
1210 * @return the position index after the final ';' of the pragma,
1213 int32_t TransliteratorParser::parsePragma(const UnicodeString
& rule
, int32_t pos
, int32_t limit
, UErrorCode
& status
) {
1216 // resemblesPragma() has already returned true, so we
1217 // know that pos points to /use\s/i; we can skip 4 characters
1221 // Here are the pragmas we recognize:
1222 // use variable range 0xE000 0xEFFF;
1223 // use maximum backup 16;
1226 int p
= ICU_Utility::parsePattern(rule
, pos
, limit
, UnicodeString(TRUE
, PRAGMA_VARIABLE_RANGE
, -1), array
);
1228 setVariableRange(array
[0], array
[1], status
);
1232 p
= ICU_Utility::parsePattern(rule
, pos
, limit
, UnicodeString(TRUE
, PRAGMA_MAXIMUM_BACKUP
, -1), array
);
1234 pragmaMaximumBackup(array
[0]);
1238 p
= ICU_Utility::parsePattern(rule
, pos
, limit
, UnicodeString(TRUE
, PRAGMA_NFD_RULES
, -1), NULL
);
1240 pragmaNormalizeRules(UNORM_NFD
);
1244 p
= ICU_Utility::parsePattern(rule
, pos
, limit
, UnicodeString(TRUE
, PRAGMA_NFC_RULES
, -1), NULL
);
1246 pragmaNormalizeRules(UNORM_NFC
);
1250 // Syntax error: unable to parse pragma
1255 * MAIN PARSER. Parse the next rule in the given rule string, starting
1256 * at pos. Return the index after the last character parsed. Do not
1257 * parse characters at or after limit.
1259 * Important: The character at pos must be a non-whitespace character
1260 * that is not the comment character.
1262 * This method handles quoting, escaping, and whitespace removal. It
1263 * parses the end-of-rule character. It recognizes context and cursor
1264 * indicators. Once it does a lexical breakdown of the rule at pos, it
1265 * creates a rule object and adds it to our rule list.
1267 int32_t TransliteratorParser::parseRule(const UnicodeString
& rule
, int32_t pos
, int32_t limit
, UErrorCode
& status
) {
1268 // Locate the left side, operator, and right side
1269 int32_t start
= pos
;
1273 // Set up segments data
1274 segmentStandins
.truncate(0);
1275 segmentObjects
.removeAllElements();
1277 // Use pointers to automatics to make swapping possible.
1278 RuleHalf
_left(*this), _right(*this);
1279 RuleHalf
* left
= &_left
;
1280 RuleHalf
* right
= &_right
;
1282 undefinedVariableName
.remove();
1283 pos
= left
->parse(rule
, pos
, limit
, status
);
1284 if (U_FAILURE(status
)) {
1288 if (pos
== limit
|| u_strchr(gOPERATORS
, (op
= rule
.charAt(--pos
))) == NULL
) {
1289 return syntaxError(U_MISSING_OPERATOR
, rule
, start
, status
);
1293 // Found an operator char. Check for forward-reverse operator.
1294 if (op
== REVERSE_RULE_OP
&&
1295 (pos
< limit
&& rule
.charAt(pos
) == FORWARD_RULE_OP
)) {
1297 op
= FWDREV_RULE_OP
;
1300 // Translate alternate op characters.
1302 case ALT_FORWARD_RULE_OP
:
1303 op
= FORWARD_RULE_OP
;
1305 case ALT_REVERSE_RULE_OP
:
1306 op
= REVERSE_RULE_OP
;
1308 case ALT_FWDREV_RULE_OP
:
1309 op
= FWDREV_RULE_OP
;
1313 pos
= right
->parse(rule
, pos
, limit
, status
);
1314 if (U_FAILURE(status
)) {
1319 if (rule
.charAt(--pos
) == END_OF_RULE
) {
1322 // RuleHalf parser must have terminated at an operator
1323 return syntaxError(U_UNQUOTED_SPECIAL
, rule
, start
, status
);
1327 if (op
== VARIABLE_DEF_OP
) {
1328 // LHS is the name. RHS is a single character, either a literal
1329 // or a set (already parsed). If RHS is longer than one
1330 // character, it is either a multi-character string, or multiple
1331 // sets, or a mixture of chars and sets -- syntax error.
1333 // We expect to see a single undefined variable (the one being
1335 if (undefinedVariableName
.length() == 0) {
1336 // "Missing '$' or duplicate definition"
1337 return syntaxError(U_BAD_VARIABLE_DEFINITION
, rule
, start
, status
);
1339 if (left
->text
.length() != 1 || left
->text
.charAt(0) != variableLimit
) {
1341 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION
, rule
, start
, status
);
1343 if (left
->anchorStart
|| left
->anchorEnd
||
1344 right
->anchorStart
|| right
->anchorEnd
) {
1345 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION
, rule
, start
, status
);
1347 // We allow anything on the right, including an empty string.
1348 UnicodeString
* value
= new UnicodeString(right
->text
);
1349 // NULL pointer check
1350 if (value
== NULL
) {
1351 return syntaxError(U_MEMORY_ALLOCATION_ERROR
, rule
, start
, status
);
1353 variableNames
.put(undefinedVariableName
, value
, status
);
1358 // If this is not a variable definition rule, we shouldn't have
1359 // any undefined variable names.
1360 if (undefinedVariableName
.length() != 0) {
1361 return syntaxError(// "Undefined variable $" + undefinedVariableName,
1362 U_UNDEFINED_VARIABLE
,
1363 rule
, start
, status
);
1367 if (segmentStandins
.length() > segmentObjects
.size()) {
1368 syntaxError(U_UNDEFINED_SEGMENT_REFERENCE
, rule
, start
, status
);
1370 for (i
=0; i
<segmentStandins
.length(); ++i
) {
1371 if (segmentStandins
.charAt(i
) == 0) {
1372 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR
, rule
, start
, status
); // will never happen
1375 for (i
=0; i
<segmentObjects
.size(); ++i
) {
1376 if (segmentObjects
.elementAt(i
) == NULL
) {
1377 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR
, rule
, start
, status
); // will never happen
1381 // If the direction we want doesn't match the rule
1382 // direction, do nothing.
1383 if (op
!= FWDREV_RULE_OP
&&
1384 ((direction
== UTRANS_FORWARD
) != (op
== FORWARD_RULE_OP
))) {
1388 // Transform the rule into a forward rule by swapping the
1389 // sides if necessary.
1390 if (direction
== UTRANS_REVERSE
) {
1395 // Remove non-applicable elements in forward-reverse
1396 // rules. Bidirectional rules ignore elements that do not
1398 if (op
== FWDREV_RULE_OP
) {
1399 right
->removeContext();
1401 left
->cursorOffset
= 0;
1404 // Normalize context
1405 if (left
->ante
< 0) {
1408 if (left
->post
< 0) {
1409 left
->post
= left
->text
.length();
1412 // Context is only allowed on the input side. Cursors are only
1413 // allowed on the output side. Segment delimiters can only appear
1414 // on the left, and references on the right. Cursor offset
1415 // cannot appear without an explicit cursor. Cursor offset
1416 // cannot place the cursor outside the limits of the context.
1417 // Anchors are only allowed on the input side.
1418 if (right
->ante
>= 0 || right
->post
>= 0 || left
->cursor
>= 0 ||
1419 (right
->cursorOffset
!= 0 && right
->cursor
< 0) ||
1420 // - The following two checks were used to ensure that the
1421 // - the cursor offset stayed within the ante- or postcontext.
1422 // - However, with the addition of quantifiers, we have to
1423 // - allow arbitrary cursor offsets and do runtime checking.
1424 //(right->cursorOffset > (left->text.length() - left->post)) ||
1425 //(-right->cursorOffset > left->ante) ||
1426 right
->anchorStart
|| right
->anchorEnd
||
1427 !left
->isValidInput(*this) || !right
->isValidOutput(*this) ||
1428 left
->ante
> left
->post
) {
1430 return syntaxError(U_MALFORMED_RULE
, rule
, start
, status
);
1433 // Flatten segment objects vector to an array
1434 UnicodeFunctor
** segmentsArray
= NULL
;
1435 if (segmentObjects
.size() > 0) {
1436 segmentsArray
= (UnicodeFunctor
**)uprv_malloc(segmentObjects
.size() * sizeof(UnicodeFunctor
*));
1437 // Null pointer check
1438 if (segmentsArray
== NULL
) {
1439 return syntaxError(U_MEMORY_ALLOCATION_ERROR
, rule
, start
, status
);
1441 segmentObjects
.toArray((void**) segmentsArray
);
1443 TransliterationRule
* temptr
= new TransliterationRule(
1444 left
->text
, left
->ante
, left
->post
,
1445 right
->text
, right
->cursor
, right
->cursorOffset
,
1447 segmentObjects
.size(),
1448 left
->anchorStart
, left
->anchorEnd
,
1451 //Null pointer check
1452 if (temptr
== NULL
) {
1453 uprv_free(segmentsArray
);
1454 return syntaxError(U_MEMORY_ALLOCATION_ERROR
, rule
, start
, status
);
1457 curData
->ruleSet
.addRule(temptr
, status
);
1463 * Called by main parser upon syntax error. Search the rule string
1464 * for the probable end of the rule. Of course, if the error is that
1465 * the end of rule marker is missing, then the rule end will not be found.
1466 * In any case the rule start will be correctly reported.
1467 * @param msg error description
1468 * @param rule pattern string
1469 * @param start position of first character of current rule
1471 int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode
,
1472 const UnicodeString
& rule
,
1476 parseError
.offset
= pos
;
1477 parseError
.line
= 0 ; /* we are not using line numbers */
1480 const int32_t LEN
= U_PARSE_CONTEXT_LEN
- 1;
1481 int32_t start
= uprv_max(pos
- LEN
, 0);
1484 rule
.extract(start
,stop
-start
,parseError
.preContext
);
1485 //null terminate the buffer
1486 parseError
.preContext
[stop
-start
] = 0;
1490 stop
= uprv_min(pos
+ LEN
, rule
.length());
1492 rule
.extract(start
,stop
-start
,parseError
.postContext
);
1493 //null terminate the buffer
1494 parseError
.postContext
[stop
-start
]= 0;
1496 status
= (UErrorCode
)parseErrorCode
;
1502 * Parse a UnicodeSet out, store it, and return the stand-in character
1503 * used to represent it.
1505 UChar
TransliteratorParser::parseSet(const UnicodeString
& rule
,
1507 UErrorCode
& status
) {
1508 UnicodeSet
* set
= new UnicodeSet(rule
, pos
, USET_IGNORE_SPACE
, parseData
, status
);
1509 // Null pointer check
1511 status
= U_MEMORY_ALLOCATION_ERROR
;
1512 return (UChar
)0x0000; // Return empty character with error.
1515 return generateStandInFor(set
, status
);
1519 * Generate and return a stand-in for a new UnicodeFunctor. Store
1520 * the matcher (adopt it).
1522 UChar
TransliteratorParser::generateStandInFor(UnicodeFunctor
* adopted
, UErrorCode
& status
) {
1523 // assert(obj != null);
1525 // Look up previous stand-in, if any. This is a short list
1526 // (typical n is 0, 1, or 2); linear search is optimal.
1527 for (int32_t i
=0; i
<variablesVector
.size(); ++i
) {
1528 if (variablesVector
.elementAt(i
) == adopted
) { // [sic] pointer comparison
1529 return (UChar
) (curData
->variablesBase
+ i
);
1533 if (variableNext
>= variableLimit
) {
1535 status
= U_VARIABLE_RANGE_EXHAUSTED
;
1538 variablesVector
.addElement(adopted
, status
);
1539 return variableNext
++;
1543 * Return the standin for segment seg (1-based).
1545 UChar
TransliteratorParser::getSegmentStandin(int32_t seg
, UErrorCode
& status
) {
1546 // Special character used to indicate an empty spot
1547 UChar empty
= curData
->variablesBase
- 1;
1548 while (segmentStandins
.length() < seg
) {
1549 segmentStandins
.append(empty
);
1551 UChar c
= segmentStandins
.charAt(seg
-1);
1553 if (variableNext
>= variableLimit
) {
1554 status
= U_VARIABLE_RANGE_EXHAUSTED
;
1558 // Set a placeholder in the master variables vector that will be
1559 // filled in later by setSegmentObject(). We know that we will get
1560 // called first because setSegmentObject() will call us.
1561 variablesVector
.addElement((void*) NULL
, status
);
1562 segmentStandins
.setCharAt(seg
-1, c
);
1568 * Set the object for segment seg (1-based).
1570 void TransliteratorParser::setSegmentObject(int32_t seg
, StringMatcher
* adopted
, UErrorCode
& status
) {
1571 // Since we call parseSection() recursively, nested
1572 // segments will result in segment i+1 getting parsed
1573 // and stored before segment i; be careful with the
1574 // vector handling here.
1575 if (segmentObjects
.size() < seg
) {
1576 segmentObjects
.setSize(seg
, status
);
1578 int32_t index
= getSegmentStandin(seg
, status
) - curData
->variablesBase
;
1579 if (segmentObjects
.elementAt(seg
-1) != NULL
||
1580 variablesVector
.elementAt(index
) != NULL
) {
1581 // should never happen
1582 status
= U_INTERNAL_TRANSLITERATOR_ERROR
;
1585 segmentObjects
.setElementAt(adopted
, seg
-1);
1586 variablesVector
.setElementAt(adopted
, index
);
1590 * Return the stand-in for the dot set. It is allocated the first
1591 * time and reused thereafter.
1593 UChar
TransliteratorParser::getDotStandIn(UErrorCode
& status
) {
1594 if (dotStandIn
== (UChar
) -1) {
1595 UnicodeSet
* tempus
= new UnicodeSet(UnicodeString(TRUE
, DOT_SET
, -1), status
);
1596 // Null pointer check.
1597 if (tempus
== NULL
) {
1598 status
= U_MEMORY_ALLOCATION_ERROR
;
1599 return (UChar
)0x0000;
1601 dotStandIn
= generateStandInFor(tempus
, status
);
1607 * Append the value of the given variable name to the given
1610 void TransliteratorParser::appendVariableDef(const UnicodeString
& name
,
1612 UErrorCode
& status
) {
1613 const UnicodeString
* s
= (const UnicodeString
*) variableNames
.get(name
);
1615 // We allow one undefined variable so that variable definition
1616 // statements work. For the first undefined variable we return
1617 // the special placeholder variableLimit-1, and save the variable
1619 if (undefinedVariableName
.length() == 0) {
1620 undefinedVariableName
= name
;
1621 if (variableNext
>= variableLimit
) {
1622 // throw new RuntimeException("Private use variables exhausted");
1623 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1626 buf
.append((UChar
) --variableLimit
);
1628 //throw new IllegalArgumentException("Undefined variable $"
1630 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1639 * Glue method to get around access restrictions in C++.
1641 /*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
1642 return Transliterator::createBasicInstance(id, canonID);
1648 utrans_stripRules(const UChar
*source
, int32_t sourceLen
, UChar
*target
, UErrorCode
*status
) {
1651 //const UChar *sourceStart = source;
1652 const UChar
*targetStart
= target
;
1653 const UChar
*sourceLimit
= source
+sourceLen
;
1654 UChar
*targetLimit
= target
+sourceLen
;
1656 UBool quoted
= FALSE
;
1659 uprv_memset(target
, 0, sourceLen
*U_SIZEOF_UCHAR
);
1661 /* read the rules into the buffer */
1662 while (source
< sourceLimit
)
1665 U16_NEXT_UNSAFE(source
, index
, c
);
1668 quoted
= (UBool
)!quoted
;
1671 if (c
== RULE_COMMENT_CHAR
) {
1672 /* skip comments and all preceding spaces */
1673 while (targetStart
< target
&& *(target
- 1) == 0x0020) {
1679 while (c
!= CR
&& c
!= LF
);
1681 else if (c
== ESCAPE
) {
1682 UChar32 c2
= *source
;
1683 if (c2
== CR
|| c2
== LF
) {
1684 /* A backslash at the end of a line. */
1685 /* Since we're stripping lines, ignore the backslash. */
1689 if (c2
== 0x0075 && source
+5 < sourceLimit
) { /* \u seen. \U isn't unescaped. */
1690 int32_t escapeOffset
= 0;
1691 UnicodeString
escapedStr(source
, 5);
1692 c2
= escapedStr
.unescapeAt(escapeOffset
);
1694 if (c2
== (UChar32
)0xFFFFFFFF || escapeOffset
== 0)
1696 *status
= U_PARSE_ERROR
;
1699 if (!PatternProps::isWhiteSpace(c2
) && !u_iscntrl(c2
) && !u_ispunct(c2
)) {
1700 /* It was escaped for a reason. Write what it was suppose to be. */
1705 else if (c2
== QUOTE
) {
1706 /* \' seen. Make sure we don't do anything when we see it again. */
1707 quoted
= (UBool
)!quoted
;
1711 if (c
== CR
|| c
== LF
)
1713 /* ignore spaces carriage returns, and all leading spaces on the next line.
1714 * and line feed unless in the form \uXXXX
1717 while (source
< sourceLimit
) {
1719 if (c
!= CR
&& c
!= LF
&& c
!= 0x0020) {
1727 /* Append UChar * after dissembling if c > 0xffff*/
1729 U16_APPEND_UNSAFE(target
, index
, c
);
1732 if (target
< targetLimit
) {
1735 return (int32_t)(target
-targetStart
);
1738 #endif /* #if !UCONFIG_NO_TRANSLITERATION */