2  ********************************************************************** 
   3  *   Copyright (C) 1999-2011, International Business Machines 
   4  *   Corporation and others.  All Rights Reserved. 
   5  ********************************************************************** 
   6  *   Date        Name        Description 
   7  *   11/17/99    aliu        Creation. 
   8  ********************************************************************** 
  11 #include "unicode/utypes.h" 
  13 #if !UCONFIG_NO_TRANSLITERATION 
  15 #include "unicode/unistr.h" 
  16 #include "unicode/uniset.h" 
  17 #include "unicode/utf16.h" 
  24 static void U_CALLCONV 
_deleteRule(void *rule
) { 
  25     delete (icu::TransliterationRule 
*)rule
; 
  29 //---------------------------------------------------------------------- 
  30 // BEGIN Debugging support 
  31 //---------------------------------------------------------------------- 
  40  * @param appendTo result is appended to this param. 
  41  * @param input the string being transliterated 
  42  * @param pos the index struct 
  44 static UnicodeString
& _formatInput(UnicodeString 
&appendTo
, 
  45                                    const UnicodeString
& input
, 
  46                                    const UTransPosition
& pos
) { 
  47     // Output a string of the form aaa{bbb|ccc|ddd}eee, where 
  48     // the {} indicate the context start and limit, and the || 
  49     // indicate the start and limit. 
  50     if (0 <= pos
.contextStart 
&& 
  51         pos
.contextStart 
<= pos
.start 
&& 
  52         pos
.start 
<= pos
.limit 
&& 
  53         pos
.limit 
<= pos
.contextLimit 
&& 
  54         pos
.contextLimit 
<= input
.length()) { 
  56         UnicodeString a
, b
, c
, d
, e
; 
  57         input
.extractBetween(0, pos
.contextStart
, a
); 
  58         input
.extractBetween(pos
.contextStart
, pos
.start
, b
); 
  59         input
.extractBetween(pos
.start
, pos
.limit
, c
); 
  60         input
.extractBetween(pos
.limit
, pos
.contextLimit
, d
); 
  61         input
.extractBetween(pos
.contextLimit
, input
.length(), e
); 
  62         appendTo
.append(a
).append((UChar
)123/*{*/).append(b
). 
  63             append((UChar
)124/*|*/).append(c
).append((UChar
)124/*|*/).append(d
). 
  64             append((UChar
)125/*}*/).append(e
); 
  66         appendTo
.append("INVALID UTransPosition"); 
  67         //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" + 
  68         //                pos.contextStart + ", s=" + pos.start + ", l=" + 
  69         //                pos.limit + ", cl=" + pos.contextLimit + "} on " + 
  75 // Append a hex string to the target 
  76 UnicodeString
& _appendHex(uint32_t number
, 
  78                           UnicodeString
& target
) { 
  79     static const UChar digitString
[] = { 
  80         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 
  81         0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0 
  84         target 
+= digitString
[(number 
>> (digits
*4)) & 0xF]; 
  89 // Replace nonprintable characters with unicode escapes 
  90 UnicodeString
& _escape(const UnicodeString 
&source
, 
  91                        UnicodeString 
&target
) { 
  92     for (int32_t i 
= 0; i 
< source
.length(); ) { 
  93         UChar32 ch 
= source
.char32At(i
); 
  95         if (ch 
< 0x09 || (ch 
> 0x0A && ch 
< 0x20)|| ch 
> 0x7E) { 
  98                 _appendHex(ch
, 4, target
); 
 101                 _appendHex(ch
, 8, target
); 
 110 inline void _debugOut(const char* msg
, TransliterationRule
* rule
, 
 111                       const Replaceable
& theText
, UTransPosition
& pos
) { 
 112     UnicodeString 
buf(msg
, ""); 
 115         rule
->toRule(r
, TRUE
); 
 116         buf
.append((UChar
)32).append(r
); 
 118     buf
.append(UnicodeString(" => ", "")); 
 119     UnicodeString
* text 
= (UnicodeString
*)&theText
; 
 120     _formatInput(buf
, *text
, pos
); 
 123     CharString 
cbuf(esc
); 
 124     printf("%s\n", (const char*) cbuf
); 
 128 #define _debugOut(msg, rule, theText, pos) 
 131 //---------------------------------------------------------------------- 
 132 // END Debugging support 
 133 //---------------------------------------------------------------------- 
 135 // Fill the precontext and postcontext with the patterns of the rules 
 136 // that are masking one another. 
 137 static void maskingError(const icu::TransliterationRule
& rule1
, 
 138                          const icu::TransliterationRule
& rule2
, 
 139                          UParseError
& parseError
) { 
 140     icu::UnicodeString r
; 
 143     parseError
.line 
= parseError
.offset 
= -1; 
 146     rule1
.toRule(r
, FALSE
); 
 147     len 
= uprv_min(r
.length(), U_PARSE_CONTEXT_LEN
-1); 
 148     r
.extract(0, len
, parseError
.preContext
); 
 149     parseError
.preContext
[len
] = 0;    
 153     rule2
.toRule(r
, FALSE
); 
 154     len 
= uprv_min(r
.length(), U_PARSE_CONTEXT_LEN
-1); 
 155     r
.extract(0, len
, parseError
.postContext
); 
 156     parseError
.postContext
[len
] = 0;    
 162  * Construct a new empty rule set. 
 164 TransliterationRuleSet::TransliterationRuleSet(UErrorCode
& status
) : UMemory() { 
 165     ruleVector 
= new UVector(&_deleteRule
, NULL
, status
); 
 166     if (U_FAILURE(status
)) { 
 169     if (ruleVector 
== NULL
) { 
 170         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 173     maxContextLength 
= 0; 
 179 TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet
& other
) : 
 183     maxContextLength(other
.maxContextLength
) { 
 186     uprv_memcpy(index
, other
.index
, sizeof(index
)); 
 187     UErrorCode status 
= U_ZERO_ERROR
; 
 188     ruleVector 
= new UVector(&_deleteRule
, NULL
, status
); 
 189     if (other
.ruleVector 
!= 0 && ruleVector 
!= 0 && U_SUCCESS(status
)) { 
 190         len 
= other
.ruleVector
->size(); 
 191         for (i
=0; i
<len 
&& U_SUCCESS(status
); ++i
) { 
 192             TransliterationRule 
*tempTranslitRule 
= new TransliterationRule(*(TransliterationRule
*)other
.ruleVector
->elementAt(i
)); 
 194             if (tempTranslitRule 
== NULL
) { 
 195                 status 
= U_MEMORY_ALLOCATION_ERROR
; 
 198             ruleVector
->addElement(tempTranslitRule
, status
); 
 199             if (U_FAILURE(status
)) { 
 204     if (other
.rules 
!= 0 && U_SUCCESS(status
)) { 
 213 TransliterationRuleSet::~TransliterationRuleSet() { 
 214     delete ruleVector
; // This deletes the contained rules 
 218 void TransliterationRuleSet::setData(const TransliterationRuleData
* d
) { 
 220      * We assume that the ruleset has already been frozen. 
 222     int32_t len 
= index
[256]; // see freeze() 
 223     for (int32_t i
=0; i
<len
; ++i
) { 
 224         rules
[i
]->setData(d
); 
 229  * Return the maximum context length. 
 230  * @return the length of the longest preceding context. 
 232 int32_t TransliterationRuleSet::getMaximumContextLength(void) const { 
 233     return maxContextLength
; 
 237  * Add a rule to this set.  Rules are added in order, and order is 
 238  * significant.  The last call to this method must be followed by 
 239  * a call to <code>freeze()</code> before the rule set is used. 
 241  * <p>If freeze() has already been called, calling addRule() 
 242  * unfreezes the rules, and freeze() must be called again. 
 244  * @param adoptedRule the rule to add 
 246 void TransliterationRuleSet::addRule(TransliterationRule
* adoptedRule
, 
 247                                      UErrorCode
& status
) { 
 248     if (U_FAILURE(status
)) { 
 252     ruleVector
->addElement(adoptedRule
, status
); 
 255     if ((len 
= adoptedRule
->getContextLength()) > maxContextLength
) { 
 256         maxContextLength 
= len
; 
 264  * Check this for masked rules and index it to optimize performance. 
 265  * The sequence of operations is: (1) add rules to a set using 
 266  * <code>addRule()</code>; (2) freeze the set using 
 267  * <code>freeze()</code>; (3) use the rule set.  If 
 268  * <code>addRule()</code> is called after calling this method, it 
 269  * invalidates this object, and this method must be called again. 
 270  * That is, <code>freeze()</code> may be called multiple times, 
 271  * although for optimal performance it shouldn't be. 
 273 void TransliterationRuleSet::freeze(UParseError
& parseError
,UErrorCode
& status
) { 
 274     /* Construct the rule array and index table.  We reorder the 
 275      * rules by sorting them into 256 bins.  Each bin contains all 
 276      * rules matching the index value for that bin.  A rule 
 277      * matches an index value if string whose first key character 
 278      * has a low byte equal to the index value can match the rule. 
 280      * Each bin contains zero or more rules, in the same order 
 281      * they were found originally.  However, the total rules in 
 282      * the bins may exceed the number in the original vector, 
 283      * since rules that have a variable as their first key 
 284      * character will generally fall into more than one bin. 
 286      * That is, each bin contains all rules that either have that 
 287      * first index value as their first key character, or have 
 288      * a set containing the index value as their first character. 
 290     int32_t n 
= ruleVector
->size(); 
 293     UVector 
v(2*n
, status
); // heuristic; adjust as needed 
 295     if (U_FAILURE(status
)) { 
 299     /* Precompute the index values.  This saves a LOT of time. 
 300      * Be careful not to call malloc(0). 
 302     int16_t* indexValue 
= (int16_t*) uprv_malloc( sizeof(int16_t) * (n 
> 0 ? n 
: 1) ); 
 304     if (indexValue 
== 0) { 
 305         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 308     for (j
=0; j
<n
; ++j
) { 
 309         TransliterationRule
* r 
= (TransliterationRule
*) ruleVector
->elementAt(j
); 
 310         indexValue
[j
] = r
->getIndexValue(); 
 312     for (x
=0; x
<256; ++x
) { 
 314         for (j
=0; j
<n
; ++j
) { 
 315             if (indexValue
[j
] >= 0) { 
 316                 if (indexValue
[j
] == x
) { 
 317                     v
.addElement(ruleVector
->elementAt(j
), status
); 
 320                 // If the indexValue is < 0, then the first key character is 
 321                 // a set, and we must use the more time-consuming 
 322                 // matchesIndexValue check.  In practice this happens 
 323                 // rarely, so we seldom tread this code path. 
 324                 TransliterationRule
* r 
= (TransliterationRule
*) ruleVector
->elementAt(j
); 
 325                 if (r
->matchesIndexValue((uint8_t)x
)) { 
 326                     v
.addElement(r
, status
); 
 331     uprv_free(indexValue
); 
 332     index
[256] = v
.size(); 
 334     /* Freeze things into an array. 
 336     uprv_free(rules
); // Contains alias pointers 
 338     /* You can't do malloc(0)! */ 
 343     rules 
= (TransliterationRule 
**)uprv_malloc(v
.size() * sizeof(TransliterationRule 
*)); 
 346         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 349     for (j
=0; j
<v
.size(); ++j
) { 
 350         rules
[j
] = (TransliterationRule
*) v
.elementAt(j
); 
 353     // TODO Add error reporting that indicates the rules that 
 355     //UnicodeString errors; 
 357     /* Check for masking.  This is MUCH faster than our old check, 
 358      * which was each rule against each following rule, since we 
 359      * only have to check for masking within each bin now.  It's 
 360      * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule 
 361      * count, and n2 is the per-bin rule count.  But n2<<n1, so 
 364     for (x
=0; x
<256; ++x
) { 
 365         for (j
=index
[x
]; j
<index
[x
+1]-1; ++j
) { 
 366             TransliterationRule
* r1 
= rules
[j
]; 
 367             for (int32_t k
=j
+1; k
<index
[x
+1]; ++k
) { 
 368                 TransliterationRule
* r2 
= rules
[k
]; 
 369                 if (r1
->masks(*r2
)) { 
 370 //|                 if (errors == null) { 
 371 //|                     errors = new StringBuffer(); 
 373 //|                     errors.append("\n"); 
 375 //|                 errors.append("Rule " + r1 + " masks " + r2); 
 376                     status 
= U_RULE_MASK_ERROR
; 
 377                     maskingError(*r1
, *r2
, parseError
); 
 384     //if (errors != null) { 
 385     //    throw new IllegalArgumentException(errors.toString()); 
 390  * Transliterate the given text with the given UTransPosition 
 391  * indices.  Return TRUE if the transliteration should continue 
 392  * or FALSE if it should halt (because of a U_PARTIAL_MATCH match). 
 393  * Note that FALSE is only ever returned if isIncremental is TRUE. 
 394  * @param text the text to be transliterated 
 395  * @param pos the position indices, which will be updated 
 396  * @param incremental if TRUE, assume new text may be inserted 
 397  * at index.limit, and return FALSE if thre is a partial match. 
 398  * @return TRUE unless a U_PARTIAL_MATCH has been obtained, 
 399  * indicating that transliteration should stop until more text 
 402 UBool 
TransliterationRuleSet::transliterate(Replaceable
& text
, 
 405     int16_t indexByte 
= (int16_t) (text
.char32At(pos
.start
) & 0xFF); 
 406     for (int32_t i
=index
[indexByte
]; i
<index
[indexByte
+1]; ++i
) { 
 407         UMatchDegree m 
= rules
[i
]->matchAndReplace(text
, pos
, incremental
); 
 410             _debugOut("match", rules
[i
], text
, pos
); 
 412         case U_PARTIAL_MATCH
: 
 413             _debugOut("partial match", rules
[i
], text
, pos
); 
 415         default: /* Ram: added default to make GCC happy */ 
 419     // No match or partial match from any rule 
 420     pos
.start 
+= U16_LENGTH(text
.char32At(pos
.start
)); 
 421     _debugOut("no match", NULL
, text
, pos
); 
 426  * Create rule strings that represents this rule set. 
 428 UnicodeString
& TransliterationRuleSet::toRules(UnicodeString
& ruleSource
, 
 429                                                UBool escapeUnprintable
) const { 
 431     int32_t count 
= ruleVector
->size(); 
 432     ruleSource
.truncate(0); 
 433     for (i
=0; i
<count
; ++i
) { 
 435             ruleSource
.append((UChar
) 0x000A /*\n*/); 
 437         TransliterationRule 
*r 
= 
 438             (TransliterationRule
*) ruleVector
->elementAt(i
); 
 439         r
->toRule(ruleSource
, escapeUnprintable
); 
 445  * Return the set of all characters that may be modified 
 446  * (getTarget=false) or emitted (getTarget=true) by this set. 
 448 UnicodeSet
& TransliterationRuleSet::getSourceTargetSet(UnicodeSet
& result
, 
 449                                UBool getTarget
) const 
 452     int32_t count 
= ruleVector
->size(); 
 453     for (int32_t i
=0; i
<count
; ++i
) { 
 454         TransliterationRule
* r 
= 
 455             (TransliterationRule
*) ruleVector
->elementAt(i
); 
 457             r
->addTargetSetTo(result
); 
 459             r
->addSourceSetTo(result
); 
 467 #endif /* #if !UCONFIG_NO_TRANSLITERATION */