icuSources/i18n/unesctrn.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (c) 2001-2011, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  *   Date        Name        Description
   7  *   11/19/2001  aliu        Creation.
   8  **********************************************************************
   9  */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/uchar.h"
  16 #include "unicode/utf16.h"
  17 #include "unesctrn.h"
  18 #include "util.h"
  19
  20 #include "cmemory.h"
  21
  22 U_NAMESPACE_BEGIN
  23
  24 /**
  25  * Special character marking the end of the spec[] array.
  26  */
  27 static const UChar END = 0xFFFF;
  28
  29 // Unicode: "U+10FFFF" hex, min=4, max=6
  30 static const UChar SPEC_Unicode[] = {
  31     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
  32     END
  33 };
  34
  35 // Java: "\\uFFFF" hex, min=4, max=4
  36 static const UChar SPEC_Java[] = {
  37     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  38     END
  39 };
  40
  41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
  42 static const UChar SPEC_C[] = {
  43     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  44     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
  45     END
  46 };
  47
  48 // XML: "&#x10FFFF;" hex, min=1, max=6
  49 static const UChar SPEC_XML[] = {
  50     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
  51     END
  52 };
  53
  54 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
  55 static const UChar SPEC_XML10[] = {
  56     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
  57     END
  58 };
  59
  60 // Perl: "\\x{263A}" hex, min=1, max=6
  61 static const UChar SPEC_Perl[] = {
  62     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
  63     END
  64 };
  65
  66 // All: Java, C, Perl, XML, XML10, Unicode
  67 static const UChar SPEC_Any[] = {
  68     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
  69     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
  70     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
  71     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
  72     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
  73     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
  74     END
  75 };
  76
  77 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
  78
  79 static UChar* copySpec(const UChar* spec) {
  80     int32_t len = 0;
  81     while (spec[len] != END) {
  82         ++len;
  83     }
  84     ++len;
  85     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
  86     // Check for memory allocation error.
  87     if (result != NULL) {
  88         uprv_memcpy(result, spec, len*sizeof(result[0]));
  89     }
  90     return result;
  91 }
  92
  93 /**
  94  * Factory methods.  Ignore the context.
  95  */
  96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
  97     return new UnescapeTransliterator(ID, SPEC_Unicode);
  98 }
  99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
 100     return new UnescapeTransliterator(ID, SPEC_Java);
 101 }
 102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
 103     return new UnescapeTransliterator(ID, SPEC_C);
 104 }
 105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
 106     return new UnescapeTransliterator(ID, SPEC_XML);
 107 }
 108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
 109     return new UnescapeTransliterator(ID, SPEC_XML10);
 110 }
 111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
 112     return new UnescapeTransliterator(ID, SPEC_Perl);
 113 }
 114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
 115     return new UnescapeTransliterator(ID, SPEC_Any);
 116 }
 117
 118 /**
 119  * Registers standard variants with the system.  Called by
 120  * Transliterator during initialization.
 121  */
 122 void UnescapeTransliterator::registerIDs() {
 123     Token t = integerToken(0);
 124
 125     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
 126
 127     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
 128
 129     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
 130
 131     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
 132
 133     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
 134
 135     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
 136
 137     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
 138 }
 139
 140 /**
 141  * Constructor.  Takes the encoded spec array.
 142  */
 143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
 144                                                const UChar *newSpec) :
 145     Transliterator(newID, NULL)
 146 {
 147     this->spec = copySpec(newSpec);
 148 }
 149
 150 /**
 151  * Copy constructor.
 152  */
 153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
 154     Transliterator(o) {
 155     this->spec = copySpec(o.spec);
 156 }
 157
 158 UnescapeTransliterator::~UnescapeTransliterator() {
 159     uprv_free(spec);
 160 }
 161
 162 /**
 163  * Transliterator API.
 164  */
 165 Transliterator* UnescapeTransliterator::clone() const {
 166     return new UnescapeTransliterator(*this);
 167 }
 168
 169 /**
 170  * Implements {@link Transliterator#handleTransliterate}.
 171  */
 172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
 173                                                  UBool isIncremental) const {
 174     int32_t start = pos.start;
 175     int32_t limit = pos.limit;
 176     int32_t i, j, ipat;
 177
 178     while (start < limit) {
 179         // Loop over the forms in spec[].  Exit this loop when we
 180         // match one of the specs.  Exit the outer loop if a
 181         // partial match is detected and isIncremental is true.
 182         for (j=0, ipat=0; spec[ipat] != END; ++j) {
 183
 184             // Read the header
 185             int32_t prefixLen = spec[ipat++];
 186             int32_t suffixLen = spec[ipat++];
 187             int8_t  radix     = (int8_t) spec[ipat++];
 188             int32_t minDigits = spec[ipat++];
 189             int32_t maxDigits = spec[ipat++];
 190
 191             // s is a copy of start that is advanced over the
 192             // characters as we parse them.
 193             int32_t s = start;
 194             UBool match = TRUE;
 195
 196             for (i=0; i<prefixLen; ++i) {
 197                 if (s >= limit) {
 198                     if (i > 0) {
 199                         // We've already matched a character.  This is
 200                         // a partial match, so we return if in
 201                         // incremental mode.  In non-incremental mode,
 202                         // go to the next spec.
 203                         if (isIncremental) {
 204                             goto exit;
 205                         }
 206                         match = FALSE;
 207                         break;
 208                     }
 209                 }
 210                 UChar c = text.charAt(s++);
 211                 if (c != spec[ipat + i]) {
 212                     match = FALSE;
 213                     break;
 214                 }
 215             }
 216
 217             if (match) {
 218                 UChar32 u = 0;
 219                 int32_t digitCount = 0;
 220                 for (;;) {
 221                     if (s >= limit) {
 222                         // Check for partial match in incremental mode.
 223                         if (s > start && isIncremental) {
 224                             goto exit;
 225                         }
 226                         break;
 227                     }
 228                     UChar32 ch = text.char32At(s);
 229                     int32_t digit = u_digit(ch, radix);
 230                     if (digit < 0) {
 231                         break;
 232                     }
 233                     s += U16_LENGTH(ch);
 234                     u = (u * radix) + digit;
 235                     if (++digitCount == maxDigits) {
 236                         break;
 237                     }
 238                 }
 239
 240                 match = (digitCount >= minDigits);
 241
 242                 if (match) {
 243                     for (i=0; i<suffixLen; ++i) {
 244                         if (s >= limit) {
 245                             // Check for partial match in incremental mode.
 246                             if (s > start && isIncremental) {
 247                                 goto exit;
 248                             }
 249                             match = FALSE;
 250                             break;
 251                         }
 252                         UChar c = text.charAt(s++);
 253                         if (c != spec[ipat + prefixLen + i]) {
 254                             match = FALSE;
 255                             break;
 256                         }
 257                     }
 258
 259                     if (match) {
 260                         // At this point, we have a match
 261                         UnicodeString str(u);
 262                         text.handleReplaceBetween(start, s, str);
 263                         limit -= s - start - str.length();
 264                         // The following break statement leaves the
 265                         // loop that is traversing the forms in
 266                         // spec[].  We then parse the next input
 267                         // character.
 268                         break;
 269                     }
 270                 }
 271             }
 272
 273             ipat += prefixLen + suffixLen;
 274         }
 275
 276         if (start < limit) {
 277             start += U16_LENGTH(text.char32At(start));
 278         }
 279     }
 280
 281   exit:
 282     pos.contextLimit += limit - pos.limit;
 283     pos.limit = limit;
 284     pos.start = start;
 285 }
 286
 287 U_NAMESPACE_END
 288
 289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 290
 291 //eof