icuSources/i18n/unesctrn.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (c) 2001-2008, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  *   Date        Name        Description
   7  *   11/19/2001  aliu        Creation.
   8  **********************************************************************
   9  */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/uchar.h"
  16 #include "unesctrn.h"
  17 #include "util.h"
  18
  19 #include "cmemory.h"
  20
  21 U_NAMESPACE_BEGIN
  22
  23 /**
  24  * Special character marking the end of the spec[] array.
  25  */
  26 static const UChar END = 0xFFFF;
  27
  28 // Unicode: "U+10FFFF" hex, min=4, max=6
  29 static const UChar SPEC_Unicode[] = {
  30     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
  31     END
  32 };
  33
  34 // Java: "\\uFFFF" hex, min=4, max=4
  35 static const UChar SPEC_Java[] = {
  36     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  37     END
  38 };
  39
  40 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
  41 static const UChar SPEC_C[] = {
  42     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  43     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
  44     END
  45 };
  46
  47 // XML: "&#x10FFFF;" hex, min=1, max=6
  48 static const UChar SPEC_XML[] = {
  49     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
  50     END
  51 };
  52
  53 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
  54 static const UChar SPEC_XML10[] = {
  55     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
  56     END
  57 };
  58
  59 // Perl: "\\x{263A}" hex, min=1, max=6
  60 static const UChar SPEC_Perl[] = {
  61     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
  62     END
  63 };
  64
  65 // All: Java, C, Perl, XML, XML10, Unicode
  66 static const UChar SPEC_Any[] = {
  67     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
  68     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
  69     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
  70     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
  71     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
  72     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
  73     END
  74 };
  75
  76 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
  77
  78 static UChar* copySpec(const UChar* spec) {
  79     int32_t len = 0;
  80     while (spec[len] != END) {
  81         ++len;
  82     }
  83     ++len;
  84     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
  85     // Check for memory allocation error.
  86     if (result != NULL) {
  87         uprv_memcpy(result, spec, len*sizeof(result[0]));
  88     }
  89     return result;
  90 }
  91
  92 /**
  93  * Factory methods.  Ignore the context.
  94  */
  95 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
  96     return new UnescapeTransliterator(ID, SPEC_Unicode);
  97 }
  98 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
  99     return new UnescapeTransliterator(ID, SPEC_Java);
 100 }
 101 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
 102     return new UnescapeTransliterator(ID, SPEC_C);
 103 }
 104 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
 105     return new UnescapeTransliterator(ID, SPEC_XML);
 106 }
 107 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
 108     return new UnescapeTransliterator(ID, SPEC_XML10);
 109 }
 110 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
 111     return new UnescapeTransliterator(ID, SPEC_Perl);
 112 }
 113 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
 114     return new UnescapeTransliterator(ID, SPEC_Any);
 115 }
 116
 117 /**
 118  * Registers standard variants with the system.  Called by
 119  * Transliterator during initialization.
 120  */
 121 void UnescapeTransliterator::registerIDs() {
 122     Token t = integerToken(0);
 123
 124     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
 125
 126     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
 127
 128     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
 129
 130     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
 131
 132     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
 133
 134     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
 135
 136     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
 137 }
 138
 139 /**
 140  * Constructor.  Takes the encoded spec array.
 141  */
 142 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
 143                                                const UChar *newSpec) :
 144     Transliterator(newID, NULL)
 145 {
 146     this->spec = copySpec(newSpec);
 147 }
 148
 149 /**
 150  * Copy constructor.
 151  */
 152 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
 153     Transliterator(o) {
 154     this->spec = copySpec(o.spec);
 155 }
 156
 157 UnescapeTransliterator::~UnescapeTransliterator() {
 158     uprv_free(spec);
 159 }
 160
 161 /**
 162  * Transliterator API.
 163  */
 164 Transliterator* UnescapeTransliterator::clone() const {
 165     return new UnescapeTransliterator(*this);
 166 }
 167
 168 /**
 169  * Implements {@link Transliterator#handleTransliterate}.
 170  */
 171 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
 172                                                  UBool isIncremental) const {
 173     int32_t start = pos.start;
 174     int32_t limit = pos.limit;
 175     int32_t i, j, ipat;
 176
 177     while (start < limit) {
 178         // Loop over the forms in spec[].  Exit this loop when we
 179         // match one of the specs.  Exit the outer loop if a
 180         // partial match is detected and isIncremental is true.
 181         for (j=0, ipat=0; spec[ipat] != END; ++j) {
 182
 183             // Read the header
 184             int32_t prefixLen = spec[ipat++];
 185             int32_t suffixLen = spec[ipat++];
 186             int8_t  radix     = (int8_t) spec[ipat++];
 187             int32_t minDigits = spec[ipat++];
 188             int32_t maxDigits = spec[ipat++];
 189
 190             // s is a copy of start that is advanced over the
 191             // characters as we parse them.
 192             int32_t s = start;
 193             UBool match = TRUE;
 194
 195             for (i=0; i<prefixLen; ++i) {
 196                 if (s >= limit) {
 197                     if (i > 0) {
 198                         // We've already matched a character.  This is
 199                         // a partial match, so we return if in
 200                         // incremental mode.  In non-incremental mode,
 201                         // go to the next spec.
 202                         if (isIncremental) {
 203                             goto exit;
 204                         }
 205                         match = FALSE;
 206                         break;
 207                     }
 208                 }
 209                 UChar c = text.charAt(s++);
 210                 if (c != spec[ipat + i]) {
 211                     match = FALSE;
 212                     break;
 213                 }
 214             }
 215
 216             if (match) {
 217                 UChar32 u = 0;
 218                 int32_t digitCount = 0;
 219                 for (;;) {
 220                     if (s >= limit) {
 221                         // Check for partial match in incremental mode.
 222                         if (s > start && isIncremental) {
 223                             goto exit;
 224                         }
 225                         break;
 226                     }
 227                     UChar32 ch = text.char32At(s);
 228                     int32_t digit = u_digit(ch, radix);
 229                     if (digit < 0) {
 230                         break;
 231                     }
 232                     s += UTF_CHAR_LENGTH(ch);
 233                     u = (u * radix) + digit;
 234                     if (++digitCount == maxDigits) {
 235                         break;
 236                     }
 237                 }
 238
 239                 match = (digitCount >= minDigits);
 240
 241                 if (match) {
 242                     for (i=0; i<suffixLen; ++i) {
 243                         if (s >= limit) {
 244                             // Check for partial match in incremental mode.
 245                             if (s > start && isIncremental) {
 246                                 goto exit;
 247                             }
 248                             match = FALSE;
 249                             break;
 250                         }
 251                         UChar c = text.charAt(s++);
 252                         if (c != spec[ipat + prefixLen + i]) {
 253                             match = FALSE;
 254                             break;
 255                         }
 256                     }
 257
 258                     if (match) {
 259                         // At this point, we have a match
 260                         UnicodeString str(u);
 261                         text.handleReplaceBetween(start, s, str);
 262                         limit -= s - start - str.length();
 263                         // The following break statement leaves the
 264                         // loop that is traversing the forms in
 265                         // spec[].  We then parse the next input
 266                         // character.
 267                         break;
 268                     }
 269                 }
 270             }
 271
 272             ipat += prefixLen + suffixLen;
 273         }
 274
 275         if (start < limit) {
 276             start += UTF_CHAR_LENGTH(text.char32At(start));
 277         }
 278     }
 279
 280   exit:
 281     pos.contextLimit += limit - pos.limit;
 282     pos.limit = limit;
 283     pos.start = start;
 284 }
 285
 286 U_NAMESPACE_END
 287
 288 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 289
 290 //eof