icuSources/i18n/unesctrn.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (c) 2001-2004, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  *   Date        Name        Description
   7  *   11/19/2001  aliu        Creation.
   8  **********************************************************************
   9  */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/uchar.h"
  16 #include "unesctrn.h"
  17 #include "util.h"
  18
  19 #include "cmemory.h"
  20
  21 U_NAMESPACE_BEGIN
  22
  23 /**
  24  * Special character marking the end of the spec[] array.
  25  */
  26 static const UChar END = 0xFFFF;
  27
  28 // Unicode: "U+10FFFF" hex, min=4, max=6
  29 static const UChar SPEC_Unicode[] = {
  30     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
  31     END
  32 };
  33
  34 // Java: "\\uFFFF" hex, min=4, max=4
  35 static const UChar SPEC_Java[] = {
  36     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  37     END
  38 };
  39
  40 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
  41 static const UChar SPEC_C[] = {
  42     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  43     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
  44     END
  45 };
  46
  47 // XML: "&#x10FFFF;" hex, min=1, max=6
  48 static const UChar SPEC_XML[] = {
  49     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
  50     END
  51 };
  52
  53 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
  54 static const UChar SPEC_XML10[] = {
  55     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
  56     END
  57 };
  58
  59 // Perl: "\\x{263A}" hex, min=1, max=6
  60 static const UChar SPEC_Perl[] = {
  61     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
  62     END
  63 };
  64
  65 // All: Java, C, Perl, XML, XML10, Unicode
  66 static const UChar SPEC_Any[] = {
  67     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
  68     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
  69     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
  70     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
  71     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
  72     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
  73     END
  74 };
  75
  76 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
  77
  78 static UChar* copySpec(const UChar* spec) {
  79     int32_t len = 0;
  80     while (spec[len] != END) {
  81         ++len;
  82     }
  83     ++len;
  84     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
  85     uprv_memcpy(result, spec, len*sizeof(result[0]));
  86     return result;
  87 }
  88
  89 /**
  90  * Factory methods.  Ignore the context.
  91  */
  92 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
  93     return new UnescapeTransliterator(ID, SPEC_Unicode);
  94 }
  95 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
  96     return new UnescapeTransliterator(ID, SPEC_Java);
  97 }
  98 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
  99     return new UnescapeTransliterator(ID, SPEC_C);
 100 }
 101 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
 102     return new UnescapeTransliterator(ID, SPEC_XML);
 103 }
 104 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
 105     return new UnescapeTransliterator(ID, SPEC_XML10);
 106 }
 107 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
 108     return new UnescapeTransliterator(ID, SPEC_Perl);
 109 }
 110 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
 111     return new UnescapeTransliterator(ID, SPEC_Any);
 112 }
 113
 114 /**
 115  * Registers standard variants with the system.  Called by
 116  * Transliterator during initialization.
 117  */
 118 void UnescapeTransliterator::registerIDs() {
 119     Token t = integerToken(0);
 120
 121     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
 122
 123     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
 124
 125     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
 126
 127     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
 128
 129     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
 130
 131     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
 132
 133     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
 134 }
 135
 136 /**
 137  * Constructor.  Takes the encoded spec array.
 138  */
 139 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
 140                                                const UChar *newSpec) :
 141     Transliterator(newID, NULL)
 142 {
 143     this->spec = copySpec(newSpec);
 144 }
 145
 146 /**
 147  * Copy constructor.
 148  */
 149 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
 150     Transliterator(o) {
 151     this->spec = copySpec(o.spec);
 152 }
 153
 154 UnescapeTransliterator::~UnescapeTransliterator() {
 155     uprv_free(spec);
 156 }
 157
 158 /**
 159  * Transliterator API.
 160  */
 161 Transliterator* UnescapeTransliterator::clone() const {
 162     return new UnescapeTransliterator(*this);
 163 }
 164
 165 /**
 166  * Implements {@link Transliterator#handleTransliterate}.
 167  */
 168 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
 169                                                  UBool isIncremental) const {
 170     int32_t start = pos.start;
 171     int32_t limit = pos.limit;
 172     int32_t i, j, ipat;
 173
 174     while (start < limit) {
 175         // Loop over the forms in spec[].  Exit this loop when we
 176         // match one of the specs.  Exit the outer loop if a
 177         // partial match is detected and isIncremental is true.
 178         for (j=0, ipat=0; spec[ipat] != END; ++j) {
 179
 180             // Read the header
 181             int32_t prefixLen = spec[ipat++];
 182             int32_t suffixLen = spec[ipat++];
 183             int8_t  radix     = (int8_t) spec[ipat++];
 184             int32_t minDigits = spec[ipat++];
 185             int32_t maxDigits = spec[ipat++];
 186
 187             // s is a copy of start that is advanced over the
 188             // characters as we parse them.
 189             int32_t s = start;
 190             UBool match = TRUE;
 191
 192             for (i=0; i<prefixLen; ++i) {
 193                 if (s >= limit) {
 194                     if (i > 0) {
 195                         // We've already matched a character.  This is
 196                         // a partial match, so we return if in
 197                         // incremental mode.  In non-incremental mode,
 198                         // go to the next spec.
 199                         if (isIncremental) {
 200                             goto exit;
 201                         }
 202                         match = FALSE;
 203                         break;
 204                     }
 205                 }
 206                 UChar c = text.charAt(s++);
 207                 if (c != spec[ipat + i]) {
 208                     match = FALSE;
 209                     break;
 210                 }
 211             }
 212
 213             if (match) {
 214                 UChar32 u = 0;
 215                 int32_t digitCount = 0;
 216                 for (;;) {
 217                     if (s >= limit) {
 218                         // Check for partial match in incremental mode.
 219                         if (s > start && isIncremental) {
 220                             goto exit;
 221                         }
 222                         break;
 223                     }
 224                     UChar32 ch = text.char32At(s);
 225                     int32_t digit = u_digit(ch, radix);
 226                     if (digit < 0) {
 227                         break;
 228                     }
 229                     s += UTF_CHAR_LENGTH(ch);
 230                     u = (u * radix) + digit;
 231                     if (++digitCount == maxDigits) {
 232                         break;
 233                     }
 234                 }
 235
 236                 match = (digitCount >= minDigits);
 237
 238                 if (match) {
 239                     for (i=0; i<suffixLen; ++i) {
 240                         if (s >= limit) {
 241                             // Check for partial match in incremental mode.
 242                             if (s > start && isIncremental) {
 243                                 goto exit;
 244                             }
 245                             match = FALSE;
 246                             break;
 247                         }
 248                         UChar c = text.charAt(s++);
 249                         if (c != spec[ipat + prefixLen + i]) {
 250                             match = FALSE;
 251                             break;
 252                         }
 253                     }
 254
 255                     if (match) {
 256                         // At this point, we have a match
 257                         UnicodeString str(u);
 258                         text.handleReplaceBetween(start, s, str);
 259                         limit -= s - start - str.length();
 260                         // The following break statement leaves the
 261                         // loop that is traversing the forms in
 262                         // spec[].  We then parse the next input
 263                         // character.
 264                         break;
 265                     }
 266                 }
 267             }
 268
 269             ipat += prefixLen + suffixLen;
 270         }
 271
 272         if (start < limit) {
 273             start += UTF_CHAR_LENGTH(text.char32At(start));
 274         }
 275     }
 276
 277   exit:
 278     pos.contextLimit += limit - pos.limit;
 279     pos.limit = limit;
 280     pos.start = start;
 281 }
 282
 283 U_NAMESPACE_END
 284
 285 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 286
 287 //eof