icuSources/i18n/unesctrn.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (c) 2001-2004, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  *   Date        Name        Description
   7  *   11/19/2001  aliu        Creation.
   8  **********************************************************************
   9  */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/uchar.h"
  16 #include "unesctrn.h"
  17 #include "util.h"
  18
  19 #include "cmemory.h"
  20
  21 U_NAMESPACE_BEGIN
  22
  23 /**
  24  * Special character marking the end of the spec[] array.
  25  */
  26 static const UChar END = 0xFFFF;
  27
  28 // Unicode: "U+10FFFF" hex, min=4, max=6
  29 static const UChar SPEC_Unicode[] = {
  30     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
  31     END
  32 };
  33
  34 // Java: "\\uFFFF" hex, min=4, max=4
  35 static const UChar SPEC_Java[] = {
  36     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  37     END
  38 };
  39
  40 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
  41 static const UChar SPEC_C[] = {
  42     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  43     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
  44     END
  45 };
  46
  47 // XML: "&#x10FFFF;" hex, min=1, max=6
  48 static const UChar SPEC_XML[] = {
  49     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
  50     END
  51 };
  52
  53 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
  54 static const UChar SPEC_XML10[] = {
  55     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
  56     END
  57 };
  58
  59 // Perl: "\\x{263A}" hex, min=1, max=6
  60 static const UChar SPEC_Perl[] = {
  61     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
  62     END
  63 };
  64
  65 // All: Java, C, Perl, XML, XML10, Unicode
  66 static const UChar SPEC_Any[] = {
  67     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
  68     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
  69     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
  70     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
  71     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
  72     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
  73     END
  74 };
  75
  76 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
  77
  78 /**
  79  * Factory methods.  Ignore the context.
  80  */
  81 Transliterator* UnescapeTransliterator::_createUnicode(const UnicodeString& ID, Token /*context*/) {
  82     return new UnescapeTransliterator(ID, SPEC_Unicode);
  83 }
  84 Transliterator* UnescapeTransliterator::_createJava(const UnicodeString& ID, Token /*context*/) {
  85     return new UnescapeTransliterator(ID, SPEC_Java);
  86 }
  87 Transliterator* UnescapeTransliterator::_createC(const UnicodeString& ID, Token /*context*/) {
  88     return new UnescapeTransliterator(ID, SPEC_C);
  89 }
  90 Transliterator* UnescapeTransliterator::_createXML(const UnicodeString& ID, Token /*context*/) {
  91     return new UnescapeTransliterator(ID, SPEC_XML);
  92 }
  93 Transliterator* UnescapeTransliterator::_createXML10(const UnicodeString& ID, Token /*context*/) {
  94     return new UnescapeTransliterator(ID, SPEC_XML10);
  95 }
  96 Transliterator* UnescapeTransliterator::_createPerl(const UnicodeString& ID, Token /*context*/) {
  97     return new UnescapeTransliterator(ID, SPEC_Perl);
  98 }
  99 Transliterator* UnescapeTransliterator::_createAny(const UnicodeString& ID, Token /*context*/) {
 100     return new UnescapeTransliterator(ID, SPEC_Any);
 101 }
 102
 103 /**
 104  * Registers standard variants with the system.  Called by
 105  * Transliterator during initialization.
 106  */
 107 void UnescapeTransliterator::registerIDs() {
 108     Token t = integerToken(0);
 109
 110     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
 111
 112     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
 113
 114     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
 115
 116     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
 117
 118     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
 119
 120     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
 121
 122     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
 123 }
 124
 125 /**
 126  * Constructor.  Takes the encoded spec array.
 127  */
 128 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
 129                                                const UChar *newSpec) :
 130     Transliterator(newID, NULL)
 131 {
 132     this->spec = copySpec(newSpec);
 133 }
 134
 135 /**
 136  * Copy constructor.
 137  */
 138 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
 139     Transliterator(o) {
 140     this->spec = copySpec(o.spec);
 141 }
 142
 143 UnescapeTransliterator::~UnescapeTransliterator() {
 144     uprv_free(spec);
 145 }
 146
 147 /**
 148  * Transliterator API.
 149  */
 150 Transliterator* UnescapeTransliterator::clone() const {
 151     return new UnescapeTransliterator(*this);
 152 }
 153
 154 UChar* UnescapeTransliterator::copySpec(const UChar* spec) {
 155     int32_t len = 0;
 156     while (spec[len] != END) {
 157         ++len;
 158     }
 159     ++len;
 160     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
 161     uprv_memcpy(result, spec, len*sizeof(result[0]));
 162     return result;
 163 }
 164
 165 /**
 166  * Implements {@link Transliterator#handleTransliterate}.
 167  */
 168 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
 169                                                  UBool isIncremental) const {
 170     int32_t start = pos.start;
 171     int32_t limit = pos.limit;
 172     int32_t i, j, ipat;
 173
 174     while (start < limit) {
 175         // Loop over the forms in spec[].  Exit this loop when we
 176         // match one of the specs.  Exit the outer loop if a
 177         // partial match is detected and isIncremental is true.
 178         for (j=0, ipat=0; spec[ipat] != END; ++j) {
 179
 180             // Read the header
 181             int32_t prefixLen = spec[ipat++];
 182             int32_t suffixLen = spec[ipat++];
 183             int8_t  radix     = (int8_t) spec[ipat++];
 184             int32_t minDigits = spec[ipat++];
 185             int32_t maxDigits = spec[ipat++];
 186
 187             // s is a copy of start that is advanced over the
 188             // characters as we parse them.
 189             int32_t s = start;
 190             UBool match = TRUE;
 191
 192             for (i=0; i<prefixLen; ++i) {
 193                 if (s >= limit) {
 194                     if (i > 0) {
 195                         // We've already matched a character.  This is
 196                         // a partial match, so we return if in
 197                         // incremental mode.  In non-incremental mode,
 198                         // go to the next spec.
 199                         if (isIncremental) {
 200                             goto exit;
 201                         }
 202                         match = FALSE;
 203                         break;
 204                     }
 205                 }
 206                 UChar c = text.charAt(s++);
 207                 if (c != spec[ipat + i]) {
 208                     match = FALSE;
 209                     break;
 210                 }
 211             }
 212
 213             if (match) {
 214                 UChar32 u = 0;
 215                 int32_t digitCount = 0;
 216                 for (;;) {
 217                     if (s >= limit) {
 218                         // Check for partial match in incremental mode.
 219                         if (s > start && isIncremental) {
 220                             goto exit;
 221                         }
 222                         break;
 223                     }
 224                     UChar32 ch = text.char32At(s);
 225                     int32_t digit = u_digit(ch, radix);
 226                     if (digit < 0) {
 227                         break;
 228                     }
 229                     s += UTF_CHAR_LENGTH(ch);
 230                     u = (u * radix) + digit;
 231                     if (++digitCount == maxDigits) {
 232                         break;
 233                     }
 234                 }
 235
 236                 match = (digitCount >= minDigits);
 237
 238                 if (match) {
 239                     for (i=0; i<suffixLen; ++i) {
 240                         if (s >= limit) {
 241                             // Check for partial match in incremental mode.
 242                             if (s > start && isIncremental) {
 243                                 goto exit;
 244                             }
 245                             match = FALSE;
 246                             break;
 247                         }
 248                         UChar c = text.charAt(s++);
 249                         if (c != spec[ipat + prefixLen + i]) {
 250                             match = FALSE;
 251                             break;
 252                         }
 253                     }
 254
 255                     if (match) {
 256                         // At this point, we have a match
 257                         UnicodeString str(u);
 258                         text.handleReplaceBetween(start, s, str);
 259                         limit -= s - start - str.length();
 260                         // The following break statement leaves the
 261                         // loop that is traversing the forms in
 262                         // spec[].  We then parse the next input
 263                         // character.
 264                         break;
 265                     }
 266                 }
 267             }
 268
 269             ipat += prefixLen + suffixLen;
 270         }
 271
 272         if (start < limit) {
 273             start += UTF_CHAR_LENGTH(text.char32At(start));
 274         }
 275     }
 276
 277   exit:
 278     pos.contextLimit += limit - pos.limit;
 279     pos.limit = limit;
 280     pos.start = start;
 281 }
 282
 283 U_NAMESPACE_END
 284
 285 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 286
 287 //eof