icuSources/i18n/name2uni.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2006, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   06/07/01    aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/unifilt.h"
  16 #include "unicode/uchar.h"
  17 #include "unicode/uniset.h"
  18 #include "name2uni.h"
  19 #include "cmemory.h"
  20 #include "uprops.h"
  21 #include "uinvchar.h"
  22 #include "util.h"
  23
  24 U_NAMESPACE_BEGIN
  25
  26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator)
  27
  28 static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~"
  29 static const UChar OPEN_DELIM  = 92;  // '\\' first char of OPEN
  30 static const UChar CLOSE_DELIM = 125; // '}'
  31 static const UChar SPACE       = 32;  // ' '
  32
  33 U_CDECL_BEGIN
  34
  35 // USetAdder implementation
  36 // Does not use uset.h to reduce code dependencies
  37 static void U_CALLCONV
  38 _set_add(USet *set, UChar32 c) {
  39     ((UnicodeSet *)set)->add(c);
  40 }
  41
  42 static void U_CALLCONV
  43 _set_addRange(USet *set, UChar32 start, UChar32 end) {
  44     ((UnicodeSet *)set)->add(start, end);
  45 }
  46
  47 static void U_CALLCONV
  48 _set_addString(USet *set, const UChar *str, int32_t length) {
  49     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
  50 }
  51
  52 U_CDECL_END
  53
  54 /**
  55  * Constructs a transliterator with the default delimiters '{' and
  56  * '}'.
  57  */
  58 NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
  59     Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) {
  60
  61     UnicodeSet *legalPtr = &legal;
  62     // Get the legal character set
  63     USetAdder sa = {
  64         (USet *)legalPtr, // USet* == UnicodeSet*
  65         _set_add,
  66         _set_addRange,
  67         _set_addString,
  68         NULL // don't need remove()
  69     };
  70     uprv_getCharNameCharacters(&sa);
  71 }
  72
  73 /**
  74  * Destructor.
  75  */
  76 NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
  77
  78 /**
  79  * Copy constructor.
  80  */
  81 NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
  82     Transliterator(o), legal(o.legal) {}
  83
  84 /**
  85  * Assignment operator.
  86  */
  87 NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
  88                              const NameUnicodeTransliterator& o) {
  89     Transliterator::operator=(o);
  90     // not necessary: the legal sets should all be the same -- legal=o.legal;
  91     return *this;
  92 }
  93
  94 /**
  95  * Transliterator API.
  96  */
  97 Transliterator* NameUnicodeTransliterator::clone(void) const {
  98     return new NameUnicodeTransliterator(*this);
  99 }
 100
 101 /**
 102  * Implements {@link Transliterator#handleTransliterate}.
 103  */
 104 void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
 105                                                     UBool isIncremental) const {
 106     // The failure mode, here and below, is to behave like Any-Null,
 107     // if either there is no name data (max len == 0) or there is no
 108     // memory (malloc() => NULL).
 109
 110     int32_t maxLen = uprv_getMaxCharNameLength();
 111     if (maxLen == 0) {
 112         offsets.start = offsets.limit;
 113         return;
 114     }
 115
 116     // Accomodate the longest possible name
 117     ++maxLen; // allow for temporary trailing space
 118     char* cbuf = (char*) uprv_malloc(maxLen);
 119     if (cbuf == NULL) {
 120         offsets.start = offsets.limit;
 121         return;
 122     }
 123
 124     UnicodeString openPat(TRUE, OPEN, -1);
 125     UnicodeString str, name;
 126
 127     int32_t cursor = offsets.start;
 128     int32_t limit = offsets.limit;
 129
 130     // Modes:
 131     // 0 - looking for open delimiter
 132     // 1 - after open delimiter
 133     int32_t mode = 0;
 134     int32_t openPos = -1; // open delim candidate pos
 135
 136     UChar32 c;
 137     while (cursor < limit) {
 138         c = text.char32At(cursor);
 139
 140         switch (mode) {
 141         case 0: // looking for open delimiter
 142             if (c == OPEN_DELIM) { // quick check first
 143                 openPos = cursor;
 144                 int32_t i =
 145                     ICU_Utility::parsePattern(openPat, text, cursor, limit);
 146                 if (i >= 0 && i < limit) {
 147                     mode = 1;
 148                     name.truncate(0);
 149                     cursor = i;
 150                     continue; // *** reprocess char32At(cursor)
 151                 }
 152             }
 153             break;
 154
 155         case 1: // after open delimiter
 156             // Look for legal chars.  If \s+ is found, convert it
 157             // to a single space.  If closeDelimiter is found, exit
 158             // the loop.  If any other character is found, exit the
 159             // loop.  If the limit is reached, exit the loop.
 160
 161             // Convert \s+ => SPACE.  This assumes there are no
 162             // runs of >1 space characters in names.
 163             if (uprv_isRuleWhiteSpace(c)) {
 164                 // Ignore leading whitespace
 165                 if (name.length() > 0 &&
 166                     name.charAt(name.length()-1) != SPACE) {
 167                     name.append(SPACE);
 168                     // If we are too long then abort.  maxLen includes
 169                     // temporary trailing space, so use '>'.
 170                     if (name.length() > maxLen) {
 171                         mode = 0;
 172                     }
 173                 }
 174                 break;
 175             }
 176
 177             if (c == CLOSE_DELIM) {
 178                 int32_t len = name.length();
 179
 180                 // Delete trailing space, if any
 181                 if (len > 0 &&
 182                     name.charAt(len-1) == SPACE) {
 183                     --len;
 184                 }
 185
 186                 if (uprv_isInvariantUString(name.getBuffer(), len)) {
 187                     name.extract(0, len, cbuf, maxLen, US_INV);
 188
 189                     UErrorCode status = U_ZERO_ERROR;
 190                     c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
 191                     if (U_SUCCESS(status)) {
 192                         // Lookup succeeded
 193
 194                         // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1);
 195                         cursor++; // advance over CLOSE_DELIM
 196
 197                         str.truncate(0);
 198                         str.append(c);
 199                         text.handleReplaceBetween(openPos, cursor, str);
 200
 201                         // Adjust indices for the change in the length of
 202                         // the string.  Do not assume that str.length() ==
 203                         // 1, in case of surrogates.
 204                         int32_t delta = cursor - openPos - str.length();
 205                         cursor -= delta;
 206                         limit -= delta;
 207                         // assert(cursor == openPos + str.length());
 208                     }
 209                 }
 210                 // If the lookup failed, we leave things as-is and
 211                 // still switch to mode 0 and continue.
 212                 mode = 0;
 213                 openPos = -1; // close off candidate
 214                 continue; // *** reprocess char32At(cursor)
 215             }
 216
 217             // Check if c is a legal char.  We assume here that
 218             // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
 219             // name, we don't have to go back to openPos+1.
 220             if (legal.contains(c)) {
 221                 name.append(c);
 222                 // If we go past the longest possible name then abort.
 223                 // maxLen includes temporary trailing space, so use '>='.
 224                 if (name.length() >= maxLen) {
 225                     mode = 0;
 226                 }
 227             }
 228
 229             // Invalid character
 230             else {
 231                 --cursor; // Backup and reprocess this character
 232                 mode = 0;
 233             }
 234
 235             break;
 236         }
 237
 238         cursor += UTF_CHAR_LENGTH(c);
 239     }
 240
 241     offsets.contextLimit += limit - offsets.limit;
 242     offsets.limit = limit;
 243     // In incremental mode, only advance the cursor up to the last
 244     // open delimiter candidate.
 245     offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
 246
 247     uprv_free(cbuf);
 248 }
 249
 250 U_NAMESPACE_END
 251
 252 #endif /* #if !UCONFIG_NO_TRANSLITERATION */