icuSources/i18n/name2uni.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   06/07/01    aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/unifilt.h"
  16 #include "unicode/uchar.h"
  17 #include "unicode/uniset.h"
  18 #include "name2uni.h"
  19 #include "cmemory.h"
  20 #include "uprops.h"
  21 #include "util.h"
  22
  23 U_NAMESPACE_BEGIN
  24
  25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator)
  26
  27 static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~"
  28 static const UChar OPEN_DELIM  = 92;  // '\\' first char of OPEN
  29 static const UChar CLOSE_DELIM = 125; // '}'
  30 static const UChar SPACE       = 32;  // ' '
  31
  32 U_CDECL_BEGIN
  33
  34 // USetAdder implementation
  35 // Does not use uset.h to reduce code dependencies
  36 static void U_CALLCONV
  37 _set_add(USet *set, UChar32 c) {
  38     ((UnicodeSet *)set)->add(c);
  39 }
  40
  41 static void U_CALLCONV
  42 _set_addRange(USet *set, UChar32 start, UChar32 end) {
  43     ((UnicodeSet *)set)->add(start, end);
  44 }
  45
  46 static void U_CALLCONV
  47 _set_addString(USet *set, const UChar *str, int32_t length) {
  48     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
  49 }
  50
  51 U_CDECL_END
  52
  53 /**
  54  * Constructs a transliterator with the default delimiters '{' and
  55  * '}'.
  56  */
  57 NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
  58     Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) {
  59
  60     // Get the legal character set
  61     USetAdder sa = {
  62         (USet *)&legal, // USet* == UnicodeSet*
  63         _set_add,
  64         _set_addRange,
  65         _set_addString
  66     };
  67     uprv_getCharNameCharacters(&sa);
  68 }
  69
  70 /**
  71  * Destructor.
  72  */
  73 NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
  74
  75 /**
  76  * Copy constructor.
  77  */
  78 NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
  79     Transliterator(o), legal(o.legal) {}
  80
  81 /**
  82  * Assignment operator.
  83  */
  84 NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
  85                              const NameUnicodeTransliterator& o) {
  86     Transliterator::operator=(o);
  87     // not necessary: the legal sets should all be the same -- legal=o.legal;
  88     return *this;
  89 }
  90
  91 /**
  92  * Transliterator API.
  93  */
  94 Transliterator* NameUnicodeTransliterator::clone(void) const {
  95     return new NameUnicodeTransliterator(*this);
  96 }
  97
  98 /**
  99  * Implements {@link Transliterator#handleTransliterate}.
 100  */
 101 void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
 102                                                     UBool isIncremental) const {
 103     // The failure mode, here and below, is to behave like Any-Null,
 104     // if either there is no name data (max len == 0) or there is no
 105     // memory (malloc() => NULL).
 106
 107     int32_t maxLen = uprv_getMaxCharNameLength();
 108     if (maxLen == 0) {
 109         offsets.start = offsets.limit;
 110         return;
 111     }
 112
 113     // Accomodate the longest possible name
 114     ++maxLen; // allow for temporary trailing space
 115     char* cbuf = (char*) uprv_malloc(maxLen);
 116     if (cbuf == NULL) {
 117         offsets.start = offsets.limit;
 118         return;
 119     }
 120
 121     UnicodeString openPat(TRUE, OPEN, -1);
 122     UnicodeString str, name;
 123
 124     int32_t cursor = offsets.start;
 125     int32_t limit = offsets.limit;
 126
 127     // Modes:
 128     // 0 - looking for open delimiter
 129     // 1 - after open delimiter
 130     int32_t mode = 0;
 131     int32_t openPos = -1; // open delim candidate pos
 132
 133     UChar32 c;
 134     while (cursor < limit) {
 135         c = text.char32At(cursor);
 136
 137         switch (mode) {
 138         case 0: // looking for open delimiter
 139             if (c == OPEN_DELIM) { // quick check first
 140                 openPos = cursor;
 141                 int32_t i =
 142                     ICU_Utility::parsePattern(openPat, text, cursor, limit);
 143                 if (i >= 0 && i < limit) {
 144                     mode = 1;
 145                     name.truncate(0);
 146                     cursor = i;
 147                     continue; // *** reprocess char32At(cursor)
 148                 }
 149             }
 150             break;
 151
 152         case 1: // after open delimiter
 153             // Look for legal chars.  If \s+ is found, convert it
 154             // to a single space.  If closeDelimiter is found, exit
 155             // the loop.  If any other character is found, exit the
 156             // loop.  If the limit is reached, exit the loop.
 157
 158             // Convert \s+ => SPACE.  This assumes there are no
 159             // runs of >1 space characters in names.
 160             if (uprv_isRuleWhiteSpace(c)) {
 161                 // Ignore leading whitespace
 162                 if (name.length() > 0 &&
 163                     name.charAt(name.length()-1) != SPACE) {
 164                     name.append(SPACE);
 165                     // If we are too long then abort.  maxLen includes
 166                     // temporary trailing space, so use '>'.
 167                     if (name.length() > maxLen) {
 168                         mode = 0;
 169                     }
 170                 }
 171                 break;
 172             }
 173
 174             if (c == CLOSE_DELIM) {
 175
 176                 int32_t len = name.length();
 177
 178                 // Delete trailing space, if any
 179                 if (len > 0 &&
 180                     name.charAt(len-1) == SPACE) {
 181                     --len;
 182                 }
 183
 184                 name.extract(0, len, cbuf, "");
 185
 186                 UErrorCode status = U_ZERO_ERROR;
 187                 c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
 188                 if (U_SUCCESS(status)) {
 189                     // Lookup succeeded
 190
 191                     // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1);
 192                     cursor++; // advance over CLOSE_DELIM
 193
 194                     str.truncate(0);
 195                     str.append(c);
 196                     text.handleReplaceBetween(openPos, cursor, str);
 197
 198                     // Adjust indices for the change in the length of
 199                     // the string.  Do not assume that str.length() ==
 200                     // 1, in case of surrogates.
 201                     int32_t delta = cursor - openPos - str.length();
 202                     cursor -= delta;
 203                     limit -= delta;
 204                     // assert(cursor == openPos + str.length());
 205                 }
 206                 // If the lookup failed, we leave things as-is and
 207                 // still switch to mode 0 and continue.
 208                 mode = 0;
 209                 openPos = -1; // close off candidate
 210                 continue; // *** reprocess char32At(cursor)
 211             }
 212
 213             // Check if c is a legal char.  We assume here that
 214             // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
 215             // name, we don't have to go back to openPos+1.
 216             if (legal.contains(c)) {
 217                 name.append(c);
 218                 // If we go past the longest possible name then abort.
 219                 // maxLen includes temporary trailing space, so use '>='.
 220                 if (name.length() >= maxLen) {
 221                     mode = 0;
 222                 }
 223             }
 224
 225             // Invalid character
 226             else {
 227                 --cursor; // Backup and reprocess this character
 228                 mode = 0;
 229             }
 230
 231             break;
 232         }
 233
 234         cursor += UTF_CHAR_LENGTH(c);
 235     }
 236
 237     offsets.contextLimit += limit - offsets.limit;
 238     offsets.limit = limit;
 239     // In incremental mode, only advance the cursor up to the last
 240     // open delimiter candidate.
 241     offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
 242
 243     uprv_free(cbuf);
 244 }
 245
 246 U_NAMESPACE_END
 247
 248 #endif /* #if !UCONFIG_NO_TRANSLITERATION */