]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
2ca993e8 | 4 | * Copyright (C) 1999-2015, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ****************************************************************************** | |
8 | * | |
9 | * | |
4388f060 | 10 | * ucnv_io.cpp: |
73c04bcf A |
11 | * initializes global variables and defines functions pertaining to converter |
12 | * name resolution aspect of the conversion code. | |
b75a7d8f A |
13 | * |
14 | * new implementation: | |
15 | * | |
16 | * created on: 1999nov22 | |
17 | * created by: Markus W. Scherer | |
18 | * | |
19 | * Use the binary cnvalias.icu (created from convrtrs.txt) to work | |
20 | * with aliases for converter names. | |
21 | * | |
22 | * Date Name Description | |
23 | * 11/22/1999 markus Created | |
24 | * 06/28/2002 grhoten Major overhaul of the converter alias design. | |
25 | * Now an alias can map to different converters | |
26 | * depending on the specified standard. | |
27 | ******************************************************************************* | |
28 | */ | |
29 | ||
30 | #include "unicode/utypes.h" | |
374ca955 A |
31 | |
32 | #if !UCONFIG_NO_CONVERSION | |
33 | ||
73c04bcf | 34 | #include "unicode/ucnv.h" |
b75a7d8f A |
35 | #include "unicode/udata.h" |
36 | ||
37 | #include "umutex.h" | |
374ca955 | 38 | #include "uarrsort.h" |
57a6839d | 39 | #include "uassert.h" |
374ca955 | 40 | #include "udataswp.h" |
b75a7d8f A |
41 | #include "cstring.h" |
42 | #include "cmemory.h" | |
43 | #include "ucnv_io.h" | |
44 | #include "uenumimp.h" | |
45 | #include "ucln_cmn.h" | |
46 | ||
47 | /* Format of cnvalias.icu ----------------------------------------------------- | |
48 | * | |
49 | * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt. | |
50 | * This binary form contains several tables. All indexes are to uint16_t | |
51 | * units, and not to the bytes (uint8_t units). Addressing everything on | |
52 | * 16-bit boundaries allows us to store more information with small index | |
53 | * numbers, which are also 16-bit in size. The majority of the table (except | |
54 | * the string table) are 16-bit numbers. | |
55 | * | |
56 | * First there is the size of the Table of Contents (TOC). The TOC | |
57 | * entries contain the size of each section. In order to find the offset | |
58 | * you just need to sum up the previous offsets. | |
374ca955 A |
59 | * The TOC length and entries are an array of uint32_t values. |
60 | * The first section after the TOC starts immediately after the TOC. | |
b75a7d8f A |
61 | * |
62 | * 1) This section contains a list of converters. This list contains indexes | |
63 | * into the string table for the converter name. The index of this list is | |
64 | * also used by other sections, which are mentioned later on. | |
374ca955 | 65 | * This list is not sorted. |
b75a7d8f A |
66 | * |
67 | * 2) This section contains a list of tags. This list contains indexes | |
68 | * into the string table for the tag name. The index of this list is | |
69 | * also used by other sections, which are mentioned later on. | |
374ca955 | 70 | * This list is in priority order of standards. |
b75a7d8f A |
71 | * |
72 | * 3) This section contains a list of sorted unique aliases. This | |
73 | * list contains indexes into the string table for the alias name. The | |
74 | * index of this list is also used by other sections, like the 4th section. | |
75 | * The index for the 3rd and 4th section is used to get the | |
76 | * alias -> converter name mapping. Section 3 and 4 form a two column table. | |
73c04bcf A |
77 | * Some of the most significant bits of each index may contain other |
78 | * information (see findConverter for details). | |
b75a7d8f A |
79 | * |
80 | * 4) This section contains a list of mapped converter names. Consider this | |
81 | * as a table that maps the 3rd section to the 1st section. This list contains | |
82 | * indexes into the 1st section. The index of this list is the same index in | |
83 | * the 3rd section. There is also some extra information in the high bits of | |
84 | * each converter index in this table. Currently it's only used to say that | |
85 | * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK | |
86 | * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is | |
87 | * the predigested form of the 5th section so that an alias lookup can be fast. | |
374ca955 | 88 | * |
b75a7d8f A |
89 | * 5) This section contains a 2D array with indexes to the 6th section. This |
90 | * section is the full form of all alias mappings. The column index is the | |
91 | * index into the converter list (column header). The row index is the index | |
92 | * to tag list (row header). This 2D array is the top part of a 3D array. The | |
93 | * third dimension is in the 6th section. | |
94 | * | |
95 | * 6) This is a blob of variable length arrays. Each array starts with a size, | |
96 | * and is followed by indexes to alias names in the string table. This is | |
97 | * the third dimension to the section 5. No other section should be referencing | |
98 | * this section. | |
99 | * | |
73c04bcf A |
100 | * 7) Starting in ICU 3.6, this can be a UConverterAliasOptions struct. Its |
101 | * presence indicates that a section 9 exists. UConverterAliasOptions specifies | |
102 | * what type of string normalization is used among other potential things in the | |
103 | * future. | |
b75a7d8f A |
104 | * |
105 | * 8) This is the string table. All strings are indexed on an even address. | |
106 | * There are two reasons for this. First many chip architectures locate strings | |
107 | * faster on even address boundaries. Second, since all indexes are 16-bit | |
108 | * numbers, this string table can be 128KB in size instead of 64KB when we | |
109 | * only have strings starting on an even address. | |
110 | * | |
73c04bcf A |
111 | * 9) When present this is a set of prenormalized strings from section 8. This |
112 | * table contains normalized strings with the dashes and spaces stripped out, | |
113 | * and all strings lowercased. In the future, the options in section 7 may state | |
114 | * other types of normalization. | |
b75a7d8f A |
115 | * |
116 | * Here is the concept of section 5 and 6. It's a 3D cube. Each tag | |
117 | * has a unique alias among all converters. That same alias can | |
118 | * be mentioned in other standards on different converters, | |
119 | * but only one alias per tag can be unique. | |
120 | * | |
121 | * | |
122 | * Converter Names (Usually in TR22 form) | |
123 | * -------------------------------------------. | |
124 | * T / /| | |
125 | * a / / | | |
126 | * g / / | | |
127 | * s / / | | |
128 | * / / | | |
129 | * ------------------------------------------/ | | |
130 | * A | | | | |
131 | * l | | | | |
132 | * i | | / | |
133 | * a | | / | |
134 | * s | | / | |
135 | * e | | / | |
136 | * s | |/ | |
137 | * ------------------------------------------- | |
138 | * | |
139 | * | |
140 | * | |
141 | * Here is what it really looks like. It's like swiss cheese. | |
142 | * There are holes. Some converters aren't recognized by | |
143 | * a standard, or they are really old converters that the | |
144 | * standard doesn't recognize anymore. | |
145 | * | |
146 | * Converter Names (Usually in TR22 form) | |
147 | * -------------------------------------------. | |
148 | * T /##########################################/| | |
149 | * a / # # /# | |
374ca955 A |
150 | * g / # ## ## ### # ### ### ### #/ |
151 | * s / # ##### #### ## ## #/# | |
152 | * / ### # # ## # # # ### # # #/## | |
b75a7d8f A |
153 | * ------------------------------------------/# # |
154 | * A |### # # ## # # # ### # # #|# # | |
155 | * l |# # # # # ## # #|# # | |
156 | * i |# # # # # # #|# | |
157 | * a |# #|# | |
158 | * s | #|# | |
374ca955 A |
159 | * e |
160 | * s | |
161 | * | |
b75a7d8f A |
162 | */ |
163 | ||
/**
 * Used by the UEnumeration API.
 * The standard-alias enumeration callbacks keep their state in an instance
 * of this struct, reached through UEnumeration::context.
 */
typedef struct UAliasContext {
    /* Offset of the alias list inside gMainTable.taggedAliasLists; 0 means "no list". */
    uint32_t listOffset;
    /* Index of the next alias of that list to be handed out. */
    uint32_t listIdx;
} UAliasContext;
171 | ||
172 | static const char DATA_NAME[] = "cnvalias"; | |
173 | static const char DATA_TYPE[] = "icu"; | |
174 | ||
175 | static UDataMemory *gAliasData=NULL; | |
57a6839d | 176 | static icu::UInitOnce gAliasDataInitOnce = U_INITONCE_INITIALIZER; |
b75a7d8f | 177 | |
374ca955 A |
/*
 * Indexes into the uint32_t table of contents at the start of cnvalias.icu.
 * Entry 0 holds the TOC length; the following entries hold the sizes of the
 * sections described in the format comment above.
 */
enum {
    tocLengthIndex             = 0,
    converterListIndex         = 1,
    tagListIndex               = 2,
    aliasListIndex             = 3,
    untaggedConvArrayIndex     = 4,
    taggedAliasArrayIndex      = 5,
    taggedAliasListsIndex      = 6,
    tableOptionsIndex          = 7,
    stringTableIndex           = 8,
    normalizedStringTableIndex = 9,
    offsetsCount               = 10, /* length of the swapper's temporary offsets[] */
    minTocLength               = 8   /* min. tocLength in the file, does not count the tocLengthIndex! */
};
192 | ||
73c04bcf A |
193 | static const UConverterAliasOptions defaultTableOptions = { |
194 | UCNV_IO_UNNORMALIZED, | |
195 | 0 /* containsCnvOptionInfo */ | |
196 | }; | |
197 | static UConverterAlias gMainTable; | |
b75a7d8f | 198 | |
73c04bcf A |
/*
 * Turn a 16-bit-unit index into a pointer into the (normalized) string table.
 * The whole expansion is parenthesized so that the result can safely be used
 * inside a larger expression, e.g. GET_STRING(i)[0].
 */
#define GET_STRING(idx) ((const char *)(gMainTable.stringTable + (idx)))
#define GET_NORMALIZED_STRING(idx) ((const char *)(gMainTable.normalizedStringTable + (idx)))
b75a7d8f A |
201 | |
202 | static UBool U_CALLCONV | |
4388f060 A |
203 | isAcceptable(void * /*context*/, |
204 | const char * /*type*/, const char * /*name*/, | |
b75a7d8f A |
205 | const UDataInfo *pInfo) { |
206 | return (UBool)( | |
207 | pInfo->size>=20 && | |
208 | pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
209 | pInfo->charsetFamily==U_CHARSET_FAMILY && | |
210 | pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */ | |
211 | pInfo->dataFormat[1]==0x76 && | |
212 | pInfo->dataFormat[2]==0x41 && | |
213 | pInfo->dataFormat[3]==0x6c && | |
214 | pInfo->formatVersion[0]==3); | |
215 | } | |
216 | ||
374ca955 A |
217 | static UBool U_CALLCONV ucnv_io_cleanup(void) |
218 | { | |
219 | if (gAliasData) { | |
220 | udata_close(gAliasData); | |
221 | gAliasData = NULL; | |
222 | } | |
57a6839d | 223 | gAliasDataInitOnce.reset(); |
374ca955 | 224 | |
73c04bcf | 225 | uprv_memset(&gMainTable, 0, sizeof(gMainTable)); |
374ca955 A |
226 | |
227 | return TRUE; /* Everything was cleaned up */ | |
228 | } | |
229 | ||
57a6839d A |
230 | static void U_CALLCONV initAliasData(UErrorCode &errCode) { |
231 | UDataMemory *data; | |
232 | const uint16_t *table; | |
233 | const uint32_t *sectionSizes; | |
234 | uint32_t tableStart; | |
235 | uint32_t currOffset; | |
b75a7d8f | 236 | |
57a6839d | 237 | ucln_common_registerCleanup(UCLN_COMMON_UCNV_IO, ucnv_io_cleanup); |
b75a7d8f | 238 | |
57a6839d A |
239 | U_ASSERT(gAliasData == NULL); |
240 | data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errCode); | |
241 | if(U_FAILURE(errCode)) { | |
242 | return; | |
243 | } | |
b75a7d8f | 244 | |
57a6839d A |
245 | sectionSizes = (const uint32_t *)udata_getMemory(data); |
246 | table = (const uint16_t *)sectionSizes; | |
b75a7d8f | 247 | |
57a6839d A |
248 | tableStart = sectionSizes[0]; |
249 | if (tableStart < minTocLength) { | |
250 | errCode = U_INVALID_FORMAT_ERROR; | |
251 | udata_close(data); | |
252 | return; | |
253 | } | |
254 | gAliasData = data; | |
255 | ||
256 | gMainTable.converterListSize = sectionSizes[1]; | |
257 | gMainTable.tagListSize = sectionSizes[2]; | |
258 | gMainTable.aliasListSize = sectionSizes[3]; | |
259 | gMainTable.untaggedConvArraySize = sectionSizes[4]; | |
260 | gMainTable.taggedAliasArraySize = sectionSizes[5]; | |
261 | gMainTable.taggedAliasListsSize = sectionSizes[6]; | |
262 | gMainTable.optionTableSize = sectionSizes[7]; | |
263 | gMainTable.stringTableSize = sectionSizes[8]; | |
264 | ||
265 | if (tableStart > 8) { | |
266 | gMainTable.normalizedStringTableSize = sectionSizes[9]; | |
267 | } | |
b75a7d8f | 268 | |
57a6839d A |
269 | currOffset = tableStart * (sizeof(uint32_t)/sizeof(uint16_t)) + (sizeof(uint32_t)/sizeof(uint16_t)); |
270 | gMainTable.converterList = table + currOffset; | |
b75a7d8f | 271 | |
57a6839d A |
272 | currOffset += gMainTable.converterListSize; |
273 | gMainTable.tagList = table + currOffset; | |
b75a7d8f | 274 | |
57a6839d A |
275 | currOffset += gMainTable.tagListSize; |
276 | gMainTable.aliasList = table + currOffset; | |
b75a7d8f | 277 | |
57a6839d A |
278 | currOffset += gMainTable.aliasListSize; |
279 | gMainTable.untaggedConvArray = table + currOffset; | |
b75a7d8f | 280 | |
57a6839d A |
281 | currOffset += gMainTable.untaggedConvArraySize; |
282 | gMainTable.taggedAliasArray = table + currOffset; | |
b75a7d8f | 283 | |
57a6839d A |
284 | /* aliasLists is a 1's based array, but it has a padding character */ |
285 | currOffset += gMainTable.taggedAliasArraySize; | |
286 | gMainTable.taggedAliasLists = table + currOffset; | |
73c04bcf | 287 | |
57a6839d A |
288 | currOffset += gMainTable.taggedAliasListsSize; |
289 | if (gMainTable.optionTableSize > 0 | |
290 | && ((const UConverterAliasOptions *)(table + currOffset))->stringNormalizationType < UCNV_IO_NORM_TYPE_COUNT) | |
291 | { | |
292 | /* Faster table */ | |
293 | gMainTable.optionTable = (const UConverterAliasOptions *)(table + currOffset); | |
294 | } | |
295 | else { | |
296 | /* Smaller table, or I can't handle this normalization mode! | |
297 | Use the original slower table lookup. */ | |
298 | gMainTable.optionTable = &defaultTableOptions; | |
299 | } | |
b75a7d8f | 300 | |
57a6839d A |
301 | currOffset += gMainTable.optionTableSize; |
302 | gMainTable.stringTable = table + currOffset; | |
729e4ab9 | 303 | |
57a6839d A |
304 | currOffset += gMainTable.stringTableSize; |
305 | gMainTable.normalizedStringTable = ((gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED) | |
306 | ? gMainTable.stringTable : (table + currOffset)); | |
307 | } | |
b75a7d8f | 308 | |
b75a7d8f | 309 | |
57a6839d A |
310 | static UBool |
311 | haveAliasData(UErrorCode *pErrorCode) { | |
312 | umtx_initOnce(gAliasDataInitOnce, &initAliasData, *pErrorCode); | |
313 | return U_SUCCESS(*pErrorCode); | |
b75a7d8f A |
314 | } |
315 | ||
4388f060 | 316 | static inline UBool |
b75a7d8f A |
317 | isAlias(const char *alias, UErrorCode *pErrorCode) { |
318 | if(alias==NULL) { | |
319 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
320 | return FALSE; | |
b75a7d8f | 321 | } |
73c04bcf | 322 | return (UBool)(*alias!=0); |
b75a7d8f A |
323 | } |
324 | ||
b75a7d8f | 325 | static uint32_t getTagNumber(const char *tagname) { |
73c04bcf | 326 | if (gMainTable.tagList) { |
b75a7d8f | 327 | uint32_t tagNum; |
73c04bcf A |
328 | for (tagNum = 0; tagNum < gMainTable.tagListSize; tagNum++) { |
329 | if (!uprv_stricmp(GET_STRING(gMainTable.tagList[tagNum]), tagname)) { | |
b75a7d8f A |
330 | return tagNum; |
331 | } | |
332 | } | |
333 | } | |
334 | ||
335 | return UINT32_MAX; | |
336 | } | |
337 | ||
73c04bcf A |
/* character types relevant for ucnv_compareNames() */
enum {
    UIGNORE,    /* neither a letter nor a digit: skipped */
    ZERO,       /* the digit 0 */
    NONZERO,    /* the digits 1..9 */
    MINLETTER   /* any values from here on are lowercase letter mappings */
};

/*
 * character types for ASCII 00..7F
 * Letters map to their lowercase ASCII code, digits to ZERO/NONZERO and
 * everything else to 0 (UIGNORE).
 */
static const uint8_t asciiTypes[128] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0,
    0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0,
    0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0
};

/*
 * Character type of c; bytes outside 00..7F are ignorable.
 * The argument is parenthesized at every use so that an expression argument
 * is cast as a whole before it indexes the table. Note that c is evaluated
 * more than once.
 */
#define GET_ASCII_TYPE(c) ((int8_t)(c) >= 0 ? asciiTypes[(uint8_t)(c)] : (uint8_t)UIGNORE)
73c04bcf A |
359 | |
360 | /* character types for EBCDIC 80..FF */ | |
361 | static const uint8_t ebcdicTypes[128] = { | |
362 | 0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0, 0, 0, 0, 0, 0, | |
363 | 0, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0, 0, 0, 0, 0, 0, | |
364 | 0, 0, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0, 0, 0, 0, 0, 0, | |
365 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
366 | 0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0, 0, 0, 0, 0, 0, | |
367 | 0, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0, 0, 0, 0, 0, 0, | |
368 | 0, 0, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0, 0, 0, 0, 0, 0, | |
369 | ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0 | |
370 | }; | |
371 | ||
57a6839d | 372 | #define GET_EBCDIC_TYPE(c) ((int8_t)(c) < 0 ? ebcdicTypes[(c)&0x7f] : (uint8_t)UIGNORE) |
73c04bcf A |
373 | |
374 | #if U_CHARSET_FAMILY==U_ASCII_FAMILY | |
375 | # define GET_CHAR_TYPE(c) GET_ASCII_TYPE(c) | |
376 | #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY | |
377 | # define GET_CHAR_TYPE(c) GET_EBCDIC_TYPE(c) | |
378 | #else | |
379 | # error U_CHARSET_FAMILY is not valid | |
380 | #endif | |
381 | ||
b75a7d8f A |
382 | /* @see ucnv_compareNames */ |
383 | U_CFUNC char * U_EXPORT2 | |
374ca955 | 384 | ucnv_io_stripASCIIForCompare(char *dst, const char *name) { |
b75a7d8f | 385 | char *dstItr = dst; |
73c04bcf A |
386 | uint8_t type, nextType; |
387 | char c1; | |
388 | UBool afterDigit = FALSE; | |
389 | ||
390 | while ((c1 = *name++) != 0) { | |
391 | type = GET_ASCII_TYPE(c1); | |
392 | switch (type) { | |
57a6839d | 393 | case UIGNORE: |
73c04bcf A |
394 | afterDigit = FALSE; |
395 | continue; /* ignore all but letters and digits */ | |
396 | case ZERO: | |
397 | if (!afterDigit) { | |
398 | nextType = GET_ASCII_TYPE(*name); | |
399 | if (nextType == ZERO || nextType == NONZERO) { | |
400 | continue; /* ignore leading zero before another digit */ | |
401 | } | |
402 | } | |
403 | break; | |
404 | case NONZERO: | |
405 | afterDigit = TRUE; | |
406 | break; | |
407 | default: | |
408 | c1 = (char)type; /* lowercased letter */ | |
409 | afterDigit = FALSE; | |
410 | break; | |
b75a7d8f | 411 | } |
73c04bcf | 412 | *dstItr++ = c1; |
374ca955 | 413 | } |
73c04bcf | 414 | *dstItr = 0; |
374ca955 A |
415 | return dst; |
416 | } | |
417 | ||
418 | U_CFUNC char * U_EXPORT2 | |
419 | ucnv_io_stripEBCDICForCompare(char *dst, const char *name) { | |
374ca955 | 420 | char *dstItr = dst; |
73c04bcf A |
421 | uint8_t type, nextType; |
422 | char c1; | |
423 | UBool afterDigit = FALSE; | |
424 | ||
425 | while ((c1 = *name++) != 0) { | |
426 | type = GET_EBCDIC_TYPE(c1); | |
427 | switch (type) { | |
57a6839d | 428 | case UIGNORE: |
73c04bcf A |
429 | afterDigit = FALSE; |
430 | continue; /* ignore all but letters and digits */ | |
431 | case ZERO: | |
432 | if (!afterDigit) { | |
433 | nextType = GET_EBCDIC_TYPE(*name); | |
434 | if (nextType == ZERO || nextType == NONZERO) { | |
435 | continue; /* ignore leading zero before another digit */ | |
436 | } | |
437 | } | |
438 | break; | |
439 | case NONZERO: | |
440 | afterDigit = TRUE; | |
441 | break; | |
442 | default: | |
443 | c1 = (char)type; /* lowercased letter */ | |
444 | afterDigit = FALSE; | |
445 | break; | |
374ca955 | 446 | } |
73c04bcf | 447 | *dstItr++ = c1; |
b75a7d8f | 448 | } |
73c04bcf | 449 | *dstItr = 0; |
b75a7d8f A |
450 | return dst; |
451 | } | |
452 | ||
453 | /** | |
73c04bcf A |
454 | * Do a fuzzy compare of two converter/alias names. |
455 | * The comparison is case-insensitive, ignores leading zeroes if they are not | |
456 | * followed by further digits, and ignores all but letters and digits. | |
457 | * Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent. | |
458 | * See section 1.4, Charset Alias Matching in Unicode Technical Standard #22 | |
459 | * at http://www.unicode.org/reports/tr22/ | |
374ca955 | 460 | * |
b75a7d8f A |
461 | * This is a symmetrical (commutative) operation; order of arguments |
462 | * is insignificant. This is an important property for sorting the | |
463 | * list (when the list is preprocessed into binary form) and for | |
464 | * performing binary searches on it at run time. | |
374ca955 | 465 | * |
b75a7d8f A |
466 | * @param name1 a converter name or alias, zero-terminated |
467 | * @param name2 a converter name or alias, zero-terminated | |
468 | * @return 0 if the names match, or a negative value if the name1 | |
469 | * lexically precedes name2, or a positive value if the name1 | |
470 | * lexically follows name2. | |
471 | * | |
472 | * @see ucnv_io_stripForCompare | |
473 | */ | |
474 | U_CAPI int U_EXPORT2 | |
475 | ucnv_compareNames(const char *name1, const char *name2) { | |
476 | int rc; | |
73c04bcf | 477 | uint8_t type, nextType; |
b75a7d8f | 478 | char c1, c2; |
73c04bcf | 479 | UBool afterDigit1 = FALSE, afterDigit2 = FALSE; |
b75a7d8f A |
480 | |
481 | for (;;) { | |
73c04bcf A |
482 | while ((c1 = *name1++) != 0) { |
483 | type = GET_CHAR_TYPE(c1); | |
484 | switch (type) { | |
57a6839d | 485 | case UIGNORE: |
73c04bcf A |
486 | afterDigit1 = FALSE; |
487 | continue; /* ignore all but letters and digits */ | |
488 | case ZERO: | |
489 | if (!afterDigit1) { | |
490 | nextType = GET_CHAR_TYPE(*name1); | |
491 | if (nextType == ZERO || nextType == NONZERO) { | |
492 | continue; /* ignore leading zero before another digit */ | |
493 | } | |
494 | } | |
495 | break; | |
496 | case NONZERO: | |
497 | afterDigit1 = TRUE; | |
498 | break; | |
499 | default: | |
500 | c1 = (char)type; /* lowercased letter */ | |
501 | afterDigit1 = FALSE; | |
502 | break; | |
503 | } | |
504 | break; /* deliver c1 */ | |
b75a7d8f | 505 | } |
73c04bcf A |
506 | while ((c2 = *name2++) != 0) { |
507 | type = GET_CHAR_TYPE(c2); | |
508 | switch (type) { | |
57a6839d | 509 | case UIGNORE: |
73c04bcf A |
510 | afterDigit2 = FALSE; |
511 | continue; /* ignore all but letters and digits */ | |
512 | case ZERO: | |
513 | if (!afterDigit2) { | |
514 | nextType = GET_CHAR_TYPE(*name2); | |
515 | if (nextType == ZERO || nextType == NONZERO) { | |
516 | continue; /* ignore leading zero before another digit */ | |
517 | } | |
518 | } | |
519 | break; | |
520 | case NONZERO: | |
521 | afterDigit2 = TRUE; | |
522 | break; | |
523 | default: | |
524 | c2 = (char)type; /* lowercased letter */ | |
525 | afterDigit2 = FALSE; | |
526 | break; | |
527 | } | |
528 | break; /* deliver c2 */ | |
b75a7d8f A |
529 | } |
530 | ||
531 | /* If we reach the ends of both strings then they match */ | |
532 | if ((c1|c2)==0) { | |
533 | return 0; | |
534 | } | |
374ca955 | 535 | |
b75a7d8f | 536 | /* Case-insensitive comparison */ |
73c04bcf | 537 | rc = (int)(unsigned char)c1 - (int)(unsigned char)c2; |
b75a7d8f A |
538 | if (rc != 0) { |
539 | return rc; | |
540 | } | |
b75a7d8f A |
541 | } |
542 | } | |
543 | ||
544 | /* | |
545 | * search for an alias | |
546 | * return the converter number index for gConverterList | |
547 | */ | |
4388f060 | 548 | static inline uint32_t |
73c04bcf | 549 | findConverter(const char *alias, UBool *containsOption, UErrorCode *pErrorCode) { |
b75a7d8f | 550 | uint32_t mid, start, limit; |
374ca955 | 551 | uint32_t lastMid; |
b75a7d8f | 552 | int result; |
73c04bcf A |
553 | int isUnnormalized = (gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED); |
554 | char strippedName[UCNV_MAX_CONVERTER_NAME_LENGTH]; | |
555 | ||
556 | if (!isUnnormalized) { | |
557 | if (uprv_strlen(alias) >= UCNV_MAX_CONVERTER_NAME_LENGTH) { | |
558 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
559 | return UINT32_MAX; | |
560 | } | |
561 | ||
562 | /* Lower case and remove ignoreable characters. */ | |
563 | ucnv_io_stripForCompare(strippedName, alias); | |
564 | alias = strippedName; | |
565 | } | |
b75a7d8f A |
566 | |
567 | /* do a binary search for the alias */ | |
568 | start = 0; | |
73c04bcf | 569 | limit = gMainTable.untaggedConvArraySize; |
b75a7d8f | 570 | mid = limit; |
374ca955 | 571 | lastMid = UINT32_MAX; |
b75a7d8f A |
572 | |
573 | for (;;) { | |
574 | mid = (uint32_t)((start + limit) / 2); | |
374ca955 A |
575 | if (lastMid == mid) { /* Have we moved? */ |
576 | break; /* We haven't moved, and it wasn't found. */ | |
577 | } | |
578 | lastMid = mid; | |
73c04bcf A |
579 | if (isUnnormalized) { |
580 | result = ucnv_compareNames(alias, GET_STRING(gMainTable.aliasList[mid])); | |
581 | } | |
582 | else { | |
583 | result = uprv_strcmp(alias, GET_NORMALIZED_STRING(gMainTable.aliasList[mid])); | |
584 | } | |
b75a7d8f A |
585 | |
586 | if (result < 0) { | |
587 | limit = mid; | |
588 | } else if (result > 0) { | |
589 | start = mid; | |
590 | } else { | |
591 | /* Since the gencnval tool folds duplicates into one entry, | |
592 | * this alias in gAliasList is unique, but different standards | |
593 | * may map an alias to different converters. | |
594 | */ | |
73c04bcf | 595 | if (gMainTable.untaggedConvArray[mid] & UCNV_AMBIGUOUS_ALIAS_MAP_BIT) { |
b75a7d8f A |
596 | *pErrorCode = U_AMBIGUOUS_ALIAS_WARNING; |
597 | } | |
73c04bcf A |
598 | /* State whether the canonical converter name contains an option. |
599 | This information is contained in this list in order to maintain backward & forward compatibility. */ | |
600 | if (containsOption) { | |
601 | UBool containsCnvOptionInfo = (UBool)gMainTable.optionTable->containsCnvOptionInfo; | |
602 | *containsOption = (UBool)((containsCnvOptionInfo | |
603 | && ((gMainTable.untaggedConvArray[mid] & UCNV_CONTAINS_OPTION_BIT) != 0)) | |
604 | || !containsCnvOptionInfo); | |
605 | } | |
606 | return gMainTable.untaggedConvArray[mid] & UCNV_CONVERTER_INDEX_MASK; | |
b75a7d8f A |
607 | } |
608 | } | |
609 | ||
610 | return UINT32_MAX; | |
611 | } | |
612 | ||
613 | /* | |
614 | * Is this alias in this list? | |
615 | * alias and listOffset should be non-NULL. | |
616 | */ | |
4388f060 | 617 | static inline UBool |
b75a7d8f A |
618 | isAliasInList(const char *alias, uint32_t listOffset) { |
619 | if (listOffset) { | |
620 | uint32_t currAlias; | |
73c04bcf | 621 | uint32_t listCount = gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f | 622 | /* +1 to skip listCount */ |
73c04bcf | 623 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; |
b75a7d8f A |
624 | for (currAlias = 0; currAlias < listCount; currAlias++) { |
625 | if (currList[currAlias] | |
626 | && ucnv_compareNames(alias, GET_STRING(currList[currAlias]))==0) | |
627 | { | |
628 | return TRUE; | |
629 | } | |
630 | } | |
631 | } | |
632 | return FALSE; | |
633 | } | |
634 | ||
635 | /* | |
636 | * Search for an standard name of an alias (what is the default name | |
637 | * that this standard uses?) | |
638 | * return the listOffset for gTaggedAliasLists. If it's 0, | |
639 | * the it couldn't be found, but the parameters are valid. | |
640 | */ | |
641 | static uint32_t | |
642 | findTaggedAliasListsOffset(const char *alias, const char *standard, UErrorCode *pErrorCode) { | |
643 | uint32_t idx; | |
644 | uint32_t listOffset; | |
645 | uint32_t convNum; | |
646 | UErrorCode myErr = U_ZERO_ERROR; | |
647 | uint32_t tagNum = getTagNumber(standard); | |
648 | ||
649 | /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ | |
73c04bcf | 650 | convNum = findConverter(alias, NULL, &myErr); |
b75a7d8f A |
651 | if (myErr != U_ZERO_ERROR) { |
652 | *pErrorCode = myErr; | |
653 | } | |
654 | ||
73c04bcf A |
655 | if (tagNum < (gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) && convNum < gMainTable.converterListSize) { |
656 | listOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + convNum]; | |
657 | if (listOffset && gMainTable.taggedAliasLists[listOffset + 1]) { | |
b75a7d8f A |
658 | return listOffset; |
659 | } | |
660 | if (myErr == U_AMBIGUOUS_ALIAS_WARNING) { | |
661 | /* Uh Oh! They used an ambiguous alias. | |
662 | We have to search the whole swiss cheese starting | |
663 | at the highest standard affinity. | |
664 | This may take a while. | |
665 | */ | |
73c04bcf A |
666 | for (idx = 0; idx < gMainTable.taggedAliasArraySize; idx++) { |
667 | listOffset = gMainTable.taggedAliasArray[idx]; | |
b75a7d8f | 668 | if (listOffset && isAliasInList(alias, listOffset)) { |
73c04bcf A |
669 | uint32_t currTagNum = idx/gMainTable.converterListSize; |
670 | uint32_t currConvNum = (idx - currTagNum*gMainTable.converterListSize); | |
671 | uint32_t tempListOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + currConvNum]; | |
672 | if (tempListOffset && gMainTable.taggedAliasLists[tempListOffset + 1]) { | |
b75a7d8f A |
673 | return tempListOffset; |
674 | } | |
675 | /* else keep on looking */ | |
676 | /* We could speed this up by starting on the next row | |
677 | because an alias is unique per row, right now. | |
678 | This would change if alias versioning appears. */ | |
679 | } | |
680 | } | |
681 | /* The standard doesn't know about the alias */ | |
682 | } | |
683 | /* else no default name */ | |
684 | return 0; | |
685 | } | |
686 | /* else converter or tag not found */ | |
687 | ||
688 | return UINT32_MAX; | |
689 | } | |
690 | ||
691 | /* Return the canonical name */ | |
692 | static uint32_t | |
693 | findTaggedConverterNum(const char *alias, const char *standard, UErrorCode *pErrorCode) { | |
694 | uint32_t idx; | |
695 | uint32_t listOffset; | |
696 | uint32_t convNum; | |
697 | UErrorCode myErr = U_ZERO_ERROR; | |
698 | uint32_t tagNum = getTagNumber(standard); | |
699 | ||
700 | /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ | |
73c04bcf | 701 | convNum = findConverter(alias, NULL, &myErr); |
b75a7d8f A |
702 | if (myErr != U_ZERO_ERROR) { |
703 | *pErrorCode = myErr; | |
704 | } | |
705 | ||
73c04bcf A |
706 | if (tagNum < (gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) && convNum < gMainTable.converterListSize) { |
707 | listOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + convNum]; | |
b75a7d8f A |
708 | if (listOffset && isAliasInList(alias, listOffset)) { |
709 | return convNum; | |
710 | } | |
711 | if (myErr == U_AMBIGUOUS_ALIAS_WARNING) { | |
712 | /* Uh Oh! They used an ambiguous alias. | |
713 | We have to search one slice of the swiss cheese. | |
714 | We search only in the requested tag, not the whole thing. | |
715 | This may take a while. | |
716 | */ | |
73c04bcf A |
717 | uint32_t convStart = (tagNum)*gMainTable.converterListSize; |
718 | uint32_t convLimit = (tagNum+1)*gMainTable.converterListSize; | |
b75a7d8f | 719 | for (idx = convStart; idx < convLimit; idx++) { |
73c04bcf | 720 | listOffset = gMainTable.taggedAliasArray[idx]; |
b75a7d8f A |
721 | if (listOffset && isAliasInList(alias, listOffset)) { |
722 | return idx-convStart; | |
723 | } | |
724 | } | |
725 | /* The standard doesn't know about the alias */ | |
726 | } | |
727 | /* else no canonical name */ | |
728 | } | |
729 | /* else converter or tag not found */ | |
730 | ||
731 | return UINT32_MAX; | |
732 | } | |
733 | ||
734 | ||
735 | ||
736 | U_CFUNC const char * | |
73c04bcf | 737 | ucnv_io_getConverterName(const char *alias, UBool *containsOption, UErrorCode *pErrorCode) { |
4388f060 A |
738 | const char *aliasTmp = alias; |
739 | int32_t i = 0; | |
740 | for (i = 0; i < 2; i++) { | |
741 | if (i == 1) { | |
742 | /* | |
743 | * After the first unsuccess converter lookup, check to see if | |
744 | * the name begins with 'x-'. If it does, strip it off and try | |
745 | * again. This behaviour is similar to how ICU4J does it. | |
746 | */ | |
2ca993e8 | 747 | if (aliasTmp[0] == 'x' && aliasTmp[1] == '-') { |
4388f060 A |
748 | aliasTmp = aliasTmp+2; |
749 | } else { | |
750 | break; | |
751 | } | |
752 | } | |
753 | if(haveAliasData(pErrorCode) && isAlias(aliasTmp, pErrorCode)) { | |
754 | uint32_t convNum = findConverter(aliasTmp, containsOption, pErrorCode); | |
755 | if (convNum < gMainTable.converterListSize) { | |
756 | return GET_STRING(gMainTable.converterList[convNum]); | |
757 | } | |
758 | /* else converter not found */ | |
759 | } else { | |
760 | break; | |
b75a7d8f | 761 | } |
b75a7d8f | 762 | } |
4388f060 | 763 | |
b75a7d8f A |
764 | return NULL; |
765 | } | |
766 | ||
767 | static int32_t U_CALLCONV | |
4388f060 | 768 | ucnv_io_countStandardAliases(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) { |
b75a7d8f A |
769 | int32_t value = 0; |
770 | UAliasContext *myContext = (UAliasContext *)(enumerator->context); | |
771 | uint32_t listOffset = myContext->listOffset; | |
772 | ||
773 | if (listOffset) { | |
73c04bcf | 774 | value = gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f A |
775 | } |
776 | return value; | |
777 | } | |
778 | ||
779 | static const char* U_CALLCONV | |
780 | ucnv_io_nextStandardAliases(UEnumeration *enumerator, | |
781 | int32_t* resultLength, | |
4388f060 | 782 | UErrorCode * /*pErrorCode*/) |
b75a7d8f A |
783 | { |
784 | UAliasContext *myContext = (UAliasContext *)(enumerator->context); | |
785 | uint32_t listOffset = myContext->listOffset; | |
786 | ||
787 | if (listOffset) { | |
73c04bcf A |
788 | uint32_t listCount = gMainTable.taggedAliasLists[listOffset]; |
789 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; | |
b75a7d8f A |
790 | |
791 | if (myContext->listIdx < listCount) { | |
792 | const char *myStr = GET_STRING(currList[myContext->listIdx++]); | |
793 | if (resultLength) { | |
374ca955 | 794 | *resultLength = (int32_t)uprv_strlen(myStr); |
b75a7d8f A |
795 | } |
796 | return myStr; | |
797 | } | |
798 | } | |
799 | /* Either we accessed a zero length list, or we enumerated too far. */ | |
73c04bcf A |
800 | if (resultLength) { |
801 | *resultLength = 0; | |
802 | } | |
b75a7d8f A |
803 | return NULL; |
804 | } | |
805 | ||
806 | static void U_CALLCONV | |
4388f060 | 807 | ucnv_io_resetStandardAliases(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) { |
b75a7d8f A |
808 | ((UAliasContext *)(enumerator->context))->listIdx = 0; |
809 | } | |
810 | ||
811 | static void U_CALLCONV | |
812 | ucnv_io_closeUEnumeration(UEnumeration *enumerator) { | |
813 | uprv_free(enumerator->context); | |
814 | uprv_free(enumerator); | |
815 | } | |
816 | ||
817 | /* Enumerate the aliases for the specified converter and standard tag */ | |
818 | static const UEnumeration gEnumAliases = { | |
819 | NULL, | |
820 | NULL, | |
821 | ucnv_io_closeUEnumeration, | |
822 | ucnv_io_countStandardAliases, | |
823 | uenum_unextDefault, | |
824 | ucnv_io_nextStandardAliases, | |
825 | ucnv_io_resetStandardAliases | |
826 | }; | |
827 | ||
828 | U_CAPI UEnumeration * U_EXPORT2 | |
829 | ucnv_openStandardNames(const char *convName, | |
830 | const char *standard, | |
831 | UErrorCode *pErrorCode) | |
832 | { | |
833 | UEnumeration *myEnum = NULL; | |
834 | if (haveAliasData(pErrorCode) && isAlias(convName, pErrorCode)) { | |
835 | uint32_t listOffset = findTaggedAliasListsOffset(convName, standard, pErrorCode); | |
836 | ||
837 | /* When listOffset == 0, we want to acknowledge that the | |
838 | converter name and standard are okay, but there | |
839 | is nothing to enumerate. */ | |
73c04bcf | 840 | if (listOffset < gMainTable.taggedAliasListsSize) { |
b75a7d8f A |
841 | UAliasContext *myContext; |
842 | ||
51004dcb | 843 | myEnum = static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))); |
b75a7d8f A |
844 | if (myEnum == NULL) { |
845 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; | |
846 | return NULL; | |
847 | } | |
848 | uprv_memcpy(myEnum, &gEnumAliases, sizeof(UEnumeration)); | |
51004dcb | 849 | myContext = static_cast<UAliasContext *>(uprv_malloc(sizeof(UAliasContext))); |
b75a7d8f A |
850 | if (myContext == NULL) { |
851 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; | |
852 | uprv_free(myEnum); | |
853 | return NULL; | |
854 | } | |
855 | myContext->listOffset = listOffset; | |
856 | myContext->listIdx = 0; | |
857 | myEnum->context = myContext; | |
858 | } | |
859 | /* else converter or tag not found */ | |
860 | } | |
861 | return myEnum; | |
862 | } | |
863 | ||
73c04bcf | 864 | static uint16_t |
b75a7d8f A |
865 | ucnv_io_countAliases(const char *alias, UErrorCode *pErrorCode) { |
866 | if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
73c04bcf A |
867 | uint32_t convNum = findConverter(alias, NULL, pErrorCode); |
868 | if (convNum < gMainTable.converterListSize) { | |
b75a7d8f | 869 | /* tagListNum - 1 is the ALL tag */ |
73c04bcf | 870 | int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum]; |
b75a7d8f A |
871 | |
872 | if (listOffset) { | |
73c04bcf | 873 | return gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f A |
874 | } |
875 | /* else this shouldn't happen. internal program error */ | |
876 | } | |
877 | /* else converter not found */ | |
878 | } | |
879 | return 0; | |
880 | } | |
881 | ||
73c04bcf | 882 | static uint16_t |
b75a7d8f A |
883 | ucnv_io_getAliases(const char *alias, uint16_t start, const char **aliases, UErrorCode *pErrorCode) { |
884 | if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
885 | uint32_t currAlias; | |
73c04bcf A |
886 | uint32_t convNum = findConverter(alias, NULL, pErrorCode); |
887 | if (convNum < gMainTable.converterListSize) { | |
b75a7d8f | 888 | /* tagListNum - 1 is the ALL tag */ |
73c04bcf | 889 | int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum]; |
b75a7d8f A |
890 | |
891 | if (listOffset) { | |
73c04bcf | 892 | uint32_t listCount = gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f | 893 | /* +1 to skip listCount */ |
73c04bcf | 894 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; |
b75a7d8f A |
895 | |
896 | for (currAlias = start; currAlias < listCount; currAlias++) { | |
897 | aliases[currAlias] = GET_STRING(currList[currAlias]); | |
898 | } | |
899 | } | |
900 | /* else this shouldn't happen. internal program error */ | |
901 | } | |
902 | /* else converter not found */ | |
903 | } | |
904 | return 0; | |
905 | } | |
906 | ||
73c04bcf | 907 | static const char * |
b75a7d8f A |
908 | ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { |
909 | if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
73c04bcf A |
910 | uint32_t convNum = findConverter(alias, NULL, pErrorCode); |
911 | if (convNum < gMainTable.converterListSize) { | |
b75a7d8f | 912 | /* tagListNum - 1 is the ALL tag */ |
73c04bcf | 913 | int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum]; |
b75a7d8f A |
914 | |
915 | if (listOffset) { | |
73c04bcf | 916 | uint32_t listCount = gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f | 917 | /* +1 to skip listCount */ |
73c04bcf | 918 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; |
b75a7d8f A |
919 | |
920 | if (n < listCount) { | |
921 | return GET_STRING(currList[n]); | |
922 | } | |
923 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
924 | } | |
925 | /* else this shouldn't happen. internal program error */ | |
926 | } | |
927 | /* else converter not found */ | |
928 | } | |
929 | return NULL; | |
930 | } | |
931 | ||
73c04bcf | 932 | static uint16_t |
b75a7d8f A |
933 | ucnv_io_countStandards(UErrorCode *pErrorCode) { |
934 | if (haveAliasData(pErrorCode)) { | |
935 | /* Don't include the empty list */ | |
73c04bcf | 936 | return (uint16_t)(gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS); |
b75a7d8f A |
937 | } |
938 | ||
939 | return 0; | |
940 | } | |
941 | ||
942 | U_CAPI const char * U_EXPORT2 | |
943 | ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { | |
944 | if (haveAliasData(pErrorCode)) { | |
73c04bcf A |
945 | if (n < gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) { |
946 | return GET_STRING(gMainTable.tagList[n]); | |
b75a7d8f A |
947 | } |
948 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
949 | } | |
950 | ||
951 | return NULL; | |
952 | } | |
953 | ||
954 | U_CAPI const char * U_EXPORT2 | |
955 | ucnv_getStandardName(const char *alias, const char *standard, UErrorCode *pErrorCode) { | |
956 | if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
957 | uint32_t listOffset = findTaggedAliasListsOffset(alias, standard, pErrorCode); | |
958 | ||
73c04bcf A |
959 | if (0 < listOffset && listOffset < gMainTable.taggedAliasListsSize) { |
960 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; | |
b75a7d8f A |
961 | |
962 | /* Get the preferred name from this list */ | |
963 | if (currList[0]) { | |
964 | return GET_STRING(currList[0]); | |
965 | } | |
966 | /* else someone screwed up the alias table. */ | |
967 | /* *pErrorCode = U_INVALID_FORMAT_ERROR */ | |
968 | } | |
969 | } | |
970 | ||
971 | return NULL; | |
972 | } | |
973 | ||
73c04bcf A |
974 | U_CAPI uint16_t U_EXPORT2 |
975 | ucnv_countAliases(const char *alias, UErrorCode *pErrorCode) | |
976 | { | |
977 | return ucnv_io_countAliases(alias, pErrorCode); | |
978 | } | |
b75a7d8f | 979 | |
b75a7d8f | 980 | |
73c04bcf A |
981 | U_CAPI const char* U_EXPORT2 |
982 | ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) | |
983 | { | |
984 | return ucnv_io_getAlias(alias, n, pErrorCode); | |
b75a7d8f A |
985 | } |
986 | ||
73c04bcf A |
987 | U_CAPI void U_EXPORT2 |
988 | ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) | |
989 | { | |
990 | ucnv_io_getAliases(alias, 0, aliases, pErrorCode); | |
b75a7d8f A |
991 | } |
992 | ||
73c04bcf A |
993 | U_CAPI uint16_t U_EXPORT2 |
994 | ucnv_countStandards(void) | |
995 | { | |
996 | UErrorCode err = U_ZERO_ERROR; | |
997 | return ucnv_io_countStandards(&err); | |
b75a7d8f A |
998 | } |
999 | ||
73c04bcf A |
1000 | U_CAPI const char * U_EXPORT2 |
1001 | ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErrorCode) { | |
1002 | if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
1003 | uint32_t convNum = findTaggedConverterNum(alias, standard, pErrorCode); | |
b75a7d8f | 1004 | |
73c04bcf A |
1005 | if (convNum < gMainTable.converterListSize) { |
1006 | return GET_STRING(gMainTable.converterList[convNum]); | |
b75a7d8f | 1007 | } |
b75a7d8f | 1008 | } |
73c04bcf | 1009 | |
b75a7d8f A |
1010 | return NULL; |
1011 | } | |
1012 | ||
1013 | static int32_t U_CALLCONV | |
4388f060 | 1014 | ucnv_io_countAllConverters(UEnumeration * /*enumerator*/, UErrorCode * /*pErrorCode*/) { |
73c04bcf | 1015 | return gMainTable.converterListSize; |
b75a7d8f A |
1016 | } |
1017 | ||
1018 | static const char* U_CALLCONV | |
1019 | ucnv_io_nextAllConverters(UEnumeration *enumerator, | |
1020 | int32_t* resultLength, | |
4388f060 | 1021 | UErrorCode * /*pErrorCode*/) |
b75a7d8f A |
1022 | { |
1023 | uint16_t *myContext = (uint16_t *)(enumerator->context); | |
1024 | ||
73c04bcf A |
1025 | if (*myContext < gMainTable.converterListSize) { |
1026 | const char *myStr = GET_STRING(gMainTable.converterList[(*myContext)++]); | |
b75a7d8f | 1027 | if (resultLength) { |
374ca955 | 1028 | *resultLength = (int32_t)uprv_strlen(myStr); |
b75a7d8f A |
1029 | } |
1030 | return myStr; | |
1031 | } | |
1032 | /* Either we accessed a zero length list, or we enumerated too far. */ | |
73c04bcf A |
1033 | if (resultLength) { |
1034 | *resultLength = 0; | |
1035 | } | |
b75a7d8f A |
1036 | return NULL; |
1037 | } | |
1038 | ||
1039 | static void U_CALLCONV | |
4388f060 | 1040 | ucnv_io_resetAllConverters(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) { |
b75a7d8f A |
1041 | *((uint16_t *)(enumerator->context)) = 0; |
1042 | } | |
1043 | ||
1044 | static const UEnumeration gEnumAllConverters = { | |
1045 | NULL, | |
1046 | NULL, | |
1047 | ucnv_io_closeUEnumeration, | |
1048 | ucnv_io_countAllConverters, | |
1049 | uenum_unextDefault, | |
1050 | ucnv_io_nextAllConverters, | |
1051 | ucnv_io_resetAllConverters | |
1052 | }; | |
1053 | ||
1054 | U_CAPI UEnumeration * U_EXPORT2 | |
1055 | ucnv_openAllNames(UErrorCode *pErrorCode) { | |
1056 | UEnumeration *myEnum = NULL; | |
1057 | if (haveAliasData(pErrorCode)) { | |
1058 | uint16_t *myContext; | |
1059 | ||
51004dcb | 1060 | myEnum = static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))); |
b75a7d8f A |
1061 | if (myEnum == NULL) { |
1062 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; | |
1063 | return NULL; | |
1064 | } | |
1065 | uprv_memcpy(myEnum, &gEnumAllConverters, sizeof(UEnumeration)); | |
51004dcb | 1066 | myContext = static_cast<uint16_t *>(uprv_malloc(sizeof(uint16_t))); |
b75a7d8f A |
1067 | if (myContext == NULL) { |
1068 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; | |
1069 | uprv_free(myEnum); | |
1070 | return NULL; | |
1071 | } | |
1072 | *myContext = 0; | |
1073 | myEnum->context = myContext; | |
1074 | } | |
1075 | return myEnum; | |
1076 | } | |
1077 | ||
1078 | U_CFUNC uint16_t | |
46f4442e | 1079 | ucnv_io_countKnownConverters(UErrorCode *pErrorCode) { |
b75a7d8f | 1080 | if (haveAliasData(pErrorCode)) { |
46f4442e | 1081 | return (uint16_t)gMainTable.converterListSize; |
b75a7d8f A |
1082 | } |
1083 | return 0; | |
1084 | } | |
1085 | ||
374ca955 A |
1086 | /* alias table swapping ----------------------------------------------------- */ |
1087 | ||
1088 | typedef char * U_CALLCONV StripForCompareFn(char *dst, const char *name); | |
1089 | ||
1090 | /* | |
1091 | * row of a temporary array | |
1092 | * | |
1093 | * gets platform-endian charset string indexes and sorting indexes; | |
1094 | * after sorting this array by strings, the actual arrays are permutated | |
1095 | * according to the sorting indexes | |
1096 | */ | |
1097 | typedef struct TempRow { | |
1098 | uint16_t strIndex, sortIndex; | |
1099 | } TempRow; | |
1100 | ||
1101 | typedef struct TempAliasTable { | |
1102 | const char *chars; | |
1103 | TempRow *rows; | |
1104 | uint16_t *resort; | |
1105 | StripForCompareFn *stripForCompare; | |
1106 | } TempAliasTable; | |
1107 | ||
1108 | enum { | |
1109 | STACK_ROW_CAPACITY=500 | |
1110 | }; | |
1111 | ||
1112 | static int32_t | |
1113 | io_compareRows(const void *context, const void *left, const void *right) { | |
1114 | char strippedLeft[UCNV_MAX_CONVERTER_NAME_LENGTH], | |
1115 | strippedRight[UCNV_MAX_CONVERTER_NAME_LENGTH]; | |
1116 | ||
1117 | TempAliasTable *tempTable=(TempAliasTable *)context; | |
1118 | const char *chars=tempTable->chars; | |
1119 | ||
1120 | return (int32_t)uprv_strcmp(tempTable->stripForCompare(strippedLeft, chars+2*((const TempRow *)left)->strIndex), | |
1121 | tempTable->stripForCompare(strippedRight, chars+2*((const TempRow *)right)->strIndex)); | |
1122 | } | |
1123 | ||
1124 | U_CAPI int32_t U_EXPORT2 | |
1125 | ucnv_swapAliases(const UDataSwapper *ds, | |
1126 | const void *inData, int32_t length, void *outData, | |
1127 | UErrorCode *pErrorCode) { | |
1128 | const UDataInfo *pInfo; | |
1129 | int32_t headerSize; | |
1130 | ||
1131 | const uint16_t *inTable; | |
46f4442e | 1132 | const uint32_t *inSectionSizes; |
374ca955 A |
1133 | uint32_t toc[offsetsCount]; |
1134 | uint32_t offsets[offsetsCount]; /* 16-bit-addressed offsets from inTable/outTable */ | |
1135 | uint32_t i, count, tocLength, topOffset; | |
1136 | ||
1137 | TempRow rows[STACK_ROW_CAPACITY]; | |
1138 | uint16_t resort[STACK_ROW_CAPACITY]; | |
1139 | TempAliasTable tempTable; | |
1140 | ||
1141 | /* udata_swapDataHeader checks the arguments */ | |
1142 | headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); | |
1143 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
1144 | return 0; | |
1145 | } | |
1146 | ||
1147 | /* check data format and format version */ | |
1148 | pInfo=(const UDataInfo *)((const char *)inData+4); | |
1149 | if(!( | |
1150 | pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */ | |
1151 | pInfo->dataFormat[1]==0x76 && | |
1152 | pInfo->dataFormat[2]==0x41 && | |
1153 | pInfo->dataFormat[3]==0x6c && | |
1154 | pInfo->formatVersion[0]==3 | |
1155 | )) { | |
1156 | udata_printError(ds, "ucnv_swapAliases(): data format %02x.%02x.%02x.%02x (format version %02x) is not an alias table\n", | |
1157 | pInfo->dataFormat[0], pInfo->dataFormat[1], | |
1158 | pInfo->dataFormat[2], pInfo->dataFormat[3], | |
1159 | pInfo->formatVersion[0]); | |
1160 | *pErrorCode=U_UNSUPPORTED_ERROR; | |
1161 | return 0; | |
1162 | } | |
1163 | ||
1164 | /* an alias table must contain at least the table of contents array */ | |
1165 | if(length>=0 && (length-headerSize)<4*(1+minTocLength)) { | |
1166 | udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n", | |
1167 | length-headerSize); | |
1168 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
1169 | return 0; | |
1170 | } | |
1171 | ||
46f4442e A |
1172 | inSectionSizes=(const uint32_t *)((const char *)inData+headerSize); |
1173 | inTable=(const uint16_t *)inSectionSizes; | |
73c04bcf | 1174 | uprv_memset(toc, 0, sizeof(toc)); |
46f4442e | 1175 | toc[tocLengthIndex]=tocLength=ds->readUInt32(inSectionSizes[tocLengthIndex]); |
73c04bcf A |
1176 | if(tocLength<minTocLength || offsetsCount<=tocLength) { |
1177 | udata_printError(ds, "ucnv_swapAliases(): table of contents contains unsupported number of sections (%u sections)\n", tocLength); | |
374ca955 A |
1178 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
1179 | return 0; | |
1180 | } | |
1181 | ||
1182 | /* read the known part of the table of contents */ | |
73c04bcf | 1183 | for(i=converterListIndex; i<=tocLength; ++i) { |
46f4442e | 1184 | toc[i]=ds->readUInt32(inSectionSizes[i]); |
374ca955 A |
1185 | } |
1186 | ||
1187 | /* compute offsets */ | |
73c04bcf | 1188 | uprv_memset(offsets, 0, sizeof(offsets)); |
374ca955 | 1189 | offsets[converterListIndex]=2*(1+tocLength); /* count two 16-bit units per toc entry */ |
73c04bcf | 1190 | for(i=tagListIndex; i<=tocLength; ++i) { |
374ca955 A |
1191 | offsets[i]=offsets[i-1]+toc[i-1]; |
1192 | } | |
1193 | ||
1194 | /* compute the overall size of the after-header data, in numbers of 16-bit units */ | |
1195 | topOffset=offsets[i-1]+toc[i-1]; | |
1196 | ||
1197 | if(length>=0) { | |
1198 | uint16_t *outTable; | |
1199 | const uint16_t *p, *p2; | |
1200 | uint16_t *q, *q2; | |
1201 | uint16_t oldIndex; | |
1202 | ||
1203 | if((length-headerSize)<(2*(int32_t)topOffset)) { | |
1204 | udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n", | |
1205 | length-headerSize); | |
1206 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
1207 | return 0; | |
1208 | } | |
1209 | ||
1210 | outTable=(uint16_t *)((char *)outData+headerSize); | |
1211 | ||
1212 | /* swap the entire table of contents */ | |
1213 | ds->swapArray32(ds, inTable, 4*(1+tocLength), outTable, pErrorCode); | |
1214 | ||
73c04bcf A |
1215 | /* swap unormalized strings & normalized strings */ |
1216 | ds->swapInvChars(ds, inTable+offsets[stringTableIndex], 2*(int32_t)(toc[stringTableIndex]+toc[normalizedStringTableIndex]), | |
374ca955 A |
1217 | outTable+offsets[stringTableIndex], pErrorCode); |
1218 | if(U_FAILURE(*pErrorCode)) { | |
73c04bcf | 1219 | udata_printError(ds, "ucnv_swapAliases().swapInvChars(charset names) failed\n"); |
374ca955 A |
1220 | return 0; |
1221 | } | |
1222 | ||
1223 | if(ds->inCharset==ds->outCharset) { | |
1224 | /* no need to sort, just swap all 16-bit values together */ | |
1225 | ds->swapArray16(ds, | |
1226 | inTable+offsets[converterListIndex], | |
1227 | 2*(int32_t)(offsets[stringTableIndex]-offsets[converterListIndex]), | |
1228 | outTable+offsets[converterListIndex], | |
1229 | pErrorCode); | |
1230 | } else { | |
1231 | /* allocate the temporary table for sorting */ | |
1232 | count=toc[aliasListIndex]; | |
1233 | ||
1234 | tempTable.chars=(const char *)(outTable+offsets[stringTableIndex]); /* sort by outCharset */ | |
1235 | ||
1236 | if(count<=STACK_ROW_CAPACITY) { | |
1237 | tempTable.rows=rows; | |
1238 | tempTable.resort=resort; | |
1239 | } else { | |
1240 | tempTable.rows=(TempRow *)uprv_malloc(count*sizeof(TempRow)+count*2); | |
1241 | if(tempTable.rows==NULL) { | |
1242 | udata_printError(ds, "ucnv_swapAliases(): unable to allocate memory for sorting tables (max length: %u)\n", | |
1243 | count); | |
1244 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
1245 | return 0; | |
1246 | } | |
1247 | tempTable.resort=(uint16_t *)(tempTable.rows+count); | |
1248 | } | |
1249 | ||
1250 | if(ds->outCharset==U_ASCII_FAMILY) { | |
1251 | tempTable.stripForCompare=ucnv_io_stripASCIIForCompare; | |
1252 | } else /* U_EBCDIC_FAMILY */ { | |
1253 | tempTable.stripForCompare=ucnv_io_stripEBCDICForCompare; | |
1254 | } | |
1255 | ||
1256 | /* | |
1257 | * Sort unique aliases+mapped names. | |
1258 | * | |
1259 | * We need to sort the list again by outCharset strings because they | |
1260 | * sort differently for different charset families. | |
1261 | * First we set up a temporary table with the string indexes and | |
1262 | * sorting indexes and sort that. | |
1263 | * Then we permutate and copy/swap the actual values. | |
1264 | */ | |
1265 | p=inTable+offsets[aliasListIndex]; | |
1266 | q=outTable+offsets[aliasListIndex]; | |
1267 | ||
1268 | p2=inTable+offsets[untaggedConvArrayIndex]; | |
1269 | q2=outTable+offsets[untaggedConvArrayIndex]; | |
1270 | ||
1271 | for(i=0; i<count; ++i) { | |
1272 | tempTable.rows[i].strIndex=ds->readUInt16(p[i]); | |
1273 | tempTable.rows[i].sortIndex=(uint16_t)i; | |
1274 | } | |
1275 | ||
1276 | uprv_sortArray(tempTable.rows, (int32_t)count, sizeof(TempRow), | |
1277 | io_compareRows, &tempTable, | |
1278 | FALSE, pErrorCode); | |
1279 | ||
1280 | if(U_SUCCESS(*pErrorCode)) { | |
1281 | /* copy/swap/permutate items */ | |
1282 | if(p!=q) { | |
1283 | for(i=0; i<count; ++i) { | |
1284 | oldIndex=tempTable.rows[i].sortIndex; | |
1285 | ds->swapArray16(ds, p+oldIndex, 2, q+i, pErrorCode); | |
1286 | ds->swapArray16(ds, p2+oldIndex, 2, q2+i, pErrorCode); | |
1287 | } | |
1288 | } else { | |
1289 | /* | |
1290 | * If we swap in-place, then the permutation must use another | |
1291 | * temporary array (tempTable.resort) | |
1292 | * before the results are copied to the outBundle. | |
1293 | */ | |
1294 | uint16_t *r=tempTable.resort; | |
1295 | ||
1296 | for(i=0; i<count; ++i) { | |
1297 | oldIndex=tempTable.rows[i].sortIndex; | |
1298 | ds->swapArray16(ds, p+oldIndex, 2, r+i, pErrorCode); | |
1299 | } | |
a62d09fc | 1300 | uprv_memcpy(q, r, 2*(size_t)count); |
374ca955 A |
1301 | |
1302 | for(i=0; i<count; ++i) { | |
1303 | oldIndex=tempTable.rows[i].sortIndex; | |
1304 | ds->swapArray16(ds, p2+oldIndex, 2, r+i, pErrorCode); | |
1305 | } | |
a62d09fc | 1306 | uprv_memcpy(q2, r, 2*(size_t)count); |
374ca955 A |
1307 | } |
1308 | } | |
1309 | ||
1310 | if(tempTable.rows!=rows) { | |
1311 | uprv_free(tempTable.rows); | |
1312 | } | |
1313 | ||
1314 | if(U_FAILURE(*pErrorCode)) { | |
73c04bcf A |
1315 | udata_printError(ds, "ucnv_swapAliases().uprv_sortArray(%u items) failed\n", |
1316 | count); | |
374ca955 A |
1317 | return 0; |
1318 | } | |
1319 | ||
1320 | /* swap remaining 16-bit values */ | |
1321 | ds->swapArray16(ds, | |
1322 | inTable+offsets[converterListIndex], | |
1323 | 2*(int32_t)(offsets[aliasListIndex]-offsets[converterListIndex]), | |
1324 | outTable+offsets[converterListIndex], | |
1325 | pErrorCode); | |
1326 | ds->swapArray16(ds, | |
1327 | inTable+offsets[taggedAliasArrayIndex], | |
1328 | 2*(int32_t)(offsets[stringTableIndex]-offsets[taggedAliasArrayIndex]), | |
1329 | outTable+offsets[taggedAliasArrayIndex], | |
1330 | pErrorCode); | |
1331 | } | |
1332 | } | |
1333 | ||
1334 | return headerSize+2*(int32_t)topOffset; | |
1335 | } | |
1336 | ||
1337 | #endif | |
1338 | ||
57a6839d | 1339 | |
b75a7d8f A |
1340 | /* |
1341 | * Hey, Emacs, please set the following: | |
1342 | * | |
1343 | * Local Variables: | |
1344 | * indent-tabs-mode: nil | |
1345 | * End: | |
1346 | * | |
1347 | */ |