]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
73c04bcf | 4 | * Copyright (C) 1999-2006, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ****************************************************************************** | |
8 | * | |
9 | * | |
10 | * ucnv_io.c: | |
73c04bcf A |
11 | * initializes global variables and defines functions pertaining to converter |
12 | * name resolution aspect of the conversion code. | |
b75a7d8f A |
13 | * |
14 | * new implementation: | |
15 | * | |
16 | * created on: 1999nov22 | |
17 | * created by: Markus W. Scherer | |
18 | * | |
19 | * Use the binary cnvalias.icu (created from convrtrs.txt) to work | |
20 | * with aliases for converter names. | |
21 | * | |
22 | * Date Name Description | |
23 | * 11/22/1999 markus Created | |
24 | * 06/28/2002 grhoten Major overhaul of the converter alias design. | |
25 | * Now an alias can map to different converters | |
26 | * depending on the specified standard. | |
27 | ******************************************************************************* | |
28 | */ | |
29 | ||
30 | #include "unicode/utypes.h" | |
374ca955 A |
31 | |
32 | #if !UCONFIG_NO_CONVERSION | |
33 | ||
73c04bcf | 34 | #include "unicode/ucnv.h" |
b75a7d8f A |
35 | #include "unicode/udata.h" |
36 | ||
37 | #include "umutex.h" | |
374ca955 A |
38 | #include "uarrsort.h" |
39 | #include "udataswp.h" | |
b75a7d8f A |
40 | #include "cstring.h" |
41 | #include "cmemory.h" | |
42 | #include "ucnv_io.h" | |
43 | #include "uenumimp.h" | |
44 | #include "ucln_cmn.h" | |
45 | ||
46 | /* Format of cnvalias.icu ----------------------------------------------------- | |
47 | * | |
48 | * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt. | |
49 | * This binary form contains several tables. All indexes are to uint16_t | |
50 | * units, and not to the bytes (uint8_t units). Addressing everything on | |
51 | * 16-bit boundaries allows us to store more information with small index | |
52 | * numbers, which are also 16-bit in size. The majority of the table (except | |
53 | * the string table) are 16-bit numbers. | |
54 | * | |
55 | * First there is the size of the Table of Contents (TOC). The TOC | |
56 | * entries contain the size of each section. In order to find the offset | |
57 | * you just need to sum up the sizes of the preceding sections. | |
374ca955 A |
58 | * The TOC length and entries are an array of uint32_t values. |
59 | * The first section after the TOC starts immediately after the TOC. | |
b75a7d8f A |
60 | * |
61 | * 1) This section contains a list of converters. This list contains indexes | |
62 | * into the string table for the converter name. The index of this list is | |
63 | * also used by other sections, which are mentioned later on. | |
374ca955 | 64 | * This list is not sorted. |
b75a7d8f A |
65 | * |
66 | * 2) This section contains a list of tags. This list contains indexes | |
67 | * into the string table for the tag name. The index of this list is | |
68 | * also used by other sections, which are mentioned later on. | |
374ca955 | 69 | * This list is in priority order of standards. |
b75a7d8f A |
70 | * |
71 | * 3) This section contains a list of sorted unique aliases. This | |
72 | * list contains indexes into the string table for the alias name. The | |
73 | * index of this list is also used by other sections, like the 4th section. | |
74 | * The index for the 3rd and 4th section is used to get the | |
75 | * alias -> converter name mapping. Section 3 and 4 form a two column table. | |
73c04bcf A |
76 | * Some of the most significant bits of each index may contain other |
77 | * information (see findConverter for details). | |
b75a7d8f A |
78 | * |
79 | * 4) This section contains a list of mapped converter names. Consider this | |
80 | * as a table that maps the 3rd section to the 1st section. This list contains | |
81 | * indexes into the 1st section. The index of this list is the same index in | |
82 | * the 3rd section. There is also some extra information in the high bits of | |
83 | * each converter index in this table. Currently it's only used to say that | |
84 | * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK | |
85 | * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is | |
86 | * the predigested form of the 5th section so that an alias lookup can be fast. | |
374ca955 | 87 | * |
b75a7d8f A |
88 | * 5) This section contains a 2D array with indexes to the 6th section. This |
89 | * section is the full form of all alias mappings. The column index is the | |
90 | * index into the converter list (column header). The row index is the index | |
91 | * to tag list (row header). This 2D array is the top part of a 3D array. The | |
92 | * third dimension is in the 6th section. | |
93 | * | |
94 | * 6) This is a blob of variable length arrays. Each array starts with a size, | |
95 | * and is followed by indexes to alias names in the string table. This is | |
96 | * the third dimension to the section 5. No other section should be referencing | |
97 | * this section. | |
98 | * | |
73c04bcf A |
99 | * 7) Starting in ICU 3.6, this can be a UConverterAliasOptions struct. Its |
100 | * presence indicates that a section 9 exists. UConverterAliasOptions specifies | |
101 | * what type of string normalization is used among other potential things in the | |
102 | * future. | |
b75a7d8f A |
103 | * |
104 | * 8) This is the string table. All strings are indexed on an even address. | |
105 | * There are two reasons for this. First many chip architectures locate strings | |
106 | * faster on even address boundaries. Second, since all indexes are 16-bit | |
107 | * numbers, this string table can be 128KB in size instead of 64KB when we | |
108 | * only have strings starting on an even address. | |
109 | * | |
73c04bcf A |
110 | * 9) When present this is a set of prenormalized strings from section 8. This |
111 | * table contains normalized strings with the dashes and spaces stripped out, | |
112 | * and all strings lowercased. In the future, the options in section 7 may state | |
113 | * other types of normalization. | |
b75a7d8f A |
114 | * |
115 | * Here is the concept of section 5 and 6. It's a 3D cube. Each tag | |
116 | * has a unique alias among all converters. That same alias can | |
117 | * be mentioned in other standards on different converters, | |
118 | * but only one alias per tag can be unique. | |
119 | * | |
120 | * | |
121 | * Converter Names (Usually in TR22 form) | |
122 | * -------------------------------------------. | |
123 | * T / /| | |
124 | * a / / | | |
125 | * g / / | | |
126 | * s / / | | |
127 | * / / | | |
128 | * ------------------------------------------/ | | |
129 | * A | | | | |
130 | * l | | | | |
131 | * i | | / | |
132 | * a | | / | |
133 | * s | | / | |
134 | * e | | / | |
135 | * s | |/ | |
136 | * ------------------------------------------- | |
137 | * | |
138 | * | |
139 | * | |
140 | * Here is what it really looks like. It's like swiss cheese. | |
141 | * There are holes. Some converters aren't recognized by | |
142 | * a standard, or they are really old converters that the | |
143 | * standard doesn't recognize anymore. | |
144 | * | |
145 | * Converter Names (Usually in TR22 form) | |
146 | * -------------------------------------------. | |
147 | * T /##########################################/| | |
148 | * a / # # /# | |
374ca955 A |
149 | * g / # ## ## ### # ### ### ### #/ |
150 | * s / # ##### #### ## ## #/# | |
151 | * / ### # # ## # # # ### # # #/## | |
b75a7d8f A |
152 | * ------------------------------------------/# # |
153 | * A |### # # ## # # # ### # # #|# # | |
154 | * l |# # # # # ## # #|# # | |
155 | * i |# # # # # # #|# | |
156 | * a |# #|# | |
157 | * s | #|# | |
374ca955 A |
158 | * e |
159 | * s | |
160 | * | |
b75a7d8f A |
161 | */ |
162 | ||
/**
 * Iteration state stored in the context pointer of the UEnumeration
 * objects that walk one tagged alias list.
 */
typedef struct UAliasContext {
    uint32_t listOffset;    /* offset of the list in gMainTable.taggedAliasLists; 0 means "no list" */
    uint32_t listIdx;       /* index of the next alias to return from that list */
} UAliasContext;
170 | ||
171 | static const char DATA_NAME[] = "cnvalias"; | |
172 | static const char DATA_TYPE[] = "icu"; | |
173 | ||
174 | static UDataMemory *gAliasData=NULL; | |
175 | ||
374ca955 A |
/*
 * Positions inside the uint32_t table of contents that starts cnvalias.icu.
 * Entry 0 holds the TOC length; each following entry holds the size of the
 * section with the same number.
 */
enum {
    tocLengthIndex = 0,
    converterListIndex,             /* 1 */
    tagListIndex,                   /* 2 */
    aliasListIndex,                 /* 3 */
    untaggedConvArrayIndex,         /* 4 */
    taggedAliasArrayIndex,          /* 5 */
    taggedAliasListsIndex,          /* 6 */
    tableOptionsIndex,              /* 7 */
    stringTableIndex,               /* 8 */
    normalizedStringTableIndex,     /* 9 */
    offsetsCount,                   /* 10: length of the swapper's temporary offsets[] */
    minTocLength = 8                /* smallest tocLength in a file; does not count the tocLengthIndex entry itself */
};
190 | ||
73c04bcf A |
191 | static const UConverterAliasOptions defaultTableOptions = { |
192 | UCNV_IO_UNNORMALIZED, | |
193 | 0 /* containsCnvOptionInfo */ | |
194 | }; | |
195 | static UConverterAlias gMainTable; | |
b75a7d8f | 196 | |
73c04bcf A |
/*
 * Turn a string index taken from one of the tables into a pointer to the
 * string.  The index is added to the uint16_t-typed table pointer, so it
 * counts 16-bit units, not bytes.
 *
 * The whole expansion is parenthesized so that postfix operators applied to
 * the macro (e.g. GET_STRING(i)[0]) act on the resulting char pointer and
 * not on the operand of the cast.
 */
#define GET_STRING(idx) ((const char *)(gMainTable.stringTable + (idx)))
#define GET_NORMALIZED_STRING(idx) ((const char *)(gMainTable.normalizedStringTable + (idx)))
b75a7d8f A |
199 | |
200 | static UBool U_CALLCONV | |
201 | isAcceptable(void *context, | |
202 | const char *type, const char *name, | |
203 | const UDataInfo *pInfo) { | |
204 | return (UBool)( | |
205 | pInfo->size>=20 && | |
206 | pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
207 | pInfo->charsetFamily==U_CHARSET_FAMILY && | |
208 | pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */ | |
209 | pInfo->dataFormat[1]==0x76 && | |
210 | pInfo->dataFormat[2]==0x41 && | |
211 | pInfo->dataFormat[3]==0x6c && | |
212 | pInfo->formatVersion[0]==3); | |
213 | } | |
214 | ||
374ca955 A |
215 | static UBool U_CALLCONV ucnv_io_cleanup(void) |
216 | { | |
217 | if (gAliasData) { | |
218 | udata_close(gAliasData); | |
219 | gAliasData = NULL; | |
220 | } | |
221 | ||
73c04bcf | 222 | uprv_memset(&gMainTable, 0, sizeof(gMainTable)); |
374ca955 A |
223 | |
224 | return TRUE; /* Everything was cleaned up */ | |
225 | } | |
226 | ||
b75a7d8f A |
227 | static UBool |
228 | haveAliasData(UErrorCode *pErrorCode) { | |
73c04bcf | 229 | int needInit; |
b75a7d8f A |
230 | |
231 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
232 | return FALSE; | |
233 | } | |
234 | ||
73c04bcf | 235 | UMTX_CHECK(NULL, (gAliasData==NULL), needInit); |
b75a7d8f A |
236 | |
237 | /* load converter alias data from file if necessary */ | |
73c04bcf | 238 | if (needInit) { |
b75a7d8f A |
239 | UDataMemory *data = NULL; |
240 | const uint16_t *table = NULL; | |
241 | uint32_t tableStart; | |
242 | uint32_t currOffset; | |
b75a7d8f A |
243 | |
244 | data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); | |
245 | if(U_FAILURE(*pErrorCode)) { | |
246 | return FALSE; | |
247 | } | |
248 | ||
249 | table = (const uint16_t *)udata_getMemory(data); | |
250 | ||
251 | tableStart = ((const uint32_t *)(table))[0]; | |
374ca955 | 252 | if (tableStart < minTocLength) { |
b75a7d8f A |
253 | *pErrorCode = U_INVALID_FORMAT_ERROR; |
254 | udata_close(data); | |
255 | return FALSE; | |
256 | } | |
257 | ||
258 | umtx_lock(NULL); | |
259 | if(gAliasData==NULL) { | |
260 | gAliasData = data; | |
261 | data=NULL; | |
262 | ||
73c04bcf A |
263 | gMainTable.converterListSize = ((const uint32_t *)(table))[1]; |
264 | gMainTable.tagListSize = ((const uint32_t *)(table))[2]; | |
265 | gMainTable.aliasListSize = ((const uint32_t *)(table))[3]; | |
266 | gMainTable.untaggedConvArraySize = ((const uint32_t *)(table))[4]; | |
267 | gMainTable.taggedAliasArraySize = ((const uint32_t *)(table))[5]; | |
268 | gMainTable.taggedAliasListsSize = ((const uint32_t *)(table))[6]; | |
269 | gMainTable.optionTableSize = ((const uint32_t *)(table))[7]; | |
270 | gMainTable.stringTableSize = ((const uint32_t *)(table))[8]; | |
271 | ||
272 | if (((const uint32_t *)(table))[0] > 8) { | |
273 | gMainTable.normalizedStringTableSize = ((const uint32_t *)(table))[9]; | |
274 | } | |
b75a7d8f A |
275 | |
276 | currOffset = tableStart * (sizeof(uint32_t)/sizeof(uint16_t)) + (sizeof(uint32_t)/sizeof(uint16_t)); | |
73c04bcf | 277 | gMainTable.converterList = table + currOffset; |
b75a7d8f | 278 | |
73c04bcf A |
279 | currOffset += gMainTable.converterListSize; |
280 | gMainTable.tagList = table + currOffset; | |
b75a7d8f | 281 | |
73c04bcf A |
282 | currOffset += gMainTable.tagListSize; |
283 | gMainTable.aliasList = table + currOffset; | |
b75a7d8f | 284 | |
73c04bcf A |
285 | currOffset += gMainTable.aliasListSize; |
286 | gMainTable.untaggedConvArray = table + currOffset; | |
b75a7d8f | 287 | |
73c04bcf A |
288 | currOffset += gMainTable.untaggedConvArraySize; |
289 | gMainTable.taggedAliasArray = table + currOffset; | |
b75a7d8f A |
290 | |
291 | /* aliasLists is a 1's based array, but it has a padding character */ | |
73c04bcf A |
292 | currOffset += gMainTable.taggedAliasArraySize; |
293 | gMainTable.taggedAliasLists = table + currOffset; | |
b75a7d8f | 294 | |
73c04bcf A |
295 | currOffset += gMainTable.taggedAliasListsSize; |
296 | if (gMainTable.optionTableSize > 0 | |
297 | && ((const UConverterAliasOptions *)(table + currOffset))->stringNormalizationType < UCNV_IO_NORM_TYPE_COUNT) | |
298 | { | |
299 | /* Faster table */ | |
300 | gMainTable.optionTable = (const UConverterAliasOptions *)(table + currOffset); | |
301 | } | |
302 | else { | |
303 | /* Smaller table, or I can't handle this normalization mode! | |
304 | Use the original slower table lookup. */ | |
305 | gMainTable.optionTable = &defaultTableOptions; | |
306 | } | |
b75a7d8f | 307 | |
73c04bcf A |
308 | currOffset += gMainTable.optionTableSize; |
309 | gMainTable.stringTable = table + currOffset; | |
310 | ||
311 | currOffset += gMainTable.stringTableSize; | |
312 | gMainTable.normalizedStringTable = ((gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED) | |
313 | ? gMainTable.stringTable : (table + currOffset)); | |
b75a7d8f | 314 | |
374ca955 | 315 | ucln_common_registerCleanup(UCLN_COMMON_UCNV_IO, ucnv_io_cleanup); |
b75a7d8f A |
316 | } |
317 | umtx_unlock(NULL); | |
318 | ||
319 | /* if a different thread set it first, then close the extra data */ | |
320 | if(data!=NULL) { | |
321 | udata_close(data); /* NULL if it was set correctly */ | |
322 | } | |
323 | } | |
324 | ||
325 | return TRUE; | |
326 | } | |
327 | ||
328 | static U_INLINE UBool | |
329 | isAlias(const char *alias, UErrorCode *pErrorCode) { | |
330 | if(alias==NULL) { | |
331 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
332 | return FALSE; | |
b75a7d8f | 333 | } |
73c04bcf | 334 | return (UBool)(*alias!=0); |
b75a7d8f A |
335 | } |
336 | ||
b75a7d8f | 337 | static uint32_t getTagNumber(const char *tagname) { |
73c04bcf | 338 | if (gMainTable.tagList) { |
b75a7d8f | 339 | uint32_t tagNum; |
73c04bcf A |
340 | for (tagNum = 0; tagNum < gMainTable.tagListSize; tagNum++) { |
341 | if (!uprv_stricmp(GET_STRING(gMainTable.tagList[tagNum]), tagname)) { | |
b75a7d8f A |
342 | return tagNum; |
343 | } | |
344 | } | |
345 | } | |
346 | ||
347 | return UINT32_MAX; | |
348 | } | |
349 | ||
73c04bcf A |
/*
 * Character types relevant for ucnv_compareNames() and the strip functions.
 * A table value below MINLETTER is one of the first three types; any other
 * value is the lowercase form of the letter being classified.
 */
enum {
    IGNORE,
    ZERO,
    NONZERO,
    MINLETTER /* any values from here on are lowercase letter mappings */
};

/* character types for ASCII 00..7F (0 is IGNORE) */
static const uint8_t asciiTypes[128] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0,
    0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0,
    0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0
};

/*
 * Classify an ASCII-family char; bytes with the high bit set are IGNOREd.
 * The argument is parenthesized at every use so that an expression argument
 * (e.g. x >> 8) is cast as a whole before it indexes the table.
 * Note: the argument is evaluated more than once.
 */
#define GET_ASCII_TYPE(c) ((int8_t)(c) >= 0 ? asciiTypes[(uint8_t)(c)] : (uint8_t)IGNORE)

/* character types for EBCDIC 80..FF (0 is IGNORE) */
static const uint8_t ebcdicTypes[128] = {
    0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0, 0, 0, 0, 0, 0,
    0, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0, 0, 0, 0, 0, 0,
    0, 0, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0, 0, 0, 0, 0, 0,
    0, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0, 0, 0, 0, 0, 0,
    0, 0, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0, 0, 0, 0, 0, 0,
    ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0
};

/*
 * Classify an EBCDIC-family char; only bytes with the high bit set can be
 * letters or digits, everything else is IGNOREd.
 * Note: the argument is evaluated more than once.
 */
#define GET_EBCDIC_TYPE(c) ((int8_t)(c) < 0 ? ebcdicTypes[(c)&0x7f] : (uint8_t)IGNORE)

/* Classifier for the charset family this library is built for. */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
#   define GET_CHAR_TYPE(c) GET_ASCII_TYPE(c)
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#   define GET_CHAR_TYPE(c) GET_EBCDIC_TYPE(c)
#else
#   error U_CHARSET_FAMILY is not valid
#endif
393 | ||
b75a7d8f A |
394 | /* @see ucnv_compareNames */ |
395 | U_CFUNC char * U_EXPORT2 | |
374ca955 | 396 | ucnv_io_stripASCIIForCompare(char *dst, const char *name) { |
b75a7d8f | 397 | char *dstItr = dst; |
73c04bcf A |
398 | uint8_t type, nextType; |
399 | char c1; | |
400 | UBool afterDigit = FALSE; | |
401 | ||
402 | while ((c1 = *name++) != 0) { | |
403 | type = GET_ASCII_TYPE(c1); | |
404 | switch (type) { | |
405 | case IGNORE: | |
406 | afterDigit = FALSE; | |
407 | continue; /* ignore all but letters and digits */ | |
408 | case ZERO: | |
409 | if (!afterDigit) { | |
410 | nextType = GET_ASCII_TYPE(*name); | |
411 | if (nextType == ZERO || nextType == NONZERO) { | |
412 | continue; /* ignore leading zero before another digit */ | |
413 | } | |
414 | } | |
415 | break; | |
416 | case NONZERO: | |
417 | afterDigit = TRUE; | |
418 | break; | |
419 | default: | |
420 | c1 = (char)type; /* lowercased letter */ | |
421 | afterDigit = FALSE; | |
422 | break; | |
b75a7d8f | 423 | } |
73c04bcf | 424 | *dstItr++ = c1; |
374ca955 | 425 | } |
73c04bcf | 426 | *dstItr = 0; |
374ca955 A |
427 | return dst; |
428 | } | |
429 | ||
430 | U_CFUNC char * U_EXPORT2 | |
431 | ucnv_io_stripEBCDICForCompare(char *dst, const char *name) { | |
374ca955 | 432 | char *dstItr = dst; |
73c04bcf A |
433 | uint8_t type, nextType; |
434 | char c1; | |
435 | UBool afterDigit = FALSE; | |
436 | ||
437 | while ((c1 = *name++) != 0) { | |
438 | type = GET_EBCDIC_TYPE(c1); | |
439 | switch (type) { | |
440 | case IGNORE: | |
441 | afterDigit = FALSE; | |
442 | continue; /* ignore all but letters and digits */ | |
443 | case ZERO: | |
444 | if (!afterDigit) { | |
445 | nextType = GET_EBCDIC_TYPE(*name); | |
446 | if (nextType == ZERO || nextType == NONZERO) { | |
447 | continue; /* ignore leading zero before another digit */ | |
448 | } | |
449 | } | |
450 | break; | |
451 | case NONZERO: | |
452 | afterDigit = TRUE; | |
453 | break; | |
454 | default: | |
455 | c1 = (char)type; /* lowercased letter */ | |
456 | afterDigit = FALSE; | |
457 | break; | |
374ca955 | 458 | } |
73c04bcf | 459 | *dstItr++ = c1; |
b75a7d8f | 460 | } |
73c04bcf | 461 | *dstItr = 0; |
b75a7d8f A |
462 | return dst; |
463 | } | |
464 | ||
465 | /** | |
73c04bcf A |
466 | * Do a fuzzy compare of two converter/alias names. |
467 | * The comparison is case-insensitive, ignores leading zeroes if they are not | |
468 | * followed by further digits, and ignores all but letters and digits. | |
469 | * Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent. | |
470 | * See section 1.4, Charset Alias Matching in Unicode Technical Standard #22 | |
471 | * at http://www.unicode.org/reports/tr22/ | |
374ca955 | 472 | * |
b75a7d8f A |
473 | * This is a symmetrical (commutative) operation; order of arguments |
474 | * is insignificant. This is an important property for sorting the | |
475 | * list (when the list is preprocessed into binary form) and for | |
476 | * performing binary searches on it at run time. | |
374ca955 | 477 | * |
b75a7d8f A |
478 | * @param name1 a converter name or alias, zero-terminated |
479 | * @param name2 a converter name or alias, zero-terminated | |
480 | * @return 0 if the names match, or a negative value if the name1 | |
481 | * lexically precedes name2, or a positive value if the name1 | |
482 | * lexically follows name2. | |
483 | * | |
484 | * @see ucnv_io_stripForCompare | |
485 | */ | |
486 | U_CAPI int U_EXPORT2 | |
487 | ucnv_compareNames(const char *name1, const char *name2) { | |
488 | int rc; | |
73c04bcf | 489 | uint8_t type, nextType; |
b75a7d8f | 490 | char c1, c2; |
73c04bcf | 491 | UBool afterDigit1 = FALSE, afterDigit2 = FALSE; |
b75a7d8f A |
492 | |
493 | for (;;) { | |
73c04bcf A |
494 | while ((c1 = *name1++) != 0) { |
495 | type = GET_CHAR_TYPE(c1); | |
496 | switch (type) { | |
497 | case IGNORE: | |
498 | afterDigit1 = FALSE; | |
499 | continue; /* ignore all but letters and digits */ | |
500 | case ZERO: | |
501 | if (!afterDigit1) { | |
502 | nextType = GET_CHAR_TYPE(*name1); | |
503 | if (nextType == ZERO || nextType == NONZERO) { | |
504 | continue; /* ignore leading zero before another digit */ | |
505 | } | |
506 | } | |
507 | break; | |
508 | case NONZERO: | |
509 | afterDigit1 = TRUE; | |
510 | break; | |
511 | default: | |
512 | c1 = (char)type; /* lowercased letter */ | |
513 | afterDigit1 = FALSE; | |
514 | break; | |
515 | } | |
516 | break; /* deliver c1 */ | |
b75a7d8f | 517 | } |
73c04bcf A |
518 | while ((c2 = *name2++) != 0) { |
519 | type = GET_CHAR_TYPE(c2); | |
520 | switch (type) { | |
521 | case IGNORE: | |
522 | afterDigit2 = FALSE; | |
523 | continue; /* ignore all but letters and digits */ | |
524 | case ZERO: | |
525 | if (!afterDigit2) { | |
526 | nextType = GET_CHAR_TYPE(*name2); | |
527 | if (nextType == ZERO || nextType == NONZERO) { | |
528 | continue; /* ignore leading zero before another digit */ | |
529 | } | |
530 | } | |
531 | break; | |
532 | case NONZERO: | |
533 | afterDigit2 = TRUE; | |
534 | break; | |
535 | default: | |
536 | c2 = (char)type; /* lowercased letter */ | |
537 | afterDigit2 = FALSE; | |
538 | break; | |
539 | } | |
540 | break; /* deliver c2 */ | |
b75a7d8f A |
541 | } |
542 | ||
543 | /* If we reach the ends of both strings then they match */ | |
544 | if ((c1|c2)==0) { | |
545 | return 0; | |
546 | } | |
374ca955 | 547 | |
b75a7d8f | 548 | /* Case-insensitive comparison */ |
73c04bcf | 549 | rc = (int)(unsigned char)c1 - (int)(unsigned char)c2; |
b75a7d8f A |
550 | if (rc != 0) { |
551 | return rc; | |
552 | } | |
b75a7d8f A |
553 | } |
554 | } | |
555 | ||
556 | /* | |
557 | * search for an alias | |
558 | * return the converter number index for gConverterList | |
559 | */ | |
560 | static U_INLINE uint32_t | |
73c04bcf | 561 | findConverter(const char *alias, UBool *containsOption, UErrorCode *pErrorCode) { |
b75a7d8f | 562 | uint32_t mid, start, limit; |
374ca955 | 563 | uint32_t lastMid; |
b75a7d8f | 564 | int result; |
73c04bcf A |
565 | int isUnnormalized = (gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED); |
566 | char strippedName[UCNV_MAX_CONVERTER_NAME_LENGTH]; | |
567 | ||
568 | if (!isUnnormalized) { | |
569 | if (uprv_strlen(alias) >= UCNV_MAX_CONVERTER_NAME_LENGTH) { | |
570 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
571 | return UINT32_MAX; | |
572 | } | |
573 | ||
574 | /* Lower case and remove ignoreable characters. */ | |
575 | ucnv_io_stripForCompare(strippedName, alias); | |
576 | alias = strippedName; | |
577 | } | |
b75a7d8f A |
578 | |
579 | /* do a binary search for the alias */ | |
580 | start = 0; | |
73c04bcf | 581 | limit = gMainTable.untaggedConvArraySize; |
b75a7d8f | 582 | mid = limit; |
374ca955 | 583 | lastMid = UINT32_MAX; |
b75a7d8f A |
584 | |
585 | for (;;) { | |
586 | mid = (uint32_t)((start + limit) / 2); | |
374ca955 A |
587 | if (lastMid == mid) { /* Have we moved? */ |
588 | break; /* We haven't moved, and it wasn't found. */ | |
589 | } | |
590 | lastMid = mid; | |
73c04bcf A |
591 | if (isUnnormalized) { |
592 | result = ucnv_compareNames(alias, GET_STRING(gMainTable.aliasList[mid])); | |
593 | } | |
594 | else { | |
595 | result = uprv_strcmp(alias, GET_NORMALIZED_STRING(gMainTable.aliasList[mid])); | |
596 | } | |
b75a7d8f A |
597 | |
598 | if (result < 0) { | |
599 | limit = mid; | |
600 | } else if (result > 0) { | |
601 | start = mid; | |
602 | } else { | |
603 | /* Since the gencnval tool folds duplicates into one entry, | |
604 | * this alias in gAliasList is unique, but different standards | |
605 | * may map an alias to different converters. | |
606 | */ | |
73c04bcf | 607 | if (gMainTable.untaggedConvArray[mid] & UCNV_AMBIGUOUS_ALIAS_MAP_BIT) { |
b75a7d8f A |
608 | *pErrorCode = U_AMBIGUOUS_ALIAS_WARNING; |
609 | } | |
73c04bcf A |
610 | /* State whether the canonical converter name contains an option. |
611 | This information is contained in this list in order to maintain backward & forward compatibility. */ | |
612 | if (containsOption) { | |
613 | UBool containsCnvOptionInfo = (UBool)gMainTable.optionTable->containsCnvOptionInfo; | |
614 | *containsOption = (UBool)((containsCnvOptionInfo | |
615 | && ((gMainTable.untaggedConvArray[mid] & UCNV_CONTAINS_OPTION_BIT) != 0)) | |
616 | || !containsCnvOptionInfo); | |
617 | } | |
618 | return gMainTable.untaggedConvArray[mid] & UCNV_CONVERTER_INDEX_MASK; | |
b75a7d8f A |
619 | } |
620 | } | |
621 | ||
622 | return UINT32_MAX; | |
623 | } | |
624 | ||
625 | /* | |
626 | * Is this alias in this list? | |
627 | * alias and listOffset should be non-NULL. | |
628 | */ | |
374ca955 | 629 | static U_INLINE UBool |
b75a7d8f A |
630 | isAliasInList(const char *alias, uint32_t listOffset) { |
631 | if (listOffset) { | |
632 | uint32_t currAlias; | |
73c04bcf | 633 | uint32_t listCount = gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f | 634 | /* +1 to skip listCount */ |
73c04bcf | 635 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; |
b75a7d8f A |
636 | for (currAlias = 0; currAlias < listCount; currAlias++) { |
637 | if (currList[currAlias] | |
638 | && ucnv_compareNames(alias, GET_STRING(currList[currAlias]))==0) | |
639 | { | |
640 | return TRUE; | |
641 | } | |
642 | } | |
643 | } | |
644 | return FALSE; | |
645 | } | |
646 | ||
647 | /* | |
648 | * Search for an standard name of an alias (what is the default name | |
649 | * that this standard uses?) | |
650 | * return the listOffset for gTaggedAliasLists. If it's 0, | |
651 | * the it couldn't be found, but the parameters are valid. | |
652 | */ | |
653 | static uint32_t | |
654 | findTaggedAliasListsOffset(const char *alias, const char *standard, UErrorCode *pErrorCode) { | |
655 | uint32_t idx; | |
656 | uint32_t listOffset; | |
657 | uint32_t convNum; | |
658 | UErrorCode myErr = U_ZERO_ERROR; | |
659 | uint32_t tagNum = getTagNumber(standard); | |
660 | ||
661 | /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ | |
73c04bcf | 662 | convNum = findConverter(alias, NULL, &myErr); |
b75a7d8f A |
663 | if (myErr != U_ZERO_ERROR) { |
664 | *pErrorCode = myErr; | |
665 | } | |
666 | ||
73c04bcf A |
667 | if (tagNum < (gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) && convNum < gMainTable.converterListSize) { |
668 | listOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + convNum]; | |
669 | if (listOffset && gMainTable.taggedAliasLists[listOffset + 1]) { | |
b75a7d8f A |
670 | return listOffset; |
671 | } | |
672 | if (myErr == U_AMBIGUOUS_ALIAS_WARNING) { | |
673 | /* Uh Oh! They used an ambiguous alias. | |
674 | We have to search the whole swiss cheese starting | |
675 | at the highest standard affinity. | |
676 | This may take a while. | |
677 | */ | |
73c04bcf A |
678 | for (idx = 0; idx < gMainTable.taggedAliasArraySize; idx++) { |
679 | listOffset = gMainTable.taggedAliasArray[idx]; | |
b75a7d8f | 680 | if (listOffset && isAliasInList(alias, listOffset)) { |
73c04bcf A |
681 | uint32_t currTagNum = idx/gMainTable.converterListSize; |
682 | uint32_t currConvNum = (idx - currTagNum*gMainTable.converterListSize); | |
683 | uint32_t tempListOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + currConvNum]; | |
684 | if (tempListOffset && gMainTable.taggedAliasLists[tempListOffset + 1]) { | |
b75a7d8f A |
685 | return tempListOffset; |
686 | } | |
687 | /* else keep on looking */ | |
688 | /* We could speed this up by starting on the next row | |
689 | because an alias is unique per row, right now. | |
690 | This would change if alias versioning appears. */ | |
691 | } | |
692 | } | |
693 | /* The standard doesn't know about the alias */ | |
694 | } | |
695 | /* else no default name */ | |
696 | return 0; | |
697 | } | |
698 | /* else converter or tag not found */ | |
699 | ||
700 | return UINT32_MAX; | |
701 | } | |
702 | ||
703 | /* Return the canonical name */ | |
704 | static uint32_t | |
705 | findTaggedConverterNum(const char *alias, const char *standard, UErrorCode *pErrorCode) { | |
706 | uint32_t idx; | |
707 | uint32_t listOffset; | |
708 | uint32_t convNum; | |
709 | UErrorCode myErr = U_ZERO_ERROR; | |
710 | uint32_t tagNum = getTagNumber(standard); | |
711 | ||
712 | /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ | |
73c04bcf | 713 | convNum = findConverter(alias, NULL, &myErr); |
b75a7d8f A |
714 | if (myErr != U_ZERO_ERROR) { |
715 | *pErrorCode = myErr; | |
716 | } | |
717 | ||
73c04bcf A |
718 | if (tagNum < (gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) && convNum < gMainTable.converterListSize) { |
719 | listOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + convNum]; | |
b75a7d8f A |
720 | if (listOffset && isAliasInList(alias, listOffset)) { |
721 | return convNum; | |
722 | } | |
723 | if (myErr == U_AMBIGUOUS_ALIAS_WARNING) { | |
724 | /* Uh Oh! They used an ambiguous alias. | |
725 | We have to search one slice of the swiss cheese. | |
726 | We search only in the requested tag, not the whole thing. | |
727 | This may take a while. | |
728 | */ | |
73c04bcf A |
729 | uint32_t convStart = (tagNum)*gMainTable.converterListSize; |
730 | uint32_t convLimit = (tagNum+1)*gMainTable.converterListSize; | |
b75a7d8f | 731 | for (idx = convStart; idx < convLimit; idx++) { |
73c04bcf | 732 | listOffset = gMainTable.taggedAliasArray[idx]; |
b75a7d8f A |
733 | if (listOffset && isAliasInList(alias, listOffset)) { |
734 | return idx-convStart; | |
735 | } | |
736 | } | |
737 | /* The standard doesn't know about the alias */ | |
738 | } | |
739 | /* else no canonical name */ | |
740 | } | |
741 | /* else converter or tag not found */ | |
742 | ||
743 | return UINT32_MAX; | |
744 | } | |
745 | ||
746 | ||
747 | ||
748 | U_CFUNC const char * | |
73c04bcf | 749 | ucnv_io_getConverterName(const char *alias, UBool *containsOption, UErrorCode *pErrorCode) { |
b75a7d8f | 750 | if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { |
73c04bcf A |
751 | uint32_t convNum = findConverter(alias, containsOption, pErrorCode); |
752 | if (convNum < gMainTable.converterListSize) { | |
753 | return GET_STRING(gMainTable.converterList[convNum]); | |
b75a7d8f A |
754 | } |
755 | /* else converter not found */ | |
756 | } | |
757 | return NULL; | |
758 | } | |
759 | ||
760 | static int32_t U_CALLCONV | |
761 | ucnv_io_countStandardAliases(UEnumeration *enumerator, UErrorCode *pErrorCode) { | |
762 | int32_t value = 0; | |
763 | UAliasContext *myContext = (UAliasContext *)(enumerator->context); | |
764 | uint32_t listOffset = myContext->listOffset; | |
765 | ||
766 | if (listOffset) { | |
73c04bcf | 767 | value = gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f A |
768 | } |
769 | return value; | |
770 | } | |
771 | ||
772 | static const char* U_CALLCONV | |
773 | ucnv_io_nextStandardAliases(UEnumeration *enumerator, | |
774 | int32_t* resultLength, | |
775 | UErrorCode *pErrorCode) | |
776 | { | |
777 | UAliasContext *myContext = (UAliasContext *)(enumerator->context); | |
778 | uint32_t listOffset = myContext->listOffset; | |
779 | ||
780 | if (listOffset) { | |
73c04bcf A |
781 | uint32_t listCount = gMainTable.taggedAliasLists[listOffset]; |
782 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; | |
b75a7d8f A |
783 | |
784 | if (myContext->listIdx < listCount) { | |
785 | const char *myStr = GET_STRING(currList[myContext->listIdx++]); | |
786 | if (resultLength) { | |
374ca955 | 787 | *resultLength = (int32_t)uprv_strlen(myStr); |
b75a7d8f A |
788 | } |
789 | return myStr; | |
790 | } | |
791 | } | |
792 | /* Either we accessed a zero length list, or we enumerated too far. */ | |
73c04bcf A |
793 | if (resultLength) { |
794 | *resultLength = 0; | |
795 | } | |
b75a7d8f A |
796 | return NULL; |
797 | } | |
798 | ||
799 | static void U_CALLCONV | |
800 | ucnv_io_resetStandardAliases(UEnumeration *enumerator, UErrorCode *pErrorCode) { | |
801 | ((UAliasContext *)(enumerator->context))->listIdx = 0; | |
802 | } | |
803 | ||
804 | static void U_CALLCONV | |
805 | ucnv_io_closeUEnumeration(UEnumeration *enumerator) { | |
806 | uprv_free(enumerator->context); | |
807 | uprv_free(enumerator); | |
808 | } | |
809 | ||
810 | /* Enumerate the aliases for the specified converter and standard tag */ | |
811 | static const UEnumeration gEnumAliases = { | |
812 | NULL, | |
813 | NULL, | |
814 | ucnv_io_closeUEnumeration, | |
815 | ucnv_io_countStandardAliases, | |
816 | uenum_unextDefault, | |
817 | ucnv_io_nextStandardAliases, | |
818 | ucnv_io_resetStandardAliases | |
819 | }; | |
820 | ||
821 | U_CAPI UEnumeration * U_EXPORT2 | |
822 | ucnv_openStandardNames(const char *convName, | |
823 | const char *standard, | |
824 | UErrorCode *pErrorCode) | |
825 | { | |
826 | UEnumeration *myEnum = NULL; | |
827 | if (haveAliasData(pErrorCode) && isAlias(convName, pErrorCode)) { | |
828 | uint32_t listOffset = findTaggedAliasListsOffset(convName, standard, pErrorCode); | |
829 | ||
830 | /* When listOffset == 0, we want to acknowledge that the | |
831 | converter name and standard are okay, but there | |
832 | is nothing to enumerate. */ | |
73c04bcf | 833 | if (listOffset < gMainTable.taggedAliasListsSize) { |
b75a7d8f A |
834 | UAliasContext *myContext; |
835 | ||
836 | myEnum = uprv_malloc(sizeof(UEnumeration)); | |
837 | if (myEnum == NULL) { | |
838 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; | |
839 | return NULL; | |
840 | } | |
841 | uprv_memcpy(myEnum, &gEnumAliases, sizeof(UEnumeration)); | |
842 | myContext = uprv_malloc(sizeof(UAliasContext)); | |
843 | if (myContext == NULL) { | |
844 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; | |
845 | uprv_free(myEnum); | |
846 | return NULL; | |
847 | } | |
848 | myContext->listOffset = listOffset; | |
849 | myContext->listIdx = 0; | |
850 | myEnum->context = myContext; | |
851 | } | |
852 | /* else converter or tag not found */ | |
853 | } | |
854 | return myEnum; | |
855 | } | |
856 | ||
73c04bcf | 857 | static uint16_t |
b75a7d8f A |
858 | ucnv_io_countAliases(const char *alias, UErrorCode *pErrorCode) { |
859 | if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
73c04bcf A |
860 | uint32_t convNum = findConverter(alias, NULL, pErrorCode); |
861 | if (convNum < gMainTable.converterListSize) { | |
b75a7d8f | 862 | /* tagListNum - 1 is the ALL tag */ |
73c04bcf | 863 | int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum]; |
b75a7d8f A |
864 | |
865 | if (listOffset) { | |
73c04bcf | 866 | return gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f A |
867 | } |
868 | /* else this shouldn't happen. internal program error */ | |
869 | } | |
870 | /* else converter not found */ | |
871 | } | |
872 | return 0; | |
873 | } | |
874 | ||
73c04bcf | 875 | static uint16_t |
b75a7d8f A |
876 | ucnv_io_getAliases(const char *alias, uint16_t start, const char **aliases, UErrorCode *pErrorCode) { |
877 | if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
878 | uint32_t currAlias; | |
73c04bcf A |
879 | uint32_t convNum = findConverter(alias, NULL, pErrorCode); |
880 | if (convNum < gMainTable.converterListSize) { | |
b75a7d8f | 881 | /* tagListNum - 1 is the ALL tag */ |
73c04bcf | 882 | int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum]; |
b75a7d8f A |
883 | |
884 | if (listOffset) { | |
73c04bcf | 885 | uint32_t listCount = gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f | 886 | /* +1 to skip listCount */ |
73c04bcf | 887 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; |
b75a7d8f A |
888 | |
889 | for (currAlias = start; currAlias < listCount; currAlias++) { | |
890 | aliases[currAlias] = GET_STRING(currList[currAlias]); | |
891 | } | |
892 | } | |
893 | /* else this shouldn't happen. internal program error */ | |
894 | } | |
895 | /* else converter not found */ | |
896 | } | |
897 | return 0; | |
898 | } | |
899 | ||
73c04bcf | 900 | static const char * |
b75a7d8f A |
901 | ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { |
902 | if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
73c04bcf A |
903 | uint32_t convNum = findConverter(alias, NULL, pErrorCode); |
904 | if (convNum < gMainTable.converterListSize) { | |
b75a7d8f | 905 | /* tagListNum - 1 is the ALL tag */ |
73c04bcf | 906 | int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum]; |
b75a7d8f A |
907 | |
908 | if (listOffset) { | |
73c04bcf | 909 | uint32_t listCount = gMainTable.taggedAliasLists[listOffset]; |
b75a7d8f | 910 | /* +1 to skip listCount */ |
73c04bcf | 911 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; |
b75a7d8f A |
912 | |
913 | if (n < listCount) { | |
914 | return GET_STRING(currList[n]); | |
915 | } | |
916 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
917 | } | |
918 | /* else this shouldn't happen. internal program error */ | |
919 | } | |
920 | /* else converter not found */ | |
921 | } | |
922 | return NULL; | |
923 | } | |
924 | ||
73c04bcf | 925 | static uint16_t |
b75a7d8f A |
926 | ucnv_io_countStandards(UErrorCode *pErrorCode) { |
927 | if (haveAliasData(pErrorCode)) { | |
928 | /* Don't include the empty list */ | |
73c04bcf | 929 | return (uint16_t)(gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS); |
b75a7d8f A |
930 | } |
931 | ||
932 | return 0; | |
933 | } | |
934 | ||
935 | U_CAPI const char * U_EXPORT2 | |
936 | ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { | |
937 | if (haveAliasData(pErrorCode)) { | |
73c04bcf A |
938 | if (n < gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) { |
939 | return GET_STRING(gMainTable.tagList[n]); | |
b75a7d8f A |
940 | } |
941 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
942 | } | |
943 | ||
944 | return NULL; | |
945 | } | |
946 | ||
947 | U_CAPI const char * U_EXPORT2 | |
948 | ucnv_getStandardName(const char *alias, const char *standard, UErrorCode *pErrorCode) { | |
949 | if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
950 | uint32_t listOffset = findTaggedAliasListsOffset(alias, standard, pErrorCode); | |
951 | ||
73c04bcf A |
952 | if (0 < listOffset && listOffset < gMainTable.taggedAliasListsSize) { |
953 | const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1; | |
b75a7d8f A |
954 | |
955 | /* Get the preferred name from this list */ | |
956 | if (currList[0]) { | |
957 | return GET_STRING(currList[0]); | |
958 | } | |
959 | /* else someone screwed up the alias table. */ | |
960 | /* *pErrorCode = U_INVALID_FORMAT_ERROR */ | |
961 | } | |
962 | } | |
963 | ||
964 | return NULL; | |
965 | } | |
966 | ||
73c04bcf A |
967 | U_CAPI uint16_t U_EXPORT2 |
968 | ucnv_countAliases(const char *alias, UErrorCode *pErrorCode) | |
969 | { | |
970 | return ucnv_io_countAliases(alias, pErrorCode); | |
971 | } | |
b75a7d8f | 972 | |
b75a7d8f | 973 | |
73c04bcf A |
974 | U_CAPI const char* U_EXPORT2 |
975 | ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) | |
976 | { | |
977 | return ucnv_io_getAlias(alias, n, pErrorCode); | |
b75a7d8f A |
978 | } |
979 | ||
73c04bcf A |
980 | U_CAPI void U_EXPORT2 |
981 | ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) | |
982 | { | |
983 | ucnv_io_getAliases(alias, 0, aliases, pErrorCode); | |
b75a7d8f A |
984 | } |
985 | ||
73c04bcf A |
986 | U_CAPI uint16_t U_EXPORT2 |
987 | ucnv_countStandards(void) | |
988 | { | |
989 | UErrorCode err = U_ZERO_ERROR; | |
990 | return ucnv_io_countStandards(&err); | |
b75a7d8f A |
991 | } |
992 | ||
73c04bcf A |
993 | U_CAPI const char * U_EXPORT2 |
994 | ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErrorCode) { | |
995 | if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { | |
996 | uint32_t convNum = findTaggedConverterNum(alias, standard, pErrorCode); | |
b75a7d8f | 997 | |
73c04bcf A |
998 | if (convNum < gMainTable.converterListSize) { |
999 | return GET_STRING(gMainTable.converterList[convNum]); | |
b75a7d8f | 1000 | } |
b75a7d8f | 1001 | } |
73c04bcf | 1002 | |
b75a7d8f A |
1003 | return NULL; |
1004 | } | |
1005 | ||
1006 | static int32_t U_CALLCONV | |
1007 | ucnv_io_countAllConverters(UEnumeration *enumerator, UErrorCode *pErrorCode) { | |
73c04bcf | 1008 | return gMainTable.converterListSize; |
b75a7d8f A |
1009 | } |
1010 | ||
1011 | static const char* U_CALLCONV | |
1012 | ucnv_io_nextAllConverters(UEnumeration *enumerator, | |
1013 | int32_t* resultLength, | |
1014 | UErrorCode *pErrorCode) | |
1015 | { | |
1016 | uint16_t *myContext = (uint16_t *)(enumerator->context); | |
1017 | ||
73c04bcf A |
1018 | if (*myContext < gMainTable.converterListSize) { |
1019 | const char *myStr = GET_STRING(gMainTable.converterList[(*myContext)++]); | |
b75a7d8f | 1020 | if (resultLength) { |
374ca955 | 1021 | *resultLength = (int32_t)uprv_strlen(myStr); |
b75a7d8f A |
1022 | } |
1023 | return myStr; | |
1024 | } | |
1025 | /* Either we accessed a zero length list, or we enumerated too far. */ | |
73c04bcf A |
1026 | if (resultLength) { |
1027 | *resultLength = 0; | |
1028 | } | |
b75a7d8f A |
1029 | return NULL; |
1030 | } | |
1031 | ||
1032 | static void U_CALLCONV | |
1033 | ucnv_io_resetAllConverters(UEnumeration *enumerator, UErrorCode *pErrorCode) { | |
1034 | *((uint16_t *)(enumerator->context)) = 0; | |
1035 | } | |
1036 | ||
1037 | static const UEnumeration gEnumAllConverters = { | |
1038 | NULL, | |
1039 | NULL, | |
1040 | ucnv_io_closeUEnumeration, | |
1041 | ucnv_io_countAllConverters, | |
1042 | uenum_unextDefault, | |
1043 | ucnv_io_nextAllConverters, | |
1044 | ucnv_io_resetAllConverters | |
1045 | }; | |
1046 | ||
1047 | U_CAPI UEnumeration * U_EXPORT2 | |
1048 | ucnv_openAllNames(UErrorCode *pErrorCode) { | |
1049 | UEnumeration *myEnum = NULL; | |
1050 | if (haveAliasData(pErrorCode)) { | |
1051 | uint16_t *myContext; | |
1052 | ||
1053 | myEnum = uprv_malloc(sizeof(UEnumeration)); | |
1054 | if (myEnum == NULL) { | |
1055 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; | |
1056 | return NULL; | |
1057 | } | |
1058 | uprv_memcpy(myEnum, &gEnumAllConverters, sizeof(UEnumeration)); | |
1059 | myContext = uprv_malloc(sizeof(uint16_t)); | |
1060 | if (myContext == NULL) { | |
1061 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; | |
1062 | uprv_free(myEnum); | |
1063 | return NULL; | |
1064 | } | |
1065 | *myContext = 0; | |
1066 | myEnum->context = myContext; | |
1067 | } | |
1068 | return myEnum; | |
1069 | } | |
1070 | ||
1071 | U_CFUNC uint16_t | |
73c04bcf | 1072 | ucnv_io_countTotalAliases(UErrorCode *pErrorCode) { |
b75a7d8f | 1073 | if (haveAliasData(pErrorCode)) { |
73c04bcf | 1074 | return (uint16_t)gMainTable.aliasListSize; |
b75a7d8f A |
1075 | } |
1076 | return 0; | |
1077 | } | |
1078 | ||
374ca955 A |
1079 | /* alias table swapping ----------------------------------------------------- */ |
1080 | ||
1081 | typedef char * U_CALLCONV StripForCompareFn(char *dst, const char *name); | |
1082 | ||
1083 | /* | |
1084 | * row of a temporary array | |
1085 | * | |
1086 | * gets platform-endian charset string indexes and sorting indexes; | |
1087 | * after sorting this array by strings, the actual arrays are permutated | |
1088 | * according to the sorting indexes | |
1089 | */ | |
1090 | typedef struct TempRow { | |
1091 | uint16_t strIndex, sortIndex; | |
1092 | } TempRow; | |
1093 | ||
1094 | typedef struct TempAliasTable { | |
1095 | const char *chars; | |
1096 | TempRow *rows; | |
1097 | uint16_t *resort; | |
1098 | StripForCompareFn *stripForCompare; | |
1099 | } TempAliasTable; | |
1100 | ||
/*
 * Number of rows ucnv_swapAliases() keeps in stack arrays; larger alias
 * counts are sorted in heap memory instead.
 */
enum {
    STACK_ROW_CAPACITY=500
};
1104 | ||
1105 | static int32_t | |
1106 | io_compareRows(const void *context, const void *left, const void *right) { | |
1107 | char strippedLeft[UCNV_MAX_CONVERTER_NAME_LENGTH], | |
1108 | strippedRight[UCNV_MAX_CONVERTER_NAME_LENGTH]; | |
1109 | ||
1110 | TempAliasTable *tempTable=(TempAliasTable *)context; | |
1111 | const char *chars=tempTable->chars; | |
1112 | ||
1113 | return (int32_t)uprv_strcmp(tempTable->stripForCompare(strippedLeft, chars+2*((const TempRow *)left)->strIndex), | |
1114 | tempTable->stripForCompare(strippedRight, chars+2*((const TempRow *)right)->strIndex)); | |
1115 | } | |
1116 | ||
1117 | U_CAPI int32_t U_EXPORT2 | |
1118 | ucnv_swapAliases(const UDataSwapper *ds, | |
1119 | const void *inData, int32_t length, void *outData, | |
1120 | UErrorCode *pErrorCode) { | |
1121 | const UDataInfo *pInfo; | |
1122 | int32_t headerSize; | |
1123 | ||
1124 | const uint16_t *inTable; | |
1125 | uint32_t toc[offsetsCount]; | |
1126 | uint32_t offsets[offsetsCount]; /* 16-bit-addressed offsets from inTable/outTable */ | |
1127 | uint32_t i, count, tocLength, topOffset; | |
1128 | ||
1129 | TempRow rows[STACK_ROW_CAPACITY]; | |
1130 | uint16_t resort[STACK_ROW_CAPACITY]; | |
1131 | TempAliasTable tempTable; | |
1132 | ||
1133 | /* udata_swapDataHeader checks the arguments */ | |
1134 | headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); | |
1135 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
1136 | return 0; | |
1137 | } | |
1138 | ||
1139 | /* check data format and format version */ | |
1140 | pInfo=(const UDataInfo *)((const char *)inData+4); | |
1141 | if(!( | |
1142 | pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */ | |
1143 | pInfo->dataFormat[1]==0x76 && | |
1144 | pInfo->dataFormat[2]==0x41 && | |
1145 | pInfo->dataFormat[3]==0x6c && | |
1146 | pInfo->formatVersion[0]==3 | |
1147 | )) { | |
1148 | udata_printError(ds, "ucnv_swapAliases(): data format %02x.%02x.%02x.%02x (format version %02x) is not an alias table\n", | |
1149 | pInfo->dataFormat[0], pInfo->dataFormat[1], | |
1150 | pInfo->dataFormat[2], pInfo->dataFormat[3], | |
1151 | pInfo->formatVersion[0]); | |
1152 | *pErrorCode=U_UNSUPPORTED_ERROR; | |
1153 | return 0; | |
1154 | } | |
1155 | ||
1156 | /* an alias table must contain at least the table of contents array */ | |
1157 | if(length>=0 && (length-headerSize)<4*(1+minTocLength)) { | |
1158 | udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n", | |
1159 | length-headerSize); | |
1160 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
1161 | return 0; | |
1162 | } | |
1163 | ||
1164 | inTable=(const uint16_t *)((const char *)inData+headerSize); | |
73c04bcf | 1165 | uprv_memset(toc, 0, sizeof(toc)); |
374ca955 | 1166 | toc[tocLengthIndex]=tocLength=ds->readUInt32(((const uint32_t *)inTable)[tocLengthIndex]); |
73c04bcf A |
1167 | if(tocLength<minTocLength || offsetsCount<=tocLength) { |
1168 | udata_printError(ds, "ucnv_swapAliases(): table of contents contains unsupported number of sections (%u sections)\n", tocLength); | |
374ca955 A |
1169 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
1170 | return 0; | |
1171 | } | |
1172 | ||
1173 | /* read the known part of the table of contents */ | |
73c04bcf | 1174 | for(i=converterListIndex; i<=tocLength; ++i) { |
374ca955 A |
1175 | toc[i]=ds->readUInt32(((const uint32_t *)inTable)[i]); |
1176 | } | |
1177 | ||
1178 | /* compute offsets */ | |
73c04bcf | 1179 | uprv_memset(offsets, 0, sizeof(offsets)); |
374ca955 | 1180 | offsets[converterListIndex]=2*(1+tocLength); /* count two 16-bit units per toc entry */ |
73c04bcf | 1181 | for(i=tagListIndex; i<=tocLength; ++i) { |
374ca955 A |
1182 | offsets[i]=offsets[i-1]+toc[i-1]; |
1183 | } | |
1184 | ||
1185 | /* compute the overall size of the after-header data, in numbers of 16-bit units */ | |
1186 | topOffset=offsets[i-1]+toc[i-1]; | |
1187 | ||
1188 | if(length>=0) { | |
1189 | uint16_t *outTable; | |
1190 | const uint16_t *p, *p2; | |
1191 | uint16_t *q, *q2; | |
1192 | uint16_t oldIndex; | |
1193 | ||
1194 | if((length-headerSize)<(2*(int32_t)topOffset)) { | |
1195 | udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n", | |
1196 | length-headerSize); | |
1197 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
1198 | return 0; | |
1199 | } | |
1200 | ||
1201 | outTable=(uint16_t *)((char *)outData+headerSize); | |
1202 | ||
1203 | /* swap the entire table of contents */ | |
1204 | ds->swapArray32(ds, inTable, 4*(1+tocLength), outTable, pErrorCode); | |
1205 | ||
73c04bcf A |
1206 | /* swap unormalized strings & normalized strings */ |
1207 | ds->swapInvChars(ds, inTable+offsets[stringTableIndex], 2*(int32_t)(toc[stringTableIndex]+toc[normalizedStringTableIndex]), | |
374ca955 A |
1208 | outTable+offsets[stringTableIndex], pErrorCode); |
1209 | if(U_FAILURE(*pErrorCode)) { | |
73c04bcf | 1210 | udata_printError(ds, "ucnv_swapAliases().swapInvChars(charset names) failed\n"); |
374ca955 A |
1211 | return 0; |
1212 | } | |
1213 | ||
1214 | if(ds->inCharset==ds->outCharset) { | |
1215 | /* no need to sort, just swap all 16-bit values together */ | |
1216 | ds->swapArray16(ds, | |
1217 | inTable+offsets[converterListIndex], | |
1218 | 2*(int32_t)(offsets[stringTableIndex]-offsets[converterListIndex]), | |
1219 | outTable+offsets[converterListIndex], | |
1220 | pErrorCode); | |
1221 | } else { | |
1222 | /* allocate the temporary table for sorting */ | |
1223 | count=toc[aliasListIndex]; | |
1224 | ||
1225 | tempTable.chars=(const char *)(outTable+offsets[stringTableIndex]); /* sort by outCharset */ | |
1226 | ||
1227 | if(count<=STACK_ROW_CAPACITY) { | |
1228 | tempTable.rows=rows; | |
1229 | tempTable.resort=resort; | |
1230 | } else { | |
1231 | tempTable.rows=(TempRow *)uprv_malloc(count*sizeof(TempRow)+count*2); | |
1232 | if(tempTable.rows==NULL) { | |
1233 | udata_printError(ds, "ucnv_swapAliases(): unable to allocate memory for sorting tables (max length: %u)\n", | |
1234 | count); | |
1235 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
1236 | return 0; | |
1237 | } | |
1238 | tempTable.resort=(uint16_t *)(tempTable.rows+count); | |
1239 | } | |
1240 | ||
1241 | if(ds->outCharset==U_ASCII_FAMILY) { | |
1242 | tempTable.stripForCompare=ucnv_io_stripASCIIForCompare; | |
1243 | } else /* U_EBCDIC_FAMILY */ { | |
1244 | tempTable.stripForCompare=ucnv_io_stripEBCDICForCompare; | |
1245 | } | |
1246 | ||
1247 | /* | |
1248 | * Sort unique aliases+mapped names. | |
1249 | * | |
1250 | * We need to sort the list again by outCharset strings because they | |
1251 | * sort differently for different charset families. | |
1252 | * First we set up a temporary table with the string indexes and | |
1253 | * sorting indexes and sort that. | |
1254 | * Then we permutate and copy/swap the actual values. | |
1255 | */ | |
1256 | p=inTable+offsets[aliasListIndex]; | |
1257 | q=outTable+offsets[aliasListIndex]; | |
1258 | ||
1259 | p2=inTable+offsets[untaggedConvArrayIndex]; | |
1260 | q2=outTable+offsets[untaggedConvArrayIndex]; | |
1261 | ||
1262 | for(i=0; i<count; ++i) { | |
1263 | tempTable.rows[i].strIndex=ds->readUInt16(p[i]); | |
1264 | tempTable.rows[i].sortIndex=(uint16_t)i; | |
1265 | } | |
1266 | ||
1267 | uprv_sortArray(tempTable.rows, (int32_t)count, sizeof(TempRow), | |
1268 | io_compareRows, &tempTable, | |
1269 | FALSE, pErrorCode); | |
1270 | ||
1271 | if(U_SUCCESS(*pErrorCode)) { | |
1272 | /* copy/swap/permutate items */ | |
1273 | if(p!=q) { | |
1274 | for(i=0; i<count; ++i) { | |
1275 | oldIndex=tempTable.rows[i].sortIndex; | |
1276 | ds->swapArray16(ds, p+oldIndex, 2, q+i, pErrorCode); | |
1277 | ds->swapArray16(ds, p2+oldIndex, 2, q2+i, pErrorCode); | |
1278 | } | |
1279 | } else { | |
1280 | /* | |
1281 | * If we swap in-place, then the permutation must use another | |
1282 | * temporary array (tempTable.resort) | |
1283 | * before the results are copied to the outBundle. | |
1284 | */ | |
1285 | uint16_t *r=tempTable.resort; | |
1286 | ||
1287 | for(i=0; i<count; ++i) { | |
1288 | oldIndex=tempTable.rows[i].sortIndex; | |
1289 | ds->swapArray16(ds, p+oldIndex, 2, r+i, pErrorCode); | |
1290 | } | |
1291 | uprv_memcpy(q, r, 2*count); | |
1292 | ||
1293 | for(i=0; i<count; ++i) { | |
1294 | oldIndex=tempTable.rows[i].sortIndex; | |
1295 | ds->swapArray16(ds, p2+oldIndex, 2, r+i, pErrorCode); | |
1296 | } | |
1297 | uprv_memcpy(q2, r, 2*count); | |
1298 | } | |
1299 | } | |
1300 | ||
1301 | if(tempTable.rows!=rows) { | |
1302 | uprv_free(tempTable.rows); | |
1303 | } | |
1304 | ||
1305 | if(U_FAILURE(*pErrorCode)) { | |
73c04bcf A |
1306 | udata_printError(ds, "ucnv_swapAliases().uprv_sortArray(%u items) failed\n", |
1307 | count); | |
374ca955 A |
1308 | return 0; |
1309 | } | |
1310 | ||
1311 | /* swap remaining 16-bit values */ | |
1312 | ds->swapArray16(ds, | |
1313 | inTable+offsets[converterListIndex], | |
1314 | 2*(int32_t)(offsets[aliasListIndex]-offsets[converterListIndex]), | |
1315 | outTable+offsets[converterListIndex], | |
1316 | pErrorCode); | |
1317 | ds->swapArray16(ds, | |
1318 | inTable+offsets[taggedAliasArrayIndex], | |
1319 | 2*(int32_t)(offsets[stringTableIndex]-offsets[taggedAliasArrayIndex]), | |
1320 | outTable+offsets[taggedAliasArrayIndex], | |
1321 | pErrorCode); | |
1322 | } | |
1323 | } | |
1324 | ||
1325 | return headerSize+2*(int32_t)topOffset; | |
1326 | } | |
1327 | ||
1328 | #endif | |
1329 | ||
b75a7d8f A |
1330 | /* |
1331 | * Hey, Emacs, please set the following: | |
1332 | * | |
1333 | * Local Variables: | |
1334 | * indent-tabs-mode: nil | |
1335 | * End: | |
1336 | * | |
1337 | */ |