]>
Commit | Line | Data |
---|---|---|
9ce05555 | 1 | /* |
bd5b749c | 2 | * Copyright (c) 2008 Apple Inc. All rights reserved. |
9ce05555 A |
3 | * |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
9ce05555 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | /* CFStringEncodingConverter.c | |
24 | Copyright 1998-2002, Apple, Inc. All rights reserved. | |
25 | Responsibility: Aki Inoue | |
26 | */ | |
27 | ||
28 | #include "CFInternal.h" | |
29 | #include <CoreFoundation/CFArray.h> | |
30 | #include <CoreFoundation/CFDictionary.h> | |
31 | #include "CFUniChar.h" | |
bd5b749c | 32 | #include "CFPriv.h" |
9ce05555 A |
33 | #include "CFUnicodeDecomposition.h" |
34 | #include "CFStringEncodingConverterExt.h" | |
35 | #include "CFStringEncodingConverterPriv.h" | |
36 | #include <stdlib.h> | |
bd5b749c | 37 | #if !defined(__WIN32__) |
9ce05555 A |
38 | #include <pthread.h> |
39 | #endif | |
9ce05555 A |
40 | |
41 | ||
42 | /* Macros | |
43 | */ | |
/* Dispatch helpers.
 *
 * A converter whose private _toBytes/_toUnicode slot is non-NULL is driven
 * through a wrapper that takes the converter itself as its first argument.
 * Otherwise the public slot holds the definition's own proc, which is called
 * without the converter argument.
 *
 * Every argument is parenthesized so that expression arguments (for example
 * a flags expression built with '|') expand safely.  Note that conv and flags
 * are still evaluated more than once; do not pass expressions with side
 * effects.
 */
#define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) ((conv)->_toBytes ? (conv)->toBytes((conv),(flags),(chars),(numChars),(bytes),(max),(used)) : ((CFStringEncodingToBytesProc)(conv)->toBytes)((flags),(chars),(numChars),(bytes),(max),(used)))
#define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) ((conv)->_toUnicode ? (((flags) & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) ? (conv)->toCanonicalUnicode((conv),(flags),(bytes),(numBytes),(chars),(max),(used)) : (conv)->toUnicode((conv),(flags),(bytes),(numBytes),(chars),(max),(used))) : ((CFStringEncodingToUnicodeProc)(conv)->toUnicode)((flags),(bytes),(numBytes),(chars),(max),(used)))

#define ASCIINewLine 0x0a
#define kSurrogateHighStart 0xD800
#define kSurrogateHighEnd 0xDBFF
#define kSurrogateLowStart 0xDC00
#define kSurrogateLowEnd 0xDFFF
52 | ||
53 | /* Mapping 128..255 to lossy ASCII | |
54 | */ | |
/* Lossy ASCII replacements for U+00A0..U+00FF, indexed by (character - 0xA0).
 * Each entry holds up to three replacement bytes followed by NUL padding; an
 * empty string means the character has no replacement.
 */
static const struct {
    unsigned char chars[4];
} _toLossyASCIITable[] = {
    { " " },   // NO-BREAK SPACE
    { "!" },   // INVERTED EXCLAMATION MARK
    { "c" },   // CENT SIGN
    { "L" },   // POUND SIGN
    { "$" },   // CURRENCY SIGN
    { "Y" },   // YEN SIGN
    { "|" },   // BROKEN BAR
    { "" },    // SECTION SIGN
    { "" },    // DIAERESIS
    { "(C)" }, // COPYRIGHT SIGN
    { "a" },   // FEMININE ORDINAL INDICATOR
    { "<<" },  // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    { "" },    // NOT SIGN
    { "-" },   // SOFT HYPHEN
    { "(R)" }, // REGISTERED SIGN
    { "" },    // MACRON
    { "" },    // DEGREE SIGN
    { "+-" },  // PLUS-MINUS SIGN
    { "2" },   // SUPERSCRIPT TWO
    { "3" },   // SUPERSCRIPT THREE
    { "" },    // ACUTE ACCENT
    { "" },    // MICRO SIGN
    { "" },    // PILCROW SIGN
    { "" },    // MIDDLE DOT
    { "" },    // CEDILLA
    { "1" },   // SUPERSCRIPT ONE
    { "o" },   // MASCULINE ORDINAL INDICATOR
    { ">>" },  // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    { "1/4" }, // VULGAR FRACTION ONE QUARTER
    { "1/2" }, // VULGAR FRACTION ONE HALF
    { "3/4" }, // VULGAR FRACTION THREE QUARTERS
    { "?" },   // INVERTED QUESTION MARK
    { "A" },   // LATIN CAPITAL LETTER A WITH GRAVE
    { "A" },   // LATIN CAPITAL LETTER A WITH ACUTE
    { "A" },   // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    { "A" },   // LATIN CAPITAL LETTER A WITH TILDE
    { "A" },   // LATIN CAPITAL LETTER A WITH DIAERESIS
    { "A" },   // LATIN CAPITAL LETTER A WITH RING ABOVE
    { "AE" },  // LATIN CAPITAL LETTER AE
    { "C" },   // LATIN CAPITAL LETTER C WITH CEDILLA
    { "E" },   // LATIN CAPITAL LETTER E WITH GRAVE
    { "E" },   // LATIN CAPITAL LETTER E WITH ACUTE
    { "E" },   // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    { "E" },   // LATIN CAPITAL LETTER E WITH DIAERESIS
    { "I" },   // LATIN CAPITAL LETTER I WITH GRAVE
    { "I" },   // LATIN CAPITAL LETTER I WITH ACUTE
    { "I" },   // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    { "I" },   // LATIN CAPITAL LETTER I WITH DIAERESIS
    { "TH" },  // LATIN CAPITAL LETTER ETH (Icelandic)
    { "N" },   // LATIN CAPITAL LETTER N WITH TILDE
    { "O" },   // LATIN CAPITAL LETTER O WITH GRAVE
    { "O" },   // LATIN CAPITAL LETTER O WITH ACUTE
    { "O" },   // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    { "O" },   // LATIN CAPITAL LETTER O WITH TILDE
    { "O" },   // LATIN CAPITAL LETTER O WITH DIAERESIS
    { "X" },   // MULTIPLICATION SIGN
    { "O" },   // LATIN CAPITAL LETTER O WITH STROKE
    { "U" },   // LATIN CAPITAL LETTER U WITH GRAVE
    { "U" },   // LATIN CAPITAL LETTER U WITH ACUTE
    { "U" },   // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    { "U" },   // LATIN CAPITAL LETTER U WITH DIAERESIS
    { "Y" },   // LATIN CAPITAL LETTER Y WITH ACUTE
    { "th" },  // LATIN CAPITAL LETTER THORN (Icelandic)
    { "s" },   // LATIN SMALL LETTER SHARP S (German)
    { "a" },   // LATIN SMALL LETTER A WITH GRAVE
    { "a" },   // LATIN SMALL LETTER A WITH ACUTE
    { "a" },   // LATIN SMALL LETTER A WITH CIRCUMFLEX
    { "a" },   // LATIN SMALL LETTER A WITH TILDE
    { "a" },   // LATIN SMALL LETTER A WITH DIAERESIS
    { "a" },   // LATIN SMALL LETTER A WITH RING ABOVE
    { "ae" },  // LATIN SMALL LETTER AE
    { "c" },   // LATIN SMALL LETTER C WITH CEDILLA
    { "e" },   // LATIN SMALL LETTER E WITH GRAVE
    { "e" },   // LATIN SMALL LETTER E WITH ACUTE
    { "e" },   // LATIN SMALL LETTER E WITH CIRCUMFLEX
    { "e" },   // LATIN SMALL LETTER E WITH DIAERESIS
    { "i" },   // LATIN SMALL LETTER I WITH GRAVE
    { "i" },   // LATIN SMALL LETTER I WITH ACUTE
    { "i" },   // LATIN SMALL LETTER I WITH CIRCUMFLEX
    { "i" },   // LATIN SMALL LETTER I WITH DIAERESIS
    { "TH" },  // LATIN SMALL LETTER ETH (Icelandic)
    { "n" },   // LATIN SMALL LETTER N WITH TILDE
    { "o" },   // LATIN SMALL LETTER O WITH GRAVE
    { "o" },   // LATIN SMALL LETTER O WITH ACUTE
    { "o" },   // LATIN SMALL LETTER O WITH CIRCUMFLEX
    { "o" },   // LATIN SMALL LETTER O WITH TILDE
    { "o" },   // LATIN SMALL LETTER O WITH DIAERESIS
    { "/" },   // DIVISION SIGN
    { "o" },   // LATIN SMALL LETTER O WITH STROKE
    { "u" },   // LATIN SMALL LETTER U WITH GRAVE
    { "u" },   // LATIN SMALL LETTER U WITH ACUTE
    { "u" },   // LATIN SMALL LETTER U WITH CIRCUMFLEX
    { "u" },   // LATIN SMALL LETTER U WITH DIAERESIS
    { "y" },   // LATIN SMALL LETTER Y WITH ACUTE
    { "th" },  // LATIN SMALL LETTER THORN (Icelandic)
    { "y" },   // LATIN SMALL LETTER Y WITH DIAERESIS
};
155 | ||
bd5b749c A |
156 | CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) { |
157 | const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]); | |
158 | CFIndex numBytes = 0; | |
159 | CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4); | |
9ce05555 A |
160 | |
161 | for (idx = 0;idx < max;idx++) { | |
162 | if (losChars[idx]) { | |
163 | if (maxByteLen) bytes[idx] = losChars[idx]; | |
164 | ++numBytes; | |
165 | } else { | |
166 | break; | |
167 | } | |
168 | } | |
169 | ||
170 | return numBytes; | |
171 | } | |
172 | ||
bd5b749c A |
173 | static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
174 | CFIndex processCharLen = 1, filledBytesLen = 1; | |
175 | uint8_t byte = '?'; | |
176 | ||
9ce05555 | 177 | if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range |
bd5b749c | 178 | byte = (uint8_t)(*characters - 0x80); |
9ce05555 A |
179 | } else if (*characters < 0x100) { |
180 | *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen); | |
181 | return 1; | |
182 | } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) { | |
bd5b749c | 183 | processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1); |
9ce05555 | 184 | } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) { |
bd5b749c | 185 | byte = ' '; |
9ce05555 | 186 | } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) { |
bd5b749c A |
187 | byte = ASCIINewLine; |
188 | } else if (*characters == 0x2026) { // ellipsis | |
189 | if (0 == maxByteLen) { | |
190 | filledBytesLen = 3; | |
191 | } else if (maxByteLen > 2) { | |
192 | memset(bytes, '.', 3); | |
193 | *usedByteLen = 3; | |
194 | return processCharLen; | |
195 | } | |
9ce05555 A |
196 | } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) { |
197 | UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; | |
198 | ||
199 | (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH); | |
200 | if (*decomposed < 0x80) { | |
bd5b749c | 201 | byte = (uint8_t)(*decomposed); |
9ce05555 A |
202 | } else { |
203 | UTF16Char theChar = *decomposed; | |
204 | ||
205 | return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen); | |
206 | } | |
9ce05555 | 207 | } |
bd5b749c A |
208 | |
209 | if (maxByteLen) *bytes = byte; | |
210 | *usedByteLen = filledBytesLen; | |
211 | return processCharLen; | |
9ce05555 A |
212 | } |
213 | ||
bd5b749c | 214 | static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
9ce05555 A |
215 | if (maxCharLen) *characters = (UniChar)'?'; |
216 | *usedCharLen = 1; | |
217 | return 1; | |
218 | } | |
219 | ||
/* Invoke a converter's fallback procs.  Arguments are parenthesized so that
 * expression arguments expand safely.
 */
#define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) ((conv)->toBytesFallback((chars),(numChars),(bytes),(max),(used)))
#define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) ((conv)->toUnicodeFallback((bytes),(numBytes),(chars),(max),(used)))

#define EXTRA_BASE (0x0F00)
224 | ||
225 | /* Wrapper funcs for non-standard converters | |
226 | */ | |
bd5b749c A |
227 | static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
228 | CFIndex processedCharLen = 0; | |
229 | CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars); | |
9ce05555 A |
230 | uint8_t byte; |
231 | ||
232 | while (processedCharLen < length) { | |
233 | if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters[processedCharLen], &byte)) break; | |
234 | ||
235 | if (maxByteLen) bytes[processedCharLen] = byte; | |
236 | processedCharLen++; | |
237 | } | |
238 | ||
239 | *usedByteLen = processedCharLen; | |
240 | return processedCharLen; | |
241 | } | |
242 | ||
bd5b749c A |
243 | static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
244 | CFIndex processedByteLen = 0; | |
245 | CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes); | |
9ce05555 A |
246 | UniChar character; |
247 | ||
248 | while (processedByteLen < length) { | |
249 | if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], &character)) break; | |
250 | ||
251 | if (maxCharLen) characters[processedByteLen] = character; | |
252 | processedByteLen++; | |
253 | } | |
254 | ||
255 | *usedCharLen = processedByteLen; | |
256 | return processedByteLen; | |
257 | } | |
258 | ||
bd5b749c A |
259 | static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
260 | CFIndex processedByteLen = 0; | |
261 | CFIndex theUsedCharLen = 0; | |
9ce05555 | 262 | UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; |
bd5b749c | 263 | CFIndex usedLen; |
9ce05555 A |
264 | UniChar character; |
265 | bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); | |
266 | ||
267 | while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
268 | if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], &character)) break; | |
269 | ||
270 | if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { | |
bd5b749c | 271 | CFIndex idx; |
9ce05555 A |
272 | |
273 | usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); | |
274 | *usedCharLen = theUsedCharLen; | |
275 | ||
276 | for (idx = 0;idx < usedLen;idx++) { | |
277 | if (charBuffer[idx] > 0xFFFF) { // Non-BMP | |
278 | if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; | |
279 | theUsedCharLen += 2; | |
280 | if (maxCharLen) { | |
281 | charBuffer[idx] = charBuffer[idx] - 0x10000; | |
bd5b749c A |
282 | *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; |
283 | *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; | |
9ce05555 A |
284 | } |
285 | } else { | |
286 | if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; | |
287 | ++theUsedCharLen; | |
288 | *(characters++) = charBuffer[idx]; | |
289 | } | |
290 | } | |
291 | } else { | |
292 | if (maxCharLen) *(characters++) = character; | |
293 | ++theUsedCharLen; | |
294 | } | |
295 | processedByteLen++; | |
296 | } | |
297 | ||
298 | *usedCharLen = theUsedCharLen; | |
299 | return processedByteLen; | |
300 | } | |
301 | ||
bd5b749c A |
302 | static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
303 | CFIndex processedCharLen = 0; | |
9ce05555 | 304 | uint8_t byte; |
bd5b749c | 305 | CFIndex usedLen; |
9ce05555 A |
306 | |
307 | *usedByteLen = 0; | |
308 | ||
309 | while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) { | |
310 | if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters, numChars, &byte))) break; | |
311 | ||
312 | if (maxByteLen) bytes[*usedByteLen] = byte; | |
313 | (*usedByteLen)++; | |
314 | characters += usedLen; | |
315 | numChars -= usedLen; | |
316 | processedCharLen += usedLen; | |
317 | } | |
318 | ||
319 | return processedCharLen; | |
320 | } | |
321 | ||
bd5b749c A |
322 | static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
323 | CFIndex processedByteLen = 0; | |
324 | #if 0 || 0 | |
9ce05555 A |
325 | UniChar charBuffer[20]; // Dynamic stack allocation is GNU specific |
326 | #else | |
327 | UniChar charBuffer[((const _CFEncodingConverter*)converter)->maxLen]; | |
328 | #endif | |
bd5b749c | 329 | CFIndex usedLen; |
9ce05555 A |
330 | |
331 | *usedCharLen = 0; | |
332 | ||
333 | while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) { | |
334 | if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; | |
335 | ||
336 | if (maxCharLen) { | |
bd5b749c | 337 | CFIndex idx; |
9ce05555 A |
338 | |
339 | if (*usedCharLen + usedLen > maxCharLen) break; | |
340 | ||
341 | for (idx = 0;idx < usedLen;idx++) { | |
342 | characters[*usedCharLen + idx] = charBuffer[idx]; | |
343 | } | |
344 | } | |
345 | *usedCharLen += usedLen; | |
346 | processedByteLen++; | |
347 | } | |
348 | ||
349 | return processedByteLen; | |
350 | } | |
351 | ||
bd5b749c A |
352 | static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
353 | CFIndex processedByteLen = 0; | |
354 | #if 0 || 0 | |
9ce05555 A |
355 | UniChar charBuffer[20]; // Dynamic stack allocation is GNU specific |
356 | #else | |
357 | UniChar charBuffer[((const _CFEncodingConverter*)converter)->maxLen]; | |
358 | #endif | |
359 | UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH]; | |
bd5b749c A |
360 | CFIndex usedLen; |
361 | CFIndex decompedLen; | |
362 | CFIndex idx, decompIndex; | |
9ce05555 | 363 | bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); |
bd5b749c | 364 | CFIndex theUsedCharLen = 0; |
9ce05555 A |
365 | |
366 | while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
367 | if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; | |
368 | ||
369 | for (idx = 0;idx < usedLen;idx++) { | |
370 | if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) { | |
371 | decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH); | |
372 | *usedCharLen = theUsedCharLen; | |
373 | ||
374 | for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) { | |
375 | if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP | |
376 | if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; | |
377 | theUsedCharLen += 2; | |
378 | if (maxCharLen) { | |
379 | charBuffer[idx] = charBuffer[idx] - 0x10000; | |
380 | *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL; | |
381 | *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL; | |
382 | } | |
383 | } else { | |
384 | if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; | |
385 | ++theUsedCharLen; | |
386 | *(characters++) = charBuffer[idx]; | |
387 | } | |
388 | } | |
389 | } else { | |
390 | if (maxCharLen) *(characters++) = charBuffer[idx]; | |
391 | ++theUsedCharLen; | |
392 | } | |
393 | } | |
394 | processedByteLen++; | |
395 | } | |
396 | ||
397 | *usedCharLen = theUsedCharLen; | |
398 | return processedByteLen; | |
399 | } | |
400 | ||
bd5b749c A |
401 | static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
402 | CFIndex processedCharLen = 0; | |
403 | #if 0 || 0 | |
9ce05555 A |
404 | uint8_t byteBuffer[20]; // Dynamic stack allocation is GNU specific |
405 | #else | |
406 | uint8_t byteBuffer[((const _CFEncodingConverter*)converter)->maxLen]; | |
407 | #endif | |
bd5b749c | 408 | CFIndex usedLen; |
9ce05555 A |
409 | |
410 | *usedByteLen = 0; | |
411 | ||
412 | while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) { | |
413 | if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters[processedCharLen], byteBuffer))) break; | |
414 | ||
415 | if (maxByteLen) { | |
bd5b749c | 416 | CFIndex idx; |
9ce05555 A |
417 | |
418 | if (*usedByteLen + usedLen > maxByteLen) break; | |
419 | ||
420 | for (idx = 0;idx <usedLen;idx++) { | |
421 | bytes[*usedByteLen + idx] = byteBuffer[idx]; | |
422 | } | |
423 | } | |
424 | ||
425 | *usedByteLen += usedLen; | |
426 | processedCharLen++; | |
427 | } | |
428 | ||
429 | return processedCharLen; | |
430 | } | |
431 | ||
bd5b749c A |
432 | static CFIndex __CFToUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
433 | CFIndex processedByteLen = 0; | |
9ce05555 | 434 | UniChar character; |
bd5b749c | 435 | CFIndex usedLen; |
9ce05555 A |
436 | |
437 | *usedCharLen = 0; | |
438 | ||
439 | while (numBytes && (!maxCharLen || (*usedCharLen < maxCharLen))) { | |
440 | if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes, numBytes, &character))) break; | |
441 | ||
442 | if (maxCharLen) *(characters++) = character; | |
443 | (*usedCharLen)++; | |
444 | processedByteLen += usedLen; | |
445 | bytes += usedLen; | |
446 | numBytes -= usedLen; | |
447 | } | |
448 | ||
449 | return processedByteLen; | |
450 | } | |
451 | ||
bd5b749c A |
452 | static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
453 | CFIndex processedByteLen = 0; | |
9ce05555 A |
454 | UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; |
455 | UniChar character; | |
bd5b749c A |
456 | CFIndex usedLen; |
457 | CFIndex decomposedLen; | |
458 | CFIndex theUsedCharLen = 0; | |
9ce05555 A |
459 | bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); |
460 | ||
461 | while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
462 | if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes, numBytes, &character))) break; | |
463 | ||
464 | if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { | |
bd5b749c | 465 | CFIndex idx; |
9ce05555 A |
466 | |
467 | decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); | |
468 | *usedCharLen = theUsedCharLen; | |
469 | ||
470 | for (idx = 0;idx < decomposedLen;idx++) { | |
471 | if (charBuffer[idx] > 0xFFFF) { // Non-BMP | |
472 | if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; | |
473 | theUsedCharLen += 2; | |
474 | if (maxCharLen) { | |
475 | charBuffer[idx] = charBuffer[idx] - 0x10000; | |
bd5b749c A |
476 | *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; |
477 | *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; | |
9ce05555 A |
478 | } |
479 | } else { | |
480 | if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; | |
481 | ++theUsedCharLen; | |
482 | *(characters++) = charBuffer[idx]; | |
483 | } | |
484 | } | |
485 | } else { | |
486 | if (maxCharLen) *(characters++) = character; | |
487 | ++theUsedCharLen; | |
488 | } | |
489 | ||
490 | processedByteLen += usedLen; | |
491 | bytes += usedLen; | |
492 | numBytes -= usedLen; | |
493 | } | |
494 | *usedCharLen = theUsedCharLen; | |
495 | return processedByteLen; | |
496 | } | |
497 | ||
498 | /* static functions | |
499 | */ | |
500 | static _CFConverterEntry __CFConverterEntryASCII = { | |
501 | kCFStringEncodingASCII, NULL, | |
502 | "Western (ASCII)", {"us-ascii", "ascii", "iso-646-us", NULL}, NULL, NULL, NULL, NULL, | |
503 | kCFStringEncodingMacRoman // We use string encoding's script range here | |
504 | }; | |
505 | ||
506 | static _CFConverterEntry __CFConverterEntryISOLatin1 = { | |
507 | kCFStringEncodingISOLatin1, NULL, | |
508 | "Western (ISO Latin 1)", {"iso-8859-1", "latin1","iso-latin-1", NULL}, NULL, NULL, NULL, NULL, | |
509 | kCFStringEncodingMacRoman // We use string encoding's script range here | |
510 | }; | |
511 | ||
512 | static _CFConverterEntry __CFConverterEntryMacRoman = { | |
513 | kCFStringEncodingMacRoman, NULL, | |
514 | "Western (Mac OS Roman)", {"macintosh", "mac", "x-mac-roman", NULL}, NULL, NULL, NULL, NULL, | |
515 | kCFStringEncodingMacRoman // We use string encoding's script range here | |
516 | }; | |
517 | ||
518 | static _CFConverterEntry __CFConverterEntryWinLatin1 = { | |
519 | kCFStringEncodingWindowsLatin1, NULL, | |
520 | "Western (Windows Latin 1)", {"windows-1252", "cp1252", "windows latin1", NULL}, NULL, NULL, NULL, NULL, | |
521 | kCFStringEncodingMacRoman // We use string encoding's script range here | |
522 | }; | |
523 | ||
524 | static _CFConverterEntry __CFConverterEntryNextStepLatin = { | |
525 | kCFStringEncodingNextStepLatin, NULL, | |
526 | "Western (NextStep)", {"x-nextstep", NULL, NULL, NULL}, NULL, NULL, NULL, NULL, | |
527 | kCFStringEncodingMacRoman // We use string encoding's script range here | |
528 | }; | |
529 | ||
530 | static _CFConverterEntry __CFConverterEntryUTF8 = { | |
531 | kCFStringEncodingUTF8, NULL, | |
532 | "UTF-8", {"utf-8", "unicode-1-1-utf8", NULL, NULL}, NULL, NULL, NULL, NULL, | |
533 | kCFStringEncodingUnicode // We use string encoding's script range here | |
534 | }; | |
535 | ||
bd5b749c | 536 | CF_INLINE _CFConverterEntry *__CFStringEncodingConverterGetEntry(uint32_t encoding) { |
9ce05555 A |
537 | switch (encoding) { |
538 | case kCFStringEncodingInvalidId: | |
539 | case kCFStringEncodingASCII: | |
540 | return &__CFConverterEntryASCII; | |
541 | ||
542 | case kCFStringEncodingISOLatin1: | |
543 | return &__CFConverterEntryISOLatin1; | |
544 | ||
545 | case kCFStringEncodingMacRoman: | |
546 | return &__CFConverterEntryMacRoman; | |
547 | ||
548 | case kCFStringEncodingWindowsLatin1: | |
549 | return &__CFConverterEntryWinLatin1; | |
550 | ||
551 | case kCFStringEncodingNextStepLatin: | |
552 | return &__CFConverterEntryNextStepLatin; | |
553 | ||
554 | case kCFStringEncodingUTF8: | |
555 | return &__CFConverterEntryUTF8; | |
556 | ||
d8925383 A |
557 | default: { |
558 | return NULL; | |
559 | } | |
9ce05555 A |
560 | } |
561 | } | |
562 | ||
563 | CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition) { | |
564 | #define NUM_OF_ENTRIES_CYCLE (10) | |
bd5b749c A |
565 | static CFSpinLock_t _indexLock = CFSpinLockInit; |
566 | static uint32_t _currentIndex = 0; | |
567 | static uint32_t _allocatedSize = 0; | |
9ce05555 A |
568 | static _CFEncodingConverter *_allocatedEntries = NULL; |
569 | _CFEncodingConverter *converter; | |
570 | ||
571 | ||
572 | __CFSpinLock(&_indexLock); | |
573 | if ((_currentIndex + 1) >= _allocatedSize) { | |
574 | _currentIndex = 0; | |
575 | _allocatedSize = 0; | |
576 | _allocatedEntries = NULL; | |
577 | } | |
578 | if (_allocatedEntries == NULL) { // Not allocated yet | |
bd5b749c | 579 | _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0); |
9ce05555 A |
580 | _allocatedSize = NUM_OF_ENTRIES_CYCLE; |
581 | converter = &(_allocatedEntries[_currentIndex]); | |
582 | } else { | |
583 | converter = &(_allocatedEntries[++_currentIndex]); | |
584 | } | |
585 | __CFSpinUnlock(&_indexLock); | |
586 | ||
587 | switch (definition->encodingClass) { | |
588 | case kCFStringEncodingConverterStandard: | |
bd5b749c A |
589 | converter->toBytes = (_CFToBytesProc)definition->toBytes; |
590 | converter->toUnicode = (_CFToUnicodeProc)definition->toUnicode; | |
591 | converter->toCanonicalUnicode = (_CFToUnicodeProc)definition->toUnicode; | |
9ce05555 A |
592 | converter->_toBytes = NULL; |
593 | converter->_toUnicode = NULL; | |
594 | converter->maxLen = 2; | |
595 | break; | |
596 | ||
597 | case kCFStringEncodingConverterCheapEightBit: | |
598 | converter->toBytes = __CFToBytesCheapEightBitWrapper; | |
599 | converter->toUnicode = __CFToUnicodeCheapEightBitWrapper; | |
600 | converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper; | |
601 | converter->_toBytes = definition->toBytes; | |
602 | converter->_toUnicode = definition->toUnicode; | |
603 | converter->maxLen = 1; | |
604 | break; | |
605 | ||
606 | case kCFStringEncodingConverterStandardEightBit: | |
607 | converter->toBytes = __CFToBytesStandardEightBitWrapper; | |
608 | converter->toUnicode = __CFToUnicodeStandardEightBitWrapper; | |
609 | converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper; | |
610 | converter->_toBytes = definition->toBytes; | |
611 | converter->_toUnicode = definition->toUnicode; | |
612 | converter->maxLen = definition->maxDecomposedCharLen; | |
613 | break; | |
614 | ||
615 | case kCFStringEncodingConverterCheapMultiByte: | |
616 | converter->toBytes = __CFToBytesCheapMultiByteWrapper; | |
617 | converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper; | |
618 | converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper; | |
619 | converter->_toBytes = definition->toBytes; | |
620 | converter->_toUnicode = definition->toUnicode; | |
621 | converter->maxLen = definition->maxBytesPerChar; | |
622 | break; | |
623 | ||
624 | case kCFStringEncodingConverterPlatformSpecific: | |
625 | converter->toBytes = NULL; | |
626 | converter->toUnicode = NULL; | |
627 | converter->toCanonicalUnicode = NULL; | |
628 | converter->_toBytes = NULL; | |
629 | converter->_toUnicode = NULL; | |
630 | converter->maxLen = 0; | |
631 | converter->toBytesLen = NULL; | |
632 | converter->toUnicodeLen = NULL; | |
633 | converter->toBytesFallback = NULL; | |
634 | converter->toUnicodeFallback = NULL; | |
635 | converter->toBytesPrecompose = NULL; | |
636 | converter->isValidCombiningChar = NULL; | |
637 | return converter; | |
638 | ||
639 | default: // Shouln't be here | |
640 | return NULL; | |
641 | } | |
642 | ||
bd5b749c A |
643 | converter->toBytesLen = (definition->toBytesLen ? definition->toBytesLen : (CFStringEncodingToBytesLenProc)(uintptr_t)definition->maxBytesPerChar); |
644 | converter->toUnicodeLen = (definition->toUnicodeLen ? definition->toUnicodeLen : (CFStringEncodingToUnicodeLenProc)(uintptr_t)definition->maxDecomposedCharLen); | |
9ce05555 A |
645 | converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc); |
646 | converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc); | |
647 | converter->toBytesPrecompose = (definition->toBytesPrecompose ? definition->toBytesPrecompose : NULL); | |
648 | converter->isValidCombiningChar = (definition->isValidCombiningChar ? definition->isValidCombiningChar : NULL); | |
649 | ||
650 | return converter; | |
651 | } | |
652 | ||
653 | CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(_CFConverterEntry *entry) { | |
654 | if (!entry) return NULL; | |
655 | ||
656 | switch (entry->encoding) { | |
657 | case kCFStringEncodingASCII: | |
658 | return &__CFConverterASCII; | |
659 | ||
660 | case kCFStringEncodingISOLatin1: | |
661 | return &__CFConverterISOLatin1; | |
662 | ||
663 | case kCFStringEncodingMacRoman: | |
664 | return &__CFConverterMacRoman; | |
665 | ||
666 | case kCFStringEncodingWindowsLatin1: | |
667 | return &__CFConverterWinLatin1; | |
668 | ||
669 | case kCFStringEncodingNextStepLatin: | |
670 | return &__CFConverterNextStepLatin; | |
671 | ||
672 | case kCFStringEncodingUTF8: | |
673 | return &__CFConverterUTF8; | |
674 | ||
675 | default: | |
bd5b749c | 676 | return NULL; |
9ce05555 A |
677 | } |
678 | } | |
679 | ||
bd5b749c | 680 | static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) { |
9ce05555 A |
681 | _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); |
682 | ||
683 | if (!entry) return NULL; | |
684 | ||
685 | if (!entry->converter) { | |
686 | const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(entry); | |
687 | ||
688 | if (definition) { | |
689 | entry->converter = __CFEncodingConverterFromDefinition(definition); | |
690 | entry->toBytesFallback = definition->toBytesFallback; | |
691 | entry->toUnicodeFallback = definition->toUnicodeFallback; | |
692 | } | |
693 | } | |
694 | ||
695 | return (_CFEncodingConverter *)entry->converter; | |
696 | } | |
697 | ||
698 | /* Public API | |
699 | */ | |
bd5b749c | 700 | uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
9ce05555 A |
701 | if (encoding == kCFStringEncodingUTF8) { |
702 | static CFStringEncodingToBytesProc __CFToUTF8 = NULL; | |
bd5b749c A |
703 | CFIndex convertedCharLen; |
704 | CFIndex usedLen; | |
9ce05555 A |
705 | |
706 | ||
707 | if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) { | |
708 | (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false)); | |
709 | } else { | |
710 | if (!__CFToUTF8) { | |
711 | const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8); | |
712 | __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes; | |
713 | } | |
bd5b749c | 714 | convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen); |
9ce05555 A |
715 | } |
716 | if (usedCharLen) *usedCharLen = convertedCharLen; | |
717 | if (usedByteLen) *usedByteLen = usedLen; | |
718 | ||
719 | if (convertedCharLen == numChars) { | |
720 | return kCFStringEncodingConversionSuccess; | |
721 | } else if (maxByteLen && (maxByteLen == usedLen)) { | |
722 | return kCFStringEncodingInsufficientOutputBufferLength; | |
723 | } else { | |
724 | return kCFStringEncodingInvalidInputStream; | |
725 | } | |
726 | } else { | |
727 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); | |
bd5b749c A |
728 | CFIndex usedLen = 0; |
729 | CFIndex localUsedByteLen; | |
730 | CFIndex theUsedByteLen = 0; | |
731 | uint32_t theResult = kCFStringEncodingConversionSuccess; | |
9ce05555 A |
732 | CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL; |
733 | CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL; | |
734 | ||
735 | if (!converter) return kCFStringEncodingConverterUnavailable; | |
736 | ||
737 | if (flags & kCFStringEncodingSubstituteCombinings) { | |
738 | if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->isValidCombiningChar; | |
739 | } else { | |
740 | isValidCombiningChar = converter->isValidCombiningChar; | |
741 | if (!(flags & kCFStringEncodingIgnoreCombinings)) { | |
742 | toBytesPrecompose = converter->toBytesPrecompose; | |
743 | flags |= kCFStringEncodingComposeCombinings; | |
744 | } | |
745 | } | |
746 | ||
747 | ||
748 | while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) { | |
749 | if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) { | |
bd5b749c | 750 | CFIndex dummy; |
9ce05555 A |
751 | |
752 | if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) { | |
753 | if (toBytesPrecompose) { | |
bd5b749c | 754 | CFIndex localUsedLen = usedLen; |
9ce05555 A |
755 | |
756 | while (isValidCombiningChar(characters[--usedLen])); | |
757 | theUsedByteLen += localUsedByteLen; | |
758 | if (converter->maxLen > 1) { | |
759 | TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen); | |
760 | theUsedByteLen -= localUsedByteLen; | |
761 | } else { | |
762 | theUsedByteLen--; | |
763 | } | |
764 | if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) { | |
765 | usedLen += localUsedLen; | |
766 | if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining | |
767 | theUsedByteLen += localUsedByteLen; | |
768 | theResult = kCFStringEncodingInvalidInputStream; | |
769 | break; | |
770 | } | |
771 | } else if (flags & kCFStringEncodingAllowLossyConversion) { | |
772 | uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); | |
773 | ||
774 | if (lossyByte) { | |
775 | while (isValidCombiningChar(characters[++usedLen])); | |
776 | localUsedByteLen = 1; | |
777 | if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; | |
778 | } else { | |
779 | ++usedLen; | |
780 | usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); | |
781 | } | |
782 | } else { | |
783 | theResult = kCFStringEncodingInvalidInputStream; | |
784 | break; | |
785 | } | |
786 | } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up | |
787 | theUsedByteLen += localUsedByteLen; | |
788 | theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
789 | break; | |
790 | } else if (flags & kCFStringEncodingIgnoreCombinings) { | |
791 | while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen])); | |
792 | } else { | |
793 | uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); | |
794 | ||
795 | theUsedByteLen += localUsedByteLen; | |
796 | if (lossyByte) { | |
797 | ++usedLen; | |
798 | localUsedByteLen = 1; | |
799 | if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; | |
800 | } else { | |
801 | usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); | |
802 | } | |
803 | } | |
804 | } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up | |
805 | theUsedByteLen += localUsedByteLen; | |
806 | ||
807 | if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { | |
bd5b749c | 808 | CFIndex localUsedLen; |
9ce05555 A |
809 | |
810 | localUsedByteLen = 0; | |
811 | while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen; | |
812 | } | |
813 | if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
814 | break; | |
815 | } else if (flags & kCFStringEncodingAllowLossyConversion) { | |
816 | uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); | |
817 | ||
818 | theUsedByteLen += localUsedByteLen; | |
819 | if (lossyByte) { | |
820 | ++usedLen; | |
821 | localUsedByteLen = 1; | |
822 | if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; | |
823 | } else { | |
824 | usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); | |
825 | } | |
826 | } else { | |
827 | theUsedByteLen += localUsedByteLen; | |
828 | theResult = kCFStringEncodingInvalidInputStream; | |
829 | break; | |
830 | } | |
831 | } | |
832 | theUsedByteLen += localUsedByteLen; | |
833 | } | |
834 | ||
835 | if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) { | |
836 | if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { | |
bd5b749c | 837 | CFIndex localUsedLen; |
9ce05555 A |
838 | |
839 | localUsedByteLen = 0; | |
840 | while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen; | |
841 | } | |
842 | if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
843 | } | |
844 | if (usedByteLen) *usedByteLen = theUsedByteLen; | |
845 | if (usedCharLen) *usedCharLen = usedLen; | |
846 | ||
847 | return theResult; | |
848 | } | |
849 | } | |
850 | ||
bd5b749c | 851 | uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
9ce05555 | 852 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); |
bd5b749c A |
853 | CFIndex usedLen = 0; |
854 | CFIndex theUsedCharLen = 0; | |
855 | CFIndex localUsedCharLen; | |
856 | uint32_t theResult = kCFStringEncodingConversionSuccess; | |
9ce05555 A |
857 | |
858 | if (!converter) return kCFStringEncodingConverterUnavailable; | |
859 | ||
860 | ||
861 | while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
862 | if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) { | |
bd5b749c | 863 | CFIndex tempUsedCharLen; |
9ce05555 | 864 | |
bd5b749c | 865 | if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up |
9ce05555 A |
866 | theUsedCharLen += localUsedCharLen; |
867 | theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
868 | break; | |
869 | } else if (flags & kCFStringEncodingAllowLossyConversion) { | |
870 | theUsedCharLen += localUsedCharLen; | |
871 | usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen); | |
872 | } else { | |
873 | theUsedCharLen += localUsedCharLen; | |
874 | theResult = kCFStringEncodingInvalidInputStream; | |
875 | break; | |
876 | } | |
877 | } | |
878 | theUsedCharLen += localUsedCharLen; | |
879 | } | |
880 | ||
881 | if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) { | |
882 | theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
883 | } | |
884 | if (usedCharLen) *usedCharLen = theUsedCharLen; | |
885 | if (usedByteLen) *usedByteLen = usedLen; | |
886 | ||
887 | return theResult; | |
888 | } | |
889 | ||
bd5b749c | 890 | __private_extern__ bool CFStringEncodingIsValidEncoding(uint32_t encoding) { |
9ce05555 A |
891 | return (CFStringEncodingGetConverter(encoding) ? true : false); |
892 | } | |
893 | ||
bd5b749c | 894 | __private_extern__ const char *CFStringEncodingName(uint32_t encoding) { |
9ce05555 A |
895 | _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); |
896 | if (entry) return entry->encodingName; | |
897 | return NULL; | |
898 | } | |
899 | ||
bd5b749c | 900 | __private_extern__ const char **CFStringEncodingCanonicalCharsetNames(uint32_t encoding) { |
9ce05555 A |
901 | _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); |
902 | if (entry) return entry->ianaNames; | |
903 | return NULL; | |
904 | } | |
905 | ||
bd5b749c | 906 | __private_extern__ uint32_t CFStringEncodingGetScriptCodeForEncoding(CFStringEncoding encoding) { |
9ce05555 A |
907 | _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); |
908 | ||
d8925383 | 909 | return (entry ? entry->scriptCode : ((encoding & 0x0FFF) == kCFStringEncodingUnicode ? kCFStringEncodingUnicode : (encoding < 0xFF ? encoding : kCFStringEncodingInvalidId))); |
9ce05555 A |
910 | } |
911 | ||
bd5b749c | 912 | __private_extern__ CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) { |
9ce05555 A |
913 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); |
914 | ||
915 | if (converter) { | |
bd5b749c | 916 | uintptr_t switchVal = (uintptr_t)(converter->toUnicodeLen); |
9ce05555 | 917 | |
bd5b749c | 918 | if (switchVal < 0xFFFF) { |
9ce05555 | 919 | return switchVal * numBytes; |
bd5b749c | 920 | } else { |
9ce05555 | 921 | return converter->toUnicodeLen(flags, bytes, numBytes); |
bd5b749c | 922 | } |
9ce05555 A |
923 | } |
924 | ||
925 | return 0; | |
926 | } | |
927 | ||
bd5b749c | 928 | __private_extern__ CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) { |
9ce05555 A |
929 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); |
930 | ||
931 | if (converter) { | |
bd5b749c | 932 | uintptr_t switchVal = (uintptr_t)(converter->toBytesLen); |
9ce05555 | 933 | |
bd5b749c | 934 | if (switchVal < 0xFFFF) { |
9ce05555 | 935 | return switchVal * numChars; |
bd5b749c | 936 | } else { |
9ce05555 | 937 | return converter->toBytesLen(flags, characters, numChars); |
bd5b749c | 938 | } |
9ce05555 A |
939 | } |
940 | ||
941 | return 0; | |
942 | } | |
943 | ||
bd5b749c | 944 | __private_extern__ void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) { |
9ce05555 A |
945 | _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); |
946 | ||
947 | if (entry && __CFGetConverter(encoding)) { | |
948 | ((_CFEncodingConverter*)entry->converter)->toBytesFallback = (toBytes ? toBytes : entry->toBytesFallback); | |
949 | ((_CFEncodingConverter*)entry->converter)->toUnicodeFallback = (toUnicode ? toUnicode : entry->toUnicodeFallback); | |
950 | } | |
951 | } | |
952 | ||
bd5b749c | 953 | __private_extern__ const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) { |
9ce05555 A |
954 | return __CFStringEncodingConverterGetDefinition(__CFStringEncodingConverterGetEntry(encoding)); |
955 | } | |
956 | ||
bd5b749c | 957 | static const uint32_t __CFBuiltinEncodings[] = { |
9ce05555 A |
958 | kCFStringEncodingMacRoman, |
959 | kCFStringEncodingWindowsLatin1, | |
960 | kCFStringEncodingISOLatin1, | |
961 | kCFStringEncodingNextStepLatin, | |
962 | kCFStringEncodingASCII, | |
963 | kCFStringEncodingUTF8, | |
d8925383 | 964 | /* These seven are available only in CFString-level */ |
9ce05555 | 965 | kCFStringEncodingNonLossyASCII, |
d8925383 A |
966 | |
967 | kCFStringEncodingUTF16, | |
968 | kCFStringEncodingUTF16BE, | |
969 | kCFStringEncodingUTF16LE, | |
970 | ||
971 | kCFStringEncodingUTF32, | |
972 | kCFStringEncodingUTF32BE, | |
973 | kCFStringEncodingUTF32LE, | |
974 | ||
9ce05555 A |
975 | kCFStringEncodingInvalidId, |
976 | }; | |
977 | ||
978 | ||
bd5b749c | 979 | __private_extern__ const uint32_t *CFStringEncodingListOfAvailableEncodings(void) { |
9ce05555 A |
980 | return __CFBuiltinEncodings; |
981 | } | |
982 | ||
bd5b749c A |
983 | |
984 | #undef TO_BYTE | |
985 | #undef TO_UNICODE | |
986 | #undef ASCIINewLine | |
987 | #undef kSurrogateHighStart | |
988 | #undef kSurrogateHighEnd | |
989 | #undef kSurrogateLowStart | |
990 | #undef kSurrogateLowEnd | |
991 | #undef TO_BYTE_FALLBACK | |
992 | #undef TO_UNICODE_FALLBACK | |
993 | #undef EXTRA_BASE | |
994 | #undef NUM_OF_ENTRIES_CYCLE | |
995 |