]>
Commit | Line | Data |
---|---|---|
9ce05555 | 1 | /* |
e29e285d | 2 | * Copyright (c) 2015 Apple Inc. All rights reserved. |
9ce05555 A |
3 | * |
4 | * @APPLE_LICENSE_HEADER_START@ | |
d7384798 | 5 | * |
9ce05555 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
d7384798 | 12 | * |
9ce05555 A |
13 | * The Original Code and all software distributed under the License are |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
d7384798 | 20 | * |
9ce05555 A |
21 | * @APPLE_LICENSE_HEADER_END@ |
22 | */ | |
f64f9b69 | 23 | |
9ce05555 | 24 | /* CFStringEncodingConverter.c |
d7384798 | 25 | Copyright (c) 1998-2014, Apple Inc. All rights reserved. |
9ce05555 A |
26 | Responsibility: Aki Inoue |
27 | */ | |
28 | ||
29 | #include "CFInternal.h" | |
30 | #include <CoreFoundation/CFArray.h> | |
31 | #include <CoreFoundation/CFDictionary.h> | |
cf7d2af9 A |
32 | #include "CFICUConverters.h" |
33 | #include <CoreFoundation/CFUniChar.h> | |
34 | #include <CoreFoundation/CFPriv.h> | |
9ce05555 A |
35 | #include "CFUnicodeDecomposition.h" |
36 | #include "CFStringEncodingConverterExt.h" | |
37 | #include "CFStringEncodingConverterPriv.h" | |
38 | #include <stdlib.h> | |
9ce05555 | 39 | |
cf7d2af9 A |
40 | typedef CFIndex (*_CFToBytesProc)(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen); |
41 | typedef CFIndex (*_CFToUnicodeProc)(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen); | |
42 | ||
43 | typedef struct { | |
44 | const CFStringEncodingConverter *definition; | |
45 | _CFToBytesProc toBytes; | |
46 | _CFToUnicodeProc toUnicode; | |
47 | _CFToUnicodeProc toCanonicalUnicode; | |
48 | CFStringEncodingToBytesFallbackProc toBytesFallback; | |
49 | CFStringEncodingToUnicodeFallbackProc toUnicodeFallback; | |
50 | } _CFEncodingConverter; | |
9ce05555 A |
51 | |
52 | /* Macros | |
53 | */ | |
cf7d2af9 A |
54 | #define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) (conv->toBytes ? conv->toBytes(conv,flags,chars,numChars,bytes,max,used) : ((CFStringEncodingToBytesProc)conv->definition->toBytes)(flags,chars,numChars,bytes,max,used)) |
55 | #define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) (conv->toUnicode ? (flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical) ? conv->toCanonicalUnicode(conv,flags,bytes,numBytes,chars,max,used) : conv->toUnicode(conv,flags,bytes,numBytes,chars,max,used)) : ((CFStringEncodingToUnicodeProc)conv->definition->toUnicode)(flags,bytes,numBytes,chars,max,used)) | |
9ce05555 | 56 | |
9ce05555 A |
57 | #define ASCIINewLine 0x0a |
58 | #define kSurrogateHighStart 0xD800 | |
59 | #define kSurrogateHighEnd 0xDBFF | |
60 | #define kSurrogateLowStart 0xDC00 | |
61 | #define kSurrogateLowEnd 0xDFFF | |
62 | ||
cf7d2af9 A |
63 | static const uint8_t __CFMaximumConvertedLength = 20; |
64 | ||
9ce05555 A |
/* Mapping U+00A0..U+00FF to lossy ASCII.
   Entry i describes U+00A0 + i. Each entry holds up to four replacement
   bytes and is zero padded; an all-zero entry means the character has no
   replacement.
*/
static const struct {
    unsigned char chars[4];
} _toLossyASCIITable[] = {
    {{' ', 0, 0, 0}}, // NO-BREAK SPACE
    {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK
    {{'c', 0, 0, 0}}, // CENT SIGN
    {{'L', 0, 0, 0}}, // POUND SIGN
    {{'$', 0, 0, 0}}, // CURRENCY SIGN
    {{'Y', 0, 0, 0}}, // YEN SIGN
    {{'|', 0, 0, 0}}, // BROKEN BAR
    {{0, 0, 0, 0}}, // SECTION SIGN
    {{0, 0, 0, 0}}, // DIAERESIS
    {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN
    {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR
    {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    {{0, 0, 0, 0}}, // NOT SIGN
    {{'-', 0, 0, 0}}, // SOFT HYPHEN
    {{'(', 'R', ')', 0}}, // REGISTERED SIGN
    {{0, 0, 0, 0}}, // MACRON
    {{0, 0, 0, 0}}, // DEGREE SIGN
    {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN
    {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO
    {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE
    {{0, 0, 0, 0}}, // ACUTE ACCENT
    {{0, 0, 0, 0}}, // MICRO SIGN
    {{0, 0, 0, 0}}, // PILCROW SIGN
    {{0, 0, 0, 0}}, // MIDDLE DOT
    {{0, 0, 0, 0}}, // CEDILLA
    {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE
    {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR
    {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER
    {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF
    {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS
    {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE
    {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE
    {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS
    {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic)
    {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS
    {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS
    {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE
    {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic); upper case to match the capital letter
    {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German)
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE
    {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE
    {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS
    {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic); lower case to match the small letter
    {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS
    {{'/', 0, 0, 0}}, // DIVISION SIGN
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS
    {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE
    {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic)
    {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS
};
167 | ||
bd5b749c A |
168 | CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) { |
169 | const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]); | |
170 | CFIndex numBytes = 0; | |
171 | CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4); | |
9ce05555 A |
172 | |
173 | for (idx = 0;idx < max;idx++) { | |
174 | if (losChars[idx]) { | |
175 | if (maxByteLen) bytes[idx] = losChars[idx]; | |
176 | ++numBytes; | |
177 | } else { | |
178 | break; | |
179 | } | |
180 | } | |
181 | ||
182 | return numBytes; | |
183 | } | |
184 | ||
bd5b749c A |
185 | static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
186 | CFIndex processCharLen = 1, filledBytesLen = 1; | |
187 | uint8_t byte = '?'; | |
188 | ||
9ce05555 | 189 | if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range |
bd5b749c | 190 | byte = (uint8_t)(*characters - 0x80); |
9ce05555 A |
191 | } else if (*characters < 0x100) { |
192 | *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen); | |
193 | return 1; | |
194 | } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) { | |
bd5b749c | 195 | processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1); |
9ce05555 | 196 | } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) { |
bd5b749c | 197 | byte = ' '; |
9ce05555 | 198 | } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) { |
bd5b749c A |
199 | byte = ASCIINewLine; |
200 | } else if (*characters == 0x2026) { // ellipsis | |
201 | if (0 == maxByteLen) { | |
202 | filledBytesLen = 3; | |
203 | } else if (maxByteLen > 2) { | |
204 | memset(bytes, '.', 3); | |
205 | *usedByteLen = 3; | |
206 | return processCharLen; | |
207 | } | |
9ce05555 A |
208 | } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) { |
209 | UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; | |
210 | ||
211 | (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH); | |
212 | if (*decomposed < 0x80) { | |
bd5b749c | 213 | byte = (uint8_t)(*decomposed); |
9ce05555 A |
214 | } else { |
215 | UTF16Char theChar = *decomposed; | |
216 | ||
217 | return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen); | |
218 | } | |
9ce05555 | 219 | } |
bd5b749c A |
220 | |
221 | if (maxByteLen) *bytes = byte; | |
222 | *usedByteLen = filledBytesLen; | |
223 | return processCharLen; | |
9ce05555 A |
224 | } |
225 | ||
bd5b749c | 226 | static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
9ce05555 A |
227 | if (maxCharLen) *characters = (UniChar)'?'; |
228 | *usedCharLen = 1; | |
229 | return 1; | |
230 | } | |
231 | ||
/* Invoke a converter's fallback procs. `conv` is parenthesized so that an
   expression argument (e.g. a cast or a dereference) binds before `->`. */
#define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) ((conv)->toBytesFallback(chars,numChars,bytes,max,used))
#define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) ((conv)->toUnicodeFallback(bytes,numBytes,chars,max,used))

#define EXTRA_BASE (0x0F00)
236 | ||
237 | /* Wrapper funcs for non-standard converters | |
238 | */ | |
bd5b749c A |
239 | static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
240 | CFIndex processedCharLen = 0; | |
241 | CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars); | |
9ce05555 A |
242 | uint8_t byte; |
243 | ||
244 | while (processedCharLen < length) { | |
cf7d2af9 | 245 | if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], &byte)) break; |
9ce05555 A |
246 | |
247 | if (maxByteLen) bytes[processedCharLen] = byte; | |
248 | processedCharLen++; | |
249 | } | |
250 | ||
251 | *usedByteLen = processedCharLen; | |
252 | return processedCharLen; | |
253 | } | |
254 | ||
bd5b749c A |
255 | static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
256 | CFIndex processedByteLen = 0; | |
257 | CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes); | |
9ce05555 A |
258 | UniChar character; |
259 | ||
260 | while (processedByteLen < length) { | |
cf7d2af9 | 261 | if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break; |
9ce05555 A |
262 | |
263 | if (maxCharLen) characters[processedByteLen] = character; | |
264 | processedByteLen++; | |
265 | } | |
266 | ||
267 | *usedCharLen = processedByteLen; | |
268 | return processedByteLen; | |
269 | } | |
270 | ||
bd5b749c A |
271 | static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
272 | CFIndex processedByteLen = 0; | |
273 | CFIndex theUsedCharLen = 0; | |
9ce05555 | 274 | UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; |
bd5b749c | 275 | CFIndex usedLen; |
9ce05555 A |
276 | UniChar character; |
277 | bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); | |
278 | ||
279 | while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
cf7d2af9 | 280 | if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break; |
9ce05555 A |
281 | |
282 | if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { | |
bd5b749c | 283 | CFIndex idx; |
9ce05555 A |
284 | |
285 | usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); | |
286 | *usedCharLen = theUsedCharLen; | |
287 | ||
288 | for (idx = 0;idx < usedLen;idx++) { | |
289 | if (charBuffer[idx] > 0xFFFF) { // Non-BMP | |
290 | if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; | |
291 | theUsedCharLen += 2; | |
292 | if (maxCharLen) { | |
293 | charBuffer[idx] = charBuffer[idx] - 0x10000; | |
bd5b749c A |
294 | *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; |
295 | *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; | |
9ce05555 A |
296 | } |
297 | } else { | |
298 | if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; | |
299 | ++theUsedCharLen; | |
300 | *(characters++) = charBuffer[idx]; | |
301 | } | |
302 | } | |
303 | } else { | |
304 | if (maxCharLen) *(characters++) = character; | |
305 | ++theUsedCharLen; | |
306 | } | |
307 | processedByteLen++; | |
308 | } | |
309 | ||
310 | *usedCharLen = theUsedCharLen; | |
311 | return processedByteLen; | |
312 | } | |
313 | ||
bd5b749c A |
314 | static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
315 | CFIndex processedCharLen = 0; | |
9ce05555 | 316 | uint8_t byte; |
bd5b749c | 317 | CFIndex usedLen; |
9ce05555 A |
318 | |
319 | *usedByteLen = 0; | |
320 | ||
321 | while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) { | |
cf7d2af9 | 322 | if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters, numChars, &byte))) break; |
9ce05555 A |
323 | |
324 | if (maxByteLen) bytes[*usedByteLen] = byte; | |
325 | (*usedByteLen)++; | |
326 | characters += usedLen; | |
327 | numChars -= usedLen; | |
328 | processedCharLen += usedLen; | |
329 | } | |
330 | ||
331 | return processedCharLen; | |
332 | } | |
333 | ||
bd5b749c A |
334 | static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
335 | CFIndex processedByteLen = 0; | |
cf7d2af9 | 336 | UniChar charBuffer[__CFMaximumConvertedLength]; |
bd5b749c | 337 | CFIndex usedLen; |
9ce05555 A |
338 | |
339 | *usedCharLen = 0; | |
340 | ||
341 | while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) { | |
cf7d2af9 | 342 | if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; |
9ce05555 A |
343 | |
344 | if (maxCharLen) { | |
bd5b749c | 345 | CFIndex idx; |
9ce05555 A |
346 | |
347 | if (*usedCharLen + usedLen > maxCharLen) break; | |
348 | ||
349 | for (idx = 0;idx < usedLen;idx++) { | |
350 | characters[*usedCharLen + idx] = charBuffer[idx]; | |
351 | } | |
352 | } | |
353 | *usedCharLen += usedLen; | |
354 | processedByteLen++; | |
355 | } | |
356 | ||
357 | return processedByteLen; | |
358 | } | |
359 | ||
bd5b749c A |
360 | static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
361 | CFIndex processedByteLen = 0; | |
cf7d2af9 | 362 | UniChar charBuffer[__CFMaximumConvertedLength]; |
9ce05555 | 363 | UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH]; |
bd5b749c A |
364 | CFIndex usedLen; |
365 | CFIndex decompedLen; | |
366 | CFIndex idx, decompIndex; | |
9ce05555 | 367 | bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); |
bd5b749c | 368 | CFIndex theUsedCharLen = 0; |
9ce05555 A |
369 | |
370 | while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
cf7d2af9 | 371 | if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; |
9ce05555 A |
372 | |
373 | for (idx = 0;idx < usedLen;idx++) { | |
374 | if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) { | |
375 | decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH); | |
376 | *usedCharLen = theUsedCharLen; | |
377 | ||
378 | for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) { | |
379 | if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP | |
380 | if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; | |
381 | theUsedCharLen += 2; | |
382 | if (maxCharLen) { | |
383 | charBuffer[idx] = charBuffer[idx] - 0x10000; | |
384 | *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL; | |
385 | *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL; | |
386 | } | |
387 | } else { | |
388 | if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; | |
389 | ++theUsedCharLen; | |
390 | *(characters++) = charBuffer[idx]; | |
391 | } | |
392 | } | |
393 | } else { | |
394 | if (maxCharLen) *(characters++) = charBuffer[idx]; | |
395 | ++theUsedCharLen; | |
396 | } | |
397 | } | |
398 | processedByteLen++; | |
399 | } | |
400 | ||
401 | *usedCharLen = theUsedCharLen; | |
402 | return processedByteLen; | |
403 | } | |
404 | ||
bd5b749c A |
405 | static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
406 | CFIndex processedCharLen = 0; | |
cf7d2af9 | 407 | uint8_t byteBuffer[__CFMaximumConvertedLength]; |
bd5b749c | 408 | CFIndex usedLen; |
9ce05555 A |
409 | |
410 | *usedByteLen = 0; | |
411 | ||
412 | while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) { | |
cf7d2af9 | 413 | if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], byteBuffer))) break; |
9ce05555 A |
414 | |
415 | if (maxByteLen) { | |
bd5b749c | 416 | CFIndex idx; |
9ce05555 A |
417 | |
418 | if (*usedByteLen + usedLen > maxByteLen) break; | |
419 | ||
420 | for (idx = 0;idx <usedLen;idx++) { | |
421 | bytes[*usedByteLen + idx] = byteBuffer[idx]; | |
422 | } | |
423 | } | |
424 | ||
425 | *usedByteLen += usedLen; | |
426 | processedCharLen++; | |
427 | } | |
428 | ||
429 | return processedCharLen; | |
430 | } | |
431 | ||
bd5b749c A |
432 | static CFIndex __CFToUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
433 | CFIndex processedByteLen = 0; | |
9ce05555 | 434 | UniChar character; |
bd5b749c | 435 | CFIndex usedLen; |
9ce05555 A |
436 | |
437 | *usedCharLen = 0; | |
438 | ||
439 | while (numBytes && (!maxCharLen || (*usedCharLen < maxCharLen))) { | |
cf7d2af9 | 440 | if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break; |
9ce05555 A |
441 | |
442 | if (maxCharLen) *(characters++) = character; | |
443 | (*usedCharLen)++; | |
444 | processedByteLen += usedLen; | |
445 | bytes += usedLen; | |
446 | numBytes -= usedLen; | |
447 | } | |
448 | ||
449 | return processedByteLen; | |
450 | } | |
451 | ||
bd5b749c A |
452 | static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
453 | CFIndex processedByteLen = 0; | |
9ce05555 A |
454 | UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; |
455 | UniChar character; | |
bd5b749c A |
456 | CFIndex usedLen; |
457 | CFIndex decomposedLen; | |
458 | CFIndex theUsedCharLen = 0; | |
9ce05555 A |
459 | bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); |
460 | ||
461 | while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
cf7d2af9 | 462 | if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break; |
9ce05555 A |
463 | |
464 | if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { | |
bd5b749c | 465 | CFIndex idx; |
9ce05555 A |
466 | |
467 | decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); | |
468 | *usedCharLen = theUsedCharLen; | |
469 | ||
470 | for (idx = 0;idx < decomposedLen;idx++) { | |
471 | if (charBuffer[idx] > 0xFFFF) { // Non-BMP | |
472 | if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; | |
473 | theUsedCharLen += 2; | |
474 | if (maxCharLen) { | |
475 | charBuffer[idx] = charBuffer[idx] - 0x10000; | |
bd5b749c A |
476 | *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; |
477 | *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; | |
9ce05555 A |
478 | } |
479 | } else { | |
480 | if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; | |
481 | ++theUsedCharLen; | |
482 | *(characters++) = charBuffer[idx]; | |
483 | } | |
484 | } | |
485 | } else { | |
486 | if (maxCharLen) *(characters++) = character; | |
487 | ++theUsedCharLen; | |
488 | } | |
489 | ||
490 | processedByteLen += usedLen; | |
491 | bytes += usedLen; | |
492 | numBytes -= usedLen; | |
493 | } | |
494 | *usedCharLen = theUsedCharLen; | |
495 | return processedByteLen; | |
496 | } | |
497 | ||
498 | /* static functions | |
499 | */ | |
cf7d2af9 | 500 | CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition, CFStringEncoding encoding) { |
9ce05555 | 501 | #define NUM_OF_ENTRIES_CYCLE (10) |
bd5b749c A |
502 | static uint32_t _currentIndex = 0; |
503 | static uint32_t _allocatedSize = 0; | |
9ce05555 A |
504 | static _CFEncodingConverter *_allocatedEntries = NULL; |
505 | _CFEncodingConverter *converter; | |
506 | ||
507 | ||
9ce05555 A |
508 | if ((_currentIndex + 1) >= _allocatedSize) { |
509 | _currentIndex = 0; | |
510 | _allocatedSize = 0; | |
511 | _allocatedEntries = NULL; | |
512 | } | |
513 | if (_allocatedEntries == NULL) { // Not allocated yet | |
bd5b749c | 514 | _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0); |
9ce05555 A |
515 | _allocatedSize = NUM_OF_ENTRIES_CYCLE; |
516 | converter = &(_allocatedEntries[_currentIndex]); | |
517 | } else { | |
518 | converter = &(_allocatedEntries[++_currentIndex]); | |
519 | } | |
cf7d2af9 A |
520 | |
521 | memset(converter, 0, sizeof(_CFEncodingConverter)); | |
522 | ||
523 | converter->definition = definition; | |
9ce05555 A |
524 | |
525 | switch (definition->encodingClass) { | |
526 | case kCFStringEncodingConverterStandard: | |
cf7d2af9 A |
527 | converter->toBytes = NULL; |
528 | converter->toUnicode = NULL; | |
529 | converter->toCanonicalUnicode = NULL; | |
9ce05555 A |
530 | break; |
531 | ||
532 | case kCFStringEncodingConverterCheapEightBit: | |
533 | converter->toBytes = __CFToBytesCheapEightBitWrapper; | |
534 | converter->toUnicode = __CFToUnicodeCheapEightBitWrapper; | |
535 | converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper; | |
9ce05555 A |
536 | break; |
537 | ||
538 | case kCFStringEncodingConverterStandardEightBit: | |
539 | converter->toBytes = __CFToBytesStandardEightBitWrapper; | |
540 | converter->toUnicode = __CFToUnicodeStandardEightBitWrapper; | |
541 | converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper; | |
9ce05555 A |
542 | break; |
543 | ||
544 | case kCFStringEncodingConverterCheapMultiByte: | |
545 | converter->toBytes = __CFToBytesCheapMultiByteWrapper; | |
546 | converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper; | |
547 | converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper; | |
cf7d2af9 A |
548 | break; |
549 | ||
856091c5 | 550 | #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX |
cf7d2af9 A |
551 | case kCFStringEncodingConverterICU: |
552 | converter->toBytes = (_CFToBytesProc)__CFStringEncodingGetICUName(encoding); | |
9ce05555 | 553 | break; |
856091c5 | 554 | #endif |
9ce05555 A |
555 | |
556 | case kCFStringEncodingConverterPlatformSpecific: | |
cf7d2af9 | 557 | break; |
9ce05555 A |
558 | |
559 | default: // Shouln't be here | |
560 | return NULL; | |
561 | } | |
562 | ||
9ce05555 A |
563 | converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc); |
564 | converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc); | |
9ce05555 A |
565 | |
566 | return converter; | |
567 | } | |
568 | ||
cf7d2af9 A |
569 | CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(CFStringEncoding encoding) { |
570 | switch (encoding) { | |
571 | case kCFStringEncodingUTF8: | |
572 | return &__CFConverterUTF8; | |
573 | ||
574 | case kCFStringEncodingMacRoman: | |
575 | return &__CFConverterMacRoman; | |
576 | ||
577 | case kCFStringEncodingWindowsLatin1: | |
578 | return &__CFConverterWinLatin1; | |
579 | ||
9ce05555 A |
580 | case kCFStringEncodingASCII: |
581 | return &__CFConverterASCII; | |
582 | ||
583 | case kCFStringEncodingISOLatin1: | |
584 | return &__CFConverterISOLatin1; | |
585 | ||
9ce05555 A |
586 | |
587 | case kCFStringEncodingNextStepLatin: | |
588 | return &__CFConverterNextStepLatin; | |
589 | ||
9ce05555 A |
590 | |
591 | default: | |
cf7d2af9 | 592 | return __CFStringEncodingGetExternalConverter(encoding); |
9ce05555 A |
593 | } |
594 | } | |
595 | ||
bd5b749c | 596 | static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) { |
cf7d2af9 A |
597 | const _CFEncodingConverter *converter = NULL; |
598 | const _CFEncodingConverter **commonConverterSlot = NULL; | |
599 | static _CFEncodingConverter *commonConverters[3] = {NULL, NULL, NULL}; // UTF8, MacRoman/WinLatin1, and the default encoding* | |
600 | static CFMutableDictionaryRef mappingTable = NULL; | |
d7384798 | 601 | static OSSpinLock lock = OS_SPINLOCK_INIT; |
cf7d2af9 A |
602 | |
603 | switch (encoding) { | |
604 | case kCFStringEncodingUTF8: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[0]); break; | |
605 | ||
606 | /* the swith here should avoid possible bootstrap issues in the default: case below when invoked from CFStringGetSystemEncoding() */ | |
856091c5 | 607 | #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX |
cf7d2af9 A |
608 | case kCFStringEncodingMacRoman: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[1]); break; |
609 | #elif DEPLOYMENT_TARGET_WINDOWS | |
610 | case kCFStringEncodingWindowsLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break; | |
611 | #else | |
612 | #warning This case must match __defaultEncoding value defined in CFString.c | |
613 | case kCFStringEncodingISOLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break; | |
856091c5 | 614 | #endif |
cf7d2af9 A |
615 | |
616 | default: if (CFStringGetSystemEncoding() == encoding) commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[2]); break; | |
617 | } | |
618 | ||
d7384798 | 619 | OSSpinLockLock(&lock); |
cf7d2af9 | 620 | converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot); |
d7384798 | 621 | OSSpinLockUnlock(&lock); |
9ce05555 | 622 | |
cf7d2af9 A |
623 | if (NULL == converter) { |
624 | const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(encoding); | |
9ce05555 | 625 | |
cf7d2af9 | 626 | if (NULL != definition) { |
d7384798 | 627 | OSSpinLockLock(&lock); |
cf7d2af9 | 628 | converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot); |
9ce05555 | 629 | |
cf7d2af9 A |
630 | if (NULL == converter) { |
631 | converter = __CFEncodingConverterFromDefinition(definition, encoding); | |
632 | ||
633 | if (NULL == commonConverterSlot) { | |
634 | if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, NULL); | |
635 | ||
636 | CFDictionarySetValue(mappingTable, (const void *)(uintptr_t)encoding, converter); | |
637 | } else { | |
638 | *commonConverterSlot = converter; | |
639 | } | |
640 | } | |
d7384798 | 641 | OSSpinLockUnlock(&lock); |
9ce05555 A |
642 | } |
643 | } | |
644 | ||
cf7d2af9 | 645 | return converter; |
9ce05555 A |
646 | } |
647 | ||
648 | /* Public API | |
649 | */ | |
bd5b749c | 650 | uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
9ce05555 A |
651 | if (encoding == kCFStringEncodingUTF8) { |
652 | static CFStringEncodingToBytesProc __CFToUTF8 = NULL; | |
bd5b749c A |
653 | CFIndex convertedCharLen; |
654 | CFIndex usedLen; | |
9ce05555 A |
655 | |
656 | ||
657 | if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) { | |
658 | (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false)); | |
659 | } else { | |
660 | if (!__CFToUTF8) { | |
661 | const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8); | |
662 | __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes; | |
663 | } | |
bd5b749c | 664 | convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen); |
9ce05555 A |
665 | } |
666 | if (usedCharLen) *usedCharLen = convertedCharLen; | |
667 | if (usedByteLen) *usedByteLen = usedLen; | |
668 | ||
669 | if (convertedCharLen == numChars) { | |
670 | return kCFStringEncodingConversionSuccess; | |
cf7d2af9 A |
671 | } else if ((maxByteLen > 0) && ((maxByteLen - usedLen) < 10)) { // could be filled outbuf |
672 | UTF16Char character = characters[convertedCharLen]; | |
673 | ||
674 | if (((character >= kSurrogateLowStart) && (character <= kSurrogateLowEnd)) || ((character >= kSurrogateHighStart) && (character <= kSurrogateHighEnd) && ((1 == (numChars - convertedCharLen)) || (characters[convertedCharLen + 1] < kSurrogateLowStart) || (characters[convertedCharLen + 1] > kSurrogateLowEnd)))) return kCFStringEncodingInvalidInputStream; | |
675 | ||
9ce05555 A |
676 | return kCFStringEncodingInsufficientOutputBufferLength; |
677 | } else { | |
678 | return kCFStringEncodingInvalidInputStream; | |
679 | } | |
680 | } else { | |
681 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); | |
bd5b749c A |
682 | CFIndex usedLen = 0; |
683 | CFIndex localUsedByteLen; | |
684 | CFIndex theUsedByteLen = 0; | |
685 | uint32_t theResult = kCFStringEncodingConversionSuccess; | |
9ce05555 A |
686 | CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL; |
687 | CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL; | |
688 | ||
689 | if (!converter) return kCFStringEncodingConverterUnavailable; | |
690 | ||
691 | if (flags & kCFStringEncodingSubstituteCombinings) { | |
cf7d2af9 | 692 | if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->definition->isValidCombiningChar; |
9ce05555 | 693 | } else { |
cf7d2af9 | 694 | isValidCombiningChar = converter->definition->isValidCombiningChar; |
9ce05555 | 695 | if (!(flags & kCFStringEncodingIgnoreCombinings)) { |
cf7d2af9 | 696 | toBytesPrecompose = converter->definition->toBytesPrecompose; |
9ce05555 A |
697 | flags |= kCFStringEncodingComposeCombinings; |
698 | } | |
699 | } | |
700 | ||
856091c5 | 701 | #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX |
cf7d2af9 | 702 | if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToBytes((const char *)converter->toBytes, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen); |
856091c5 | 703 | #endif |
cf7d2af9 A |
704 | |
705 | /* Platform converter */ | |
706 | if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformUnicodeToBytes(encoding, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen); | |
9ce05555 A |
707 | |
708 | while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) { | |
709 | if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) { | |
bd5b749c | 710 | CFIndex dummy; |
9ce05555 A |
711 | |
712 | if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) { | |
713 | if (toBytesPrecompose) { | |
bd5b749c | 714 | CFIndex localUsedLen = usedLen; |
9ce05555 A |
715 | |
716 | while (isValidCombiningChar(characters[--usedLen])); | |
717 | theUsedByteLen += localUsedByteLen; | |
cf7d2af9 | 718 | if (converter->definition->maxBytesPerChar > 1) { |
9ce05555 A |
719 | TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen); |
720 | theUsedByteLen -= localUsedByteLen; | |
721 | } else { | |
722 | theUsedByteLen--; | |
723 | } | |
724 | if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) { | |
725 | usedLen += localUsedLen; | |
726 | if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining | |
727 | theUsedByteLen += localUsedByteLen; | |
728 | theResult = kCFStringEncodingInvalidInputStream; | |
729 | break; | |
730 | } | |
731 | } else if (flags & kCFStringEncodingAllowLossyConversion) { | |
732 | uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); | |
733 | ||
734 | if (lossyByte) { | |
cf7d2af9 | 735 | while (isValidCombiningChar(characters[++usedLen])); |
9ce05555 A |
736 | localUsedByteLen = 1; |
737 | if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; | |
738 | } else { | |
739 | ++usedLen; | |
740 | usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); | |
741 | } | |
742 | } else { | |
743 | theResult = kCFStringEncodingInvalidInputStream; | |
744 | break; | |
745 | } | |
746 | } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up | |
747 | theUsedByteLen += localUsedByteLen; | |
748 | theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
749 | break; | |
750 | } else if (flags & kCFStringEncodingIgnoreCombinings) { | |
751 | while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen])); | |
752 | } else { | |
753 | uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); | |
754 | ||
755 | theUsedByteLen += localUsedByteLen; | |
756 | if (lossyByte) { | |
757 | ++usedLen; | |
758 | localUsedByteLen = 1; | |
759 | if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; | |
760 | } else { | |
761 | usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); | |
762 | } | |
763 | } | |
764 | } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up | |
765 | theUsedByteLen += localUsedByteLen; | |
766 | ||
767 | if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { | |
bd5b749c | 768 | CFIndex localUsedLen; |
9ce05555 A |
769 | |
770 | localUsedByteLen = 0; | |
db04bbf9 A |
771 | // after the buffer is full, we still try out all the rest of the characters |
772 | // if all characters cannot be converted, we mark the result as insufficient output buffer | |
773 | while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) { | |
774 | if (localUsedByteLen == 0) { | |
775 | usedLen += localUsedLen; | |
776 | } | |
777 | } | |
9ce05555 A |
778 | } |
779 | if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
780 | break; | |
781 | } else if (flags & kCFStringEncodingAllowLossyConversion) { | |
782 | uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); | |
783 | ||
784 | theUsedByteLen += localUsedByteLen; | |
785 | if (lossyByte) { | |
786 | ++usedLen; | |
787 | localUsedByteLen = 1; | |
788 | if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; | |
789 | } else { | |
790 | usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); | |
791 | } | |
792 | } else { | |
793 | theUsedByteLen += localUsedByteLen; | |
794 | theResult = kCFStringEncodingInvalidInputStream; | |
795 | break; | |
796 | } | |
797 | } | |
798 | theUsedByteLen += localUsedByteLen; | |
799 | } | |
800 | ||
801 | if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) { | |
802 | if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { | |
bd5b749c | 803 | CFIndex localUsedLen; |
9ce05555 A |
804 | |
805 | localUsedByteLen = 0; | |
db04bbf9 A |
806 | while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) { |
807 | if (!localUsedByteLen) { | |
808 | usedLen += localUsedLen; | |
809 | } | |
810 | } | |
9ce05555 A |
811 | } |
812 | if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
813 | } | |
814 | if (usedByteLen) *usedByteLen = theUsedByteLen; | |
815 | if (usedCharLen) *usedCharLen = usedLen; | |
816 | ||
817 | return theResult; | |
818 | } | |
819 | } | |
820 | ||
bd5b749c | 821 | uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
9ce05555 | 822 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); |
bd5b749c A |
823 | CFIndex usedLen = 0; |
824 | CFIndex theUsedCharLen = 0; | |
825 | CFIndex localUsedCharLen; | |
826 | uint32_t theResult = kCFStringEncodingConversionSuccess; | |
9ce05555 A |
827 | |
828 | if (!converter) return kCFStringEncodingConverterUnavailable; | |
829 | ||
856091c5 | 830 | #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX |
cf7d2af9 | 831 | if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToUnicode((const char *)converter->toBytes, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen); |
856091c5 | 832 | #endif |
cf7d2af9 A |
833 | |
834 | /* Platform converter */ | |
835 | if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformBytesToUnicode(encoding, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen); | |
9ce05555 A |
836 | |
837 | while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
838 | if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) { | |
bd5b749c | 839 | CFIndex tempUsedCharLen; |
9ce05555 | 840 | |
bd5b749c | 841 | if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up |
9ce05555 A |
842 | theUsedCharLen += localUsedCharLen; |
843 | theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
844 | break; | |
845 | } else if (flags & kCFStringEncodingAllowLossyConversion) { | |
846 | theUsedCharLen += localUsedCharLen; | |
847 | usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen); | |
848 | } else { | |
849 | theUsedCharLen += localUsedCharLen; | |
850 | theResult = kCFStringEncodingInvalidInputStream; | |
851 | break; | |
852 | } | |
853 | } | |
854 | theUsedCharLen += localUsedCharLen; | |
855 | } | |
856 | ||
857 | if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) { | |
858 | theResult = kCFStringEncodingInsufficientOutputBufferLength; | |
859 | } | |
860 | if (usedCharLen) *usedCharLen = theUsedCharLen; | |
861 | if (usedByteLen) *usedByteLen = usedLen; | |
862 | ||
863 | return theResult; | |
864 | } | |
865 | ||
a48904a4 | 866 | CF_PRIVATE bool CFStringEncodingIsValidEncoding(uint32_t encoding) { |
9ce05555 A |
867 | return (CFStringEncodingGetConverter(encoding) ? true : false); |
868 | } | |
869 | ||
a48904a4 | 870 | CF_PRIVATE CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) { |
cf7d2af9 | 871 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); |
9ce05555 | 872 | |
cf7d2af9 | 873 | if (converter) { |
856091c5 | 874 | #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX |
cf7d2af9 | 875 | if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUCharLength((const char *)converter->toBytes, flags, bytes, numBytes); |
856091c5 A |
876 | #endif |
877 | ||
cf7d2af9 | 878 | if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformCharLengthForBytes(encoding, flags, bytes, numBytes); |
9ce05555 | 879 | |
cf7d2af9 | 880 | if (1 == converter->definition->maxBytesPerChar) return numBytes; |
9ce05555 | 881 | |
cf7d2af9 A |
882 | if (NULL == converter->definition->toUnicodeLen) { |
883 | CFIndex usedByteLen = 0; | |
884 | CFIndex totalLength = 0; | |
885 | CFIndex usedCharLen; | |
9ce05555 | 886 | |
cf7d2af9 A |
887 | while (numBytes > 0) { |
888 | usedByteLen = TO_UNICODE(converter, flags, bytes, numBytes, NULL, 0, &usedCharLen); | |
889 | ||
890 | bytes += usedByteLen; | |
891 | numBytes -= usedByteLen; | |
892 | totalLength += usedCharLen; | |
893 | ||
894 | if (numBytes > 0) { | |
895 | if (0 == (flags & kCFStringEncodingAllowLossyConversion)) return 0; | |
9ce05555 | 896 | |
cf7d2af9 A |
897 | usedByteLen = TO_UNICODE_FALLBACK(converter, bytes, numBytes, NULL, 0, &usedCharLen); |
898 | ||
899 | bytes += usedByteLen; | |
900 | numBytes -= usedByteLen; | |
901 | totalLength += usedCharLen; | |
902 | } | |
903 | } | |
904 | ||
905 | return totalLength; | |
bd5b749c | 906 | } else { |
cf7d2af9 A |
907 | return converter->definition->toUnicodeLen(flags, bytes, numBytes); |
908 | } | |
9ce05555 A |
909 | } |
910 | ||
911 | return 0; | |
912 | } | |
913 | ||
a48904a4 | 914 | CF_PRIVATE CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) { |
9ce05555 A |
915 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); |
916 | ||
917 | if (converter) { | |
856091c5 | 918 | #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX |
cf7d2af9 | 919 | if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUByteLength((const char *)converter->toBytes, flags, characters, numChars); |
856091c5 | 920 | #endif |
cf7d2af9 A |
921 | |
922 | if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformByteLengthForCharacters(encoding, flags, characters, numChars); | |
923 | ||
924 | if (1 == converter->definition->maxBytesPerChar) return numChars; | |
925 | ||
926 | if (NULL == converter->definition->toBytesLen) { | |
8ca704e1 | 927 | CFIndex usedByteLen; |
9ce05555 | 928 | |
8ca704e1 | 929 | return ((kCFStringEncodingConversionSuccess == CFStringEncodingUnicodeToBytes(encoding, flags, characters, numChars, NULL, NULL, 0, &usedByteLen)) ? usedByteLen : 0); |
bd5b749c | 930 | } else { |
cf7d2af9 A |
931 | return converter->definition->toBytesLen(flags, characters, numChars); |
932 | } | |
9ce05555 A |
933 | } |
934 | ||
935 | return 0; | |
936 | } | |
937 | ||
a48904a4 | 938 | void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) { |
cf7d2af9 A |
939 | _CFEncodingConverter *converter = (_CFEncodingConverter *)__CFGetConverter(encoding); |
940 | ||
941 | if (NULL != converter) { | |
942 | const CFStringEncodingConverter *body = CFStringEncodingGetConverter(encoding); | |
9ce05555 | 943 | |
cf7d2af9 A |
944 | converter->toBytesFallback = ((NULL == toBytes) ? ((NULL == body) ? __CFDefaultToBytesFallbackProc : body->toBytesFallback) : toBytes); |
945 | converter->toUnicodeFallback = ((NULL == toUnicode) ? ((NULL == body) ? __CFDefaultToUnicodeFallbackProc : body->toUnicodeFallback) : toUnicode); | |
9ce05555 A |
946 | } |
947 | } | |
948 | ||
a48904a4 | 949 | CF_PRIVATE const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) { |
cf7d2af9 A |
950 | const _CFEncodingConverter *converter = __CFGetConverter(encoding); |
951 | ||
952 | return ((NULL == converter) ? NULL : converter->definition); | |
9ce05555 A |
953 | } |
954 | ||
cf7d2af9 | 955 | static const CFStringEncoding __CFBuiltinEncodings[] = { |
9ce05555 A |
956 | kCFStringEncodingMacRoman, |
957 | kCFStringEncodingWindowsLatin1, | |
958 | kCFStringEncodingISOLatin1, | |
959 | kCFStringEncodingNextStepLatin, | |
960 | kCFStringEncodingASCII, | |
961 | kCFStringEncodingUTF8, | |
d8925383 | 962 | /* These seven are available only in CFString-level */ |
9ce05555 | 963 | kCFStringEncodingNonLossyASCII, |
d8925383 A |
964 | |
965 | kCFStringEncodingUTF16, | |
966 | kCFStringEncodingUTF16BE, | |
967 | kCFStringEncodingUTF16LE, | |
968 | ||
969 | kCFStringEncodingUTF32, | |
970 | kCFStringEncodingUTF32BE, | |
971 | kCFStringEncodingUTF32LE, | |
972 | ||
9ce05555 A |
973 | kCFStringEncodingInvalidId, |
974 | }; | |
975 | ||
cf7d2af9 A |
976 | static CFComparisonResult __CFStringEncodingComparator(const void *v1, const void *v2, void *context) { |
977 | CFComparisonResult val1 = (*(const CFStringEncoding *)v1) & 0xFFFF; | |
978 | CFComparisonResult val2 = (*(const CFStringEncoding *)v2) & 0xFFFF; | |
9ce05555 | 979 | |
cf7d2af9 | 980 | return ((val1 == val2) ? ((CFComparisonResult)(*(const CFStringEncoding *)v1) - (CFComparisonResult)(*(const CFStringEncoding *)v2)) : val1 - val2); |
9ce05555 A |
981 | } |
982 | ||
cf7d2af9 A |
983 | static void __CFStringEncodingFliterDupes(CFStringEncoding *encodings, CFIndex numSlots) { |
984 | CFStringEncoding last = kCFStringEncodingInvalidId; | |
985 | const CFStringEncoding *limitEncodings = encodings + numSlots; | |
986 | ||
987 | while (encodings < limitEncodings) { | |
988 | if (last == *encodings) { | |
989 | if ((encodings + 1) < limitEncodings) memmove(encodings, encodings + 1, sizeof(CFStringEncoding) * (limitEncodings - encodings - 1)); | |
990 | --limitEncodings; | |
991 | } else { | |
992 | last = *(encodings++); | |
993 | } | |
994 | } | |
995 | } | |
996 | ||
a48904a4 | 997 | CF_PRIVATE const CFStringEncoding *CFStringEncodingListOfAvailableEncodings(void) { |
cf7d2af9 A |
998 | static const CFStringEncoding *encodings = NULL; |
999 | ||
1000 | if (NULL == encodings) { | |
1001 | CFStringEncoding *list = (CFStringEncoding *)__CFBuiltinEncodings; | |
1002 | CFIndex numICUConverters = 0, numPlatformConverters = 0; | |
856091c5 | 1003 | #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX |
cf7d2af9 | 1004 | CFStringEncoding *icuConverters = __CFStringEncodingCreateICUEncodings(NULL, &numICUConverters); |
856091c5 A |
1005 | #else |
1006 | CFStringEncoding *icuConverters = NULL; | |
1007 | #endif | |
cf7d2af9 A |
1008 | CFStringEncoding *platformConverters = __CFStringEncodingCreateListOfAvailablePlatformConverters(NULL, &numPlatformConverters); |
1009 | ||
1010 | if ((NULL != icuConverters) || (NULL != platformConverters)) { | |
1011 | CFIndex numSlots = (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters + numPlatformConverters; | |
1012 | ||
1013 | list = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * numSlots, 0); | |
1014 | ||
1015 | memcpy(list, __CFBuiltinEncodings, sizeof(__CFBuiltinEncodings)); | |
1016 | ||
1017 | if (NULL != icuConverters) { | |
1018 | memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)), icuConverters, sizeof(CFStringEncoding) * numICUConverters); | |
1019 | CFAllocatorDeallocate(NULL, icuConverters); | |
1020 | } | |
1021 | ||
1022 | if (NULL != platformConverters) { | |
1023 | memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters, platformConverters, sizeof(CFStringEncoding) * numPlatformConverters); | |
1024 | CFAllocatorDeallocate(NULL, platformConverters); | |
1025 | } | |
1026 | ||
1027 | CFQSortArray(list, numSlots, sizeof(CFStringEncoding), (CFComparatorFunction)__CFStringEncodingComparator, NULL); | |
1028 | __CFStringEncodingFliterDupes(list, numSlots); | |
1029 | } | |
1030 | if (!OSAtomicCompareAndSwapPtrBarrier(NULL, list, (void * volatile *)&encodings) && (list != __CFBuiltinEncodings)) CFAllocatorDeallocate(NULL, list); | |
1031 | } | |
1032 | ||
1033 | return encodings; | |
1034 | } | |
bd5b749c A |
1035 | |
1036 | #undef TO_BYTE | |
1037 | #undef TO_UNICODE | |
1038 | #undef ASCIINewLine | |
1039 | #undef kSurrogateHighStart | |
1040 | #undef kSurrogateHighEnd | |
1041 | #undef kSurrogateLowStart | |
1042 | #undef kSurrogateLowEnd | |
1043 | #undef TO_BYTE_FALLBACK | |
1044 | #undef TO_UNICODE_FALLBACK | |
1045 | #undef EXTRA_BASE | |
1046 | #undef NUM_OF_ENTRIES_CYCLE | |
1047 |