]>
Commit | Line | Data |
---|---|---|
bd5b749c | 1 | /* |
8ca704e1 | 2 | * Copyright (c) 2011 Apple Inc. All rights reserved. |
bd5b749c A |
3 | * |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
f64f9b69 | 23 | |
bd5b749c | 24 | /* CFUnicodeDecomposition.c |
8ca704e1 | 25 | Copyright (c) 1999-2011, Apple Inc. All rights reserved. |
bd5b749c A |
26 | Responsibility: Aki Inoue |
27 | */ | |
28 | ||
29 | #include <string.h> | |
30 | #include <CoreFoundation/CFBase.h> | |
31 | #include <CoreFoundation/CFCharacterSet.h> | |
32 | #include <CoreFoundation/CFUniChar.h> | |
33 | #include <CoreFoundation/CFUnicodeDecomposition.h> | |
34 | #include "CFInternal.h" | |
35 | #include "CFUniCharPriv.h" | |
36 | ||
37 | // Canonical Decomposition | |
38 | static UTF32Char *__CFUniCharDecompositionTable = NULL; | |
39 | static uint32_t __CFUniCharDecompositionTableLength = 0; | |
40 | static UTF32Char *__CFUniCharMultipleDecompositionTable = NULL; | |
41 | ||
42 | static const uint8_t *__CFUniCharDecomposableBitmapForBMP = NULL; | |
43 | static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP = NULL; | |
44 | ||
45 | static CFSpinLock_t __CFUniCharDecompositionTableLock = CFSpinLockInit; | |
46 | ||
47 | static const uint8_t **__CFUniCharCombiningPriorityTable = NULL; | |
48 | static uint8_t __CFUniCharCombiningPriorityTableNumPlane = 0; | |
49 | ||
50 | static void __CFUniCharLoadDecompositionTable(void) { | |
51 | ||
52 | __CFSpinLock(&__CFUniCharDecompositionTableLock); | |
53 | ||
54 | if (NULL == __CFUniCharDecompositionTable) { | |
55 | const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping); | |
56 | ||
57 | if (NULL == bytes) { | |
58 | __CFSpinUnlock(&__CFUniCharDecompositionTableLock); | |
59 | return; | |
60 | } | |
61 | ||
62 | __CFUniCharDecompositionTableLength = *(bytes++); | |
63 | __CFUniCharDecompositionTable = (UTF32Char *)bytes; | |
64 | __CFUniCharMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharDecompositionTableLength); | |
65 | ||
66 | __CFUniCharDecompositionTableLength /= (sizeof(uint32_t) * 2); | |
67 | __CFUniCharDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, 0); | |
68 | __CFUniCharHFSPlusDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet, 0); | |
69 | ||
70 | CFIndex idx; | |
71 | ||
72 | __CFUniCharCombiningPriorityTableNumPlane = CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty); | |
73 | __CFUniCharCombiningPriorityTable = (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(uint8_t *) * __CFUniCharCombiningPriorityTableNumPlane, 0); | |
74 | for (idx = 0;idx < __CFUniCharCombiningPriorityTableNumPlane;idx++) __CFUniCharCombiningPriorityTable[idx] = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, idx); | |
75 | } | |
76 | ||
77 | __CFSpinUnlock(&__CFUniCharDecompositionTableLock); | |
78 | } | |
79 | ||
80 | static CFSpinLock_t __CFUniCharCompatibilityDecompositionTableLock = CFSpinLockInit; | |
81 | static UTF32Char *__CFUniCharCompatibilityDecompositionTable = NULL; | |
82 | static uint32_t __CFUniCharCompatibilityDecompositionTableLength = 0; | |
83 | static UTF32Char *__CFUniCharCompatibilityMultipleDecompositionTable = NULL; | |
84 | ||
85 | static void __CFUniCharLoadCompatibilityDecompositionTable(void) { | |
86 | ||
87 | __CFSpinLock(&__CFUniCharCompatibilityDecompositionTableLock); | |
88 | ||
89 | if (NULL == __CFUniCharCompatibilityDecompositionTable) { | |
90 | const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping); | |
91 | ||
92 | if (NULL == bytes) { | |
93 | __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock); | |
94 | return; | |
95 | } | |
96 | ||
97 | __CFUniCharCompatibilityDecompositionTableLength = *(bytes++); | |
98 | __CFUniCharCompatibilityDecompositionTable = (UTF32Char *)bytes; | |
99 | __CFUniCharCompatibilityMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharCompatibilityDecompositionTableLength); | |
100 | ||
101 | __CFUniCharCompatibilityDecompositionTableLength /= (sizeof(uint32_t) * 2); | |
102 | } | |
103 | ||
104 | __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock); | |
105 | } | |
106 | ||
107 | CF_INLINE bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character, bool isHFSPlus) { | |
108 | return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? (isHFSPlus ? __CFUniCharHFSPlusDecomposableBitmapForBMP : __CFUniCharDecomposableBitmapForBMP) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, ((character >> 16) & 0xFF)))); | |
109 | } | |
110 | ||
111 | CF_INLINE uint8_t __CFUniCharGetCombiningPropertyForCharacter(UTF32Char character) { return CFUniCharGetCombiningPropertyForCharacter(character, (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane ? __CFUniCharCombiningPriorityTable[(character) >> 16] : NULL)); } | |
112 | ||
113 | CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) { return ((0 == __CFUniCharGetCombiningPropertyForCharacter(character)) ? false : true); } // the notion of non-base in normalization is characters with non-0 combining class | |
114 | ||
115 | typedef struct { | |
116 | uint32_t _key; | |
117 | uint32_t _value; | |
118 | } __CFUniCharDecomposeMappings; | |
119 | ||
120 | static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings *theTable, uint32_t numElem, UTF32Char character) { | |
121 | const __CFUniCharDecomposeMappings *p, *q, *divider; | |
122 | ||
123 | if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) { | |
124 | return 0; | |
125 | } | |
126 | p = theTable; | |
127 | q = p + (numElem-1); | |
128 | while (p <= q) { | |
129 | divider = p + ((q - p) >> 1); /* divide by 2 */ | |
130 | if (character < divider->_key) { q = divider - 1; } | |
131 | else if (character > divider->_key) { p = divider + 1; } | |
132 | else { return divider->_value; } | |
133 | } | |
134 | return 0; | |
135 | } | |
136 | ||
137 | static void __CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) { | |
138 | UTF32Char *end = characters + length; | |
139 | ||
140 | while ((characters < end) && (0 == __CFUniCharGetCombiningPropertyForCharacter(*characters))) ++characters; | |
141 | ||
142 | if ((end - characters) > 1) { | |
143 | uint32_t p1, p2; | |
144 | UTF32Char *ch1, *ch2; | |
145 | bool changes = true; | |
146 | ||
147 | do { | |
148 | changes = false; | |
149 | ch1 = characters; ch2 = characters + 1; | |
150 | p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch1); | |
151 | while (ch2 < end) { | |
152 | p1 = p2; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch2); | |
153 | if (p1 > p2) { | |
154 | UTF32Char tmp = *ch1; *ch1 = *ch2; *ch2 = tmp; | |
155 | changes = true; | |
156 | } | |
157 | ++ch1; ++ch2; | |
158 | } | |
159 | } while (changes); | |
160 | } | |
161 | } | |
162 | ||
163 | static CFIndex __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) { | |
164 | uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharDecompositionTable, __CFUniCharDecompositionTableLength, character); | |
165 | CFIndex length = CFUniCharConvertFlagToCount(value); | |
166 | UTF32Char firstChar = value & 0xFFFFFF; | |
167 | UTF32Char *mappings = (length > 1 ? __CFUniCharMultipleDecompositionTable + firstChar : &firstChar); | |
168 | CFIndex usedLength = 0; | |
169 | ||
170 | if (maxBufferLength < length) return 0; | |
171 | ||
172 | if (value & kCFUniCharRecursiveDecompositionFlag) { | |
173 | usedLength = __CFUniCharRecursivelyDecomposeCharacter(*mappings, convertedChars, maxBufferLength - length); | |
174 | ||
175 | --length; // Decrement for the first char | |
176 | if (!usedLength || usedLength + length > maxBufferLength) return 0; | |
177 | ++mappings; | |
178 | convertedChars += usedLength; | |
179 | } | |
180 | ||
181 | usedLength += length; | |
182 | ||
183 | while (length--) *(convertedChars++) = *(mappings++); | |
184 | ||
185 | return usedLength; | |
186 | } | |
187 | ||
188 | #define HANGUL_SBASE 0xAC00 | |
189 | #define HANGUL_LBASE 0x1100 | |
190 | #define HANGUL_VBASE 0x1161 | |
191 | #define HANGUL_TBASE 0x11A7 | |
192 | #define HANGUL_SCOUNT 11172 | |
193 | #define HANGUL_LCOUNT 19 | |
194 | #define HANGUL_VCOUNT 21 | |
195 | #define HANGUL_TCOUNT 28 | |
196 | #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) | |
197 | ||
198 | CFIndex CFUniCharDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) { | |
199 | if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable(); | |
200 | if (character >= HANGUL_SBASE && character <= (HANGUL_SBASE + HANGUL_SCOUNT)) { | |
201 | CFIndex length; | |
202 | ||
203 | character -= HANGUL_SBASE; | |
204 | ||
205 | length = (character % HANGUL_TCOUNT ? 3 : 2); | |
206 | ||
207 | if (maxBufferLength < length) return 0; | |
208 | ||
209 | *(convertedChars++) = character / HANGUL_NCOUNT + HANGUL_LBASE; | |
210 | *(convertedChars++) = (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE; | |
211 | if (length > 2) *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE; | |
212 | return length; | |
213 | } else { | |
214 | return __CFUniCharRecursivelyDecomposeCharacter(character, convertedChars, maxBufferLength); | |
215 | } | |
216 | } | |
217 | ||
218 | CF_INLINE bool __CFProcessReorderBuffer(UTF32Char *buffer, CFIndex length, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat) { | |
219 | if (length > 1) __CFUniCharPrioritySort(buffer, length); | |
220 | return CFUniCharFillDestinationBuffer(buffer, length, dst, dstLength, filledLength, dstFormat); | |
221 | } | |
222 | ||
223 | #define MAX_BUFFER_LENGTH (32) | |
224 | bool CFUniCharDecompose(const UTF16Char *src, CFIndex length, CFIndex *consumedLength, void *dst, CFIndex maxLength, CFIndex *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus) { | |
225 | CFIndex usedLength = 0; | |
226 | CFIndex originalLength = length; | |
227 | UTF32Char buffer[MAX_BUFFER_LENGTH]; | |
228 | UTF32Char *decompBuffer = buffer; | |
229 | CFIndex decompBufferSize = MAX_BUFFER_LENGTH; | |
230 | CFIndex decompBufferLen = 0; | |
231 | CFIndex segmentLength = 0; | |
232 | UTF32Char currentChar; | |
233 | ||
234 | if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable(); | |
235 | ||
236 | while ((length - segmentLength) > 0) { | |
237 | currentChar = *(src++); | |
238 | ||
239 | if (currentChar < 0x80) { | |
240 | if (decompBufferLen > 0) { | |
241 | if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; | |
242 | length -= segmentLength; | |
243 | segmentLength = 0; | |
244 | decompBufferLen = 0; | |
245 | } | |
246 | ||
247 | if (maxLength > 0) { | |
248 | if (usedLength >= maxLength) break; | |
249 | switch (dstFormat) { | |
250 | case kCFUniCharUTF8Format: *(uint8_t *)dst = currentChar; dst = (uint8_t *)dst + sizeof(uint8_t); break; | |
251 | case kCFUniCharUTF16Format: *(UTF16Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF16Char); break; | |
252 | case kCFUniCharUTF32Format: *(UTF32Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF32Char); break; | |
253 | } | |
254 | } | |
255 | ||
256 | --length; | |
257 | ++usedLength; | |
258 | } else { | |
259 | if (CFUniCharIsSurrogateLowCharacter(currentChar)) { // Stray surrogagte | |
260 | if (dstFormat != kCFUniCharUTF16Format) break; | |
261 | } else if (CFUniCharIsSurrogateHighCharacter(currentChar)) { | |
262 | if (((length - segmentLength) > 1) && CFUniCharIsSurrogateLowCharacter(*src)) { | |
263 | currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(src++)); | |
264 | } else { | |
265 | if (dstFormat != kCFUniCharUTF16Format) break; | |
266 | } | |
267 | } | |
268 | ||
269 | if (needToReorder && __CFUniCharIsNonBaseCharacter(currentChar)) { | |
270 | if ((decompBufferLen + 1) >= decompBufferSize) { | |
271 | UTF32Char *newBuffer; | |
272 | ||
273 | decompBufferSize += MAX_BUFFER_LENGTH; | |
274 | newBuffer = (UTF32Char *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF32Char) * decompBufferSize, 0); | |
275 | memmove(newBuffer, decompBuffer, (decompBufferSize - MAX_BUFFER_LENGTH) * sizeof(UTF32Char)); | |
276 | if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer); | |
277 | decompBuffer = newBuffer; | |
278 | } | |
279 | ||
280 | if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { // Vietnamese accent, etc. | |
281 | decompBufferLen += CFUniCharDecomposeCharacter(currentChar, decompBuffer + decompBufferLen, decompBufferSize - decompBufferLen); | |
282 | } else { | |
283 | decompBuffer[decompBufferLen++] = currentChar; | |
284 | } | |
285 | } else { | |
286 | if (decompBufferLen > 0) { | |
287 | if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; | |
288 | length -= segmentLength; | |
289 | segmentLength = 0; | |
290 | } | |
291 | ||
292 | if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { | |
293 | decompBufferLen = CFUniCharDecomposeCharacter(currentChar, decompBuffer, MAX_BUFFER_LENGTH); | |
294 | } else { | |
295 | decompBufferLen = 1; | |
296 | *decompBuffer = currentChar; | |
297 | } | |
298 | ||
299 | if (!needToReorder || (decompBufferLen == 1)) { | |
300 | if (!CFUniCharFillDestinationBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; | |
301 | length -= ((currentChar > 0xFFFF) ? 2 : 1); | |
302 | decompBufferLen = 0; | |
303 | continue; | |
304 | } | |
305 | } | |
306 | ||
307 | segmentLength += ((currentChar > 0xFFFF) ? 2 : 1); | |
308 | } | |
309 | } | |
310 | ||
311 | if ((decompBufferLen > 0) && __CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) length -= segmentLength; | |
312 | ||
313 | if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer); | |
314 | ||
315 | if (consumedLength) *consumedLength = originalLength - length; | |
316 | if (filledLength) *filledLength = usedLength; | |
317 | ||
318 | return ((length > 0) ? false : true); | |
319 | } | |
320 | ||
321 | #define MAX_COMP_DECOMP_LEN (32) | |
322 | ||
323 | static CFIndex __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars) { | |
324 | uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharCompatibilityDecompositionTable, __CFUniCharCompatibilityDecompositionTableLength, character); | |
325 | CFIndex length = CFUniCharConvertFlagToCount(value); | |
326 | UTF32Char firstChar = value & 0xFFFFFF; | |
327 | const UTF32Char *mappings = (length > 1 ? __CFUniCharCompatibilityMultipleDecompositionTable + firstChar : &firstChar); | |
328 | CFIndex usedLength = length; | |
329 | UTF32Char currentChar; | |
330 | CFIndex currentLength; | |
331 | ||
332 | while (length-- > 0) { | |
333 | currentChar = *(mappings++); | |
334 | if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, false)) { | |
335 | currentLength = __CFUniCharRecursivelyDecomposeCharacter(currentChar, convertedChars, MAX_COMP_DECOMP_LEN - length); | |
336 | convertedChars += currentLength; | |
337 | usedLength += (currentLength - 1); | |
338 | } else if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) { | |
339 | currentLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, convertedChars); | |
340 | convertedChars += currentLength; | |
341 | usedLength += (currentLength - 1); | |
342 | } else { | |
343 | *(convertedChars++) = currentChar; | |
344 | } | |
345 | } | |
346 | ||
347 | return usedLength; | |
348 | } | |
349 | ||
350 | CF_INLINE void __CFUniCharMoveBufferFromEnd1(UTF32Char *convertedChars, CFIndex length, CFIndex delta) { | |
351 | const UTF32Char *limit = convertedChars; | |
352 | UTF32Char *dstP; | |
353 | ||
354 | convertedChars += length; | |
355 | dstP = convertedChars + delta; | |
356 | ||
357 | while (convertedChars > limit) *(--dstP) = *(--convertedChars); | |
358 | } | |
359 | ||
360 | __private_extern__ CFIndex CFUniCharCompatibilityDecompose(UTF32Char *convertedChars, CFIndex length, CFIndex maxBufferLength) { | |
361 | UTF32Char currentChar; | |
362 | UTF32Char buffer[MAX_COMP_DECOMP_LEN]; | |
363 | const UTF32Char *bufferP; | |
364 | const UTF32Char *limit = convertedChars + length; | |
365 | CFIndex filledLength; | |
366 | ||
367 | if (NULL == __CFUniCharCompatibilityDecompositionTable) __CFUniCharLoadCompatibilityDecompositionTable(); | |
368 | ||
369 | while (convertedChars < limit) { | |
370 | currentChar = *convertedChars; | |
371 | ||
372 | if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) { | |
373 | filledLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, buffer); | |
374 | ||
375 | if (filledLength + length - 1 > maxBufferLength) return 0; | |
376 | ||
377 | if (filledLength > 1) __CFUniCharMoveBufferFromEnd1(convertedChars + 1, limit - convertedChars - 1, filledLength - 1); | |
378 | ||
379 | bufferP = buffer; | |
380 | length += (filledLength - 1); | |
381 | while (filledLength-- > 0) *(convertedChars++) = *(bufferP++); | |
382 | } else { | |
383 | ++convertedChars; | |
384 | } | |
385 | } | |
386 | ||
387 | return length; | |
388 | } | |
389 | ||
390 | CF_EXPORT void CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) { | |
391 | __CFUniCharPrioritySort(characters, length); | |
392 | } | |
393 | ||
394 | #undef MAX_BUFFER_LENGTH | |
395 | #undef MAX_COMP_DECOMP_LEN | |
396 | #undef HANGUL_SBASE | |
397 | #undef HANGUL_LBASE | |
398 | #undef HANGUL_VBASE | |
399 | #undef HANGUL_TBASE | |
400 | #undef HANGUL_SCOUNT | |
401 | #undef HANGUL_LCOUNT | |
402 | #undef HANGUL_VCOUNT | |
403 | #undef HANGUL_TCOUNT | |
404 | #undef HANGUL_NCOUNT | |
405 |