]> git.saurik.com Git - apple/cf.git/blob - CFStringUtilities.c
46e91a8f31671c1adf61830de84d0dc4ae69dfec
[apple/cf.git] / CFStringUtilities.c
1 /*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /* CFStringUtilities.c
25 Copyright (c) 1999-2014, Apple Inc. All rights reserved.
26 Responsibility: Aki Inoue
27 */
28
29 #include "CFInternal.h"
30 #include <CoreFoundation/CFStringEncodingConverterExt.h>
31 #include <CoreFoundation/CFUniChar.h>
32 #include <CoreFoundation/CFStringEncodingExt.h>
33 #include "CFStringEncodingDatabase.h"
34 #include "CFICUConverters.h"
35 #include <limits.h>
36 #include <stdlib.h>
37 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
38 #include <unicode/ucol.h>
39 #include <unicode/ucoleitr.h>
40 #endif
41 #include <string.h>
42
43 #if DEPLOYMENT_TARGET_WINDOWS
44 #include <tchar.h>
45 #endif
46
47
48 Boolean CFStringIsEncodingAvailable(CFStringEncoding theEncoding) {
49 switch (theEncoding) {
50 case kCFStringEncodingASCII: // Built-in encodings
51 case kCFStringEncodingMacRoman:
52 case kCFStringEncodingUTF8:
53 case kCFStringEncodingNonLossyASCII:
54 case kCFStringEncodingWindowsLatin1:
55 case kCFStringEncodingNextStepLatin:
56 case kCFStringEncodingUTF16:
57 case kCFStringEncodingUTF16BE:
58 case kCFStringEncodingUTF16LE:
59 case kCFStringEncodingUTF32:
60 case kCFStringEncodingUTF32BE:
61 case kCFStringEncodingUTF32LE:
62 return true;
63
64 default:
65 return CFStringEncodingIsValidEncoding(theEncoding);
66 }
67 }
68
69 const CFStringEncoding* CFStringGetListOfAvailableEncodings() {
70 return (const CFStringEncoding *)CFStringEncodingListOfAvailableEncodings();
71 }
72
73 CFStringRef CFStringGetNameOfEncoding(CFStringEncoding theEncoding) {
74 static CFMutableDictionaryRef mappingTable = NULL;
75 static OSSpinLock mappingTableLock = OS_SPINLOCK_INIT;
76
77 CFStringRef theName = NULL;
78
79 if (mappingTable) {
80 OSSpinLockLock(&mappingTableLock);
81 theName = (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)(uintptr_t)theEncoding);
82 OSSpinLockUnlock(&mappingTableLock);
83 }
84
85 if (!theName) {
86 const char *encodingName = __CFStringEncodingGetName(theEncoding);
87
88 if (encodingName) {
89 theName = CFStringCreateWithCString(kCFAllocatorSystemDefault, encodingName, kCFStringEncodingASCII);
90 }
91
92 if (theName) {
93 OSSpinLockLock(&mappingTableLock);
94
95 CFStringRef result = NULL;
96 if (!mappingTable) {
97 mappingTable = CFDictionaryCreateMutable(kCFAllocatorSystemDefault, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks);
98 } else { // Check to see if this got in the dictionary in the meantime
99 result = (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)(uintptr_t)theEncoding);
100 }
101 if (!result) { // If not, add it in
102 CFDictionaryAddValue(mappingTable, (const void*)(uintptr_t)theEncoding, (const void*)theName);
103 OSSpinLockUnlock(&mappingTableLock);
104 CFRelease(theName);
105 } else { // Otherwise use the one already in there
106 OSSpinLockUnlock(&mappingTableLock);
107 CFRelease(theName);
108 theName = result;
109 }
110 }
111 }
112
113 return theName;
114 }
115
116 CFStringEncoding CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName) {
117 CFStringEncoding encoding = kCFStringEncodingInvalidId;
118 #define BUFFER_SIZE (100)
119 char buffer[BUFFER_SIZE];
120 const char *name = CFStringGetCStringPtr(charsetName, __CFStringGetEightBitStringEncoding());
121
122 if (NULL == name) {
123 if (false == CFStringGetCString(charsetName, buffer, BUFFER_SIZE, __CFStringGetEightBitStringEncoding())) return kCFStringEncodingInvalidId;
124
125 name = buffer;
126 }
127
128 encoding = __CFStringEncodingGetFromCanonicalName(name);
129
130 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
131 if (kCFStringEncodingInvalidId == encoding) encoding = __CFStringEncodingGetFromICUName(name);
132 #endif
133
134
135 // handling Java name variant for MS codepages
136 if ((kCFStringEncodingInvalidId == encoding) && !strncasecmp(name, "ms950", strlen("ms950"))) { // <rdar://problem/12903398> “MS950” is not recognized
137 encoding = __CFStringEncodingGetFromCanonicalName("cp950");
138 }
139
140 return encoding;
141 }
142
143 CFStringRef CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding) {
144 CFStringRef name = NULL;
145 CFIndex value = encoding;
146 static CFMutableDictionaryRef mappingTable = NULL;
147 static CFLock_t lock = CFLockInit;
148
149 __CFLock(&lock);
150 name = ((NULL == mappingTable) ? NULL : (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)value));
151
152 if (NULL == name) {
153 #define STACK_BUFFER_SIZE (100)
154 char buffer[STACK_BUFFER_SIZE];
155
156 if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) name = CFStringCreateWithCString(NULL, buffer, kCFStringEncodingASCII);
157
158
159 if (NULL != name) {
160 CFIndex value = encoding;
161
162 if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, &kCFTypeDictionaryValueCallBacks);
163
164 CFDictionaryAddValue(mappingTable, (const void*)value, (const void*)name);
165 CFRelease(name);
166 }
167 }
168 __CFUnlock(&lock);
169
170 return name;
171 }
172
/* Local copies of the NSStringEncoding values used by the two converters below. */
enum {
    NSASCIIStringEncoding         = 1,      /* 0..127 only */
    NSNEXTSTEPStringEncoding      = 2,
    NSJapaneseEUCStringEncoding   = 3,
    NSUTF8StringEncoding          = 4,
    NSISOLatin1StringEncoding     = 5,
    NSSymbolStringEncoding        = 6,
    NSNonLossyASCIIStringEncoding = 7,
    NSShiftJISStringEncoding      = 8,
    NSISOLatin2StringEncoding     = 9,
    NSUnicodeStringEncoding       = 10,
    NSWindowsCP1251StringEncoding = 11,     /* Cyrillic; same as AdobeStandardCyrillic */
    NSWindowsCP1252StringEncoding = 12,     /* WinLatin1 */
    NSWindowsCP1253StringEncoding = 13,     /* Greek */
    NSWindowsCP1254StringEncoding = 14,     /* Turkish */
    NSWindowsCP1250StringEncoding = 15,     /* WinLatin2 */
    NSISO2022JPStringEncoding     = 21,     /* ISO 2022 Japanese encoding for e-mail */
    NSMacOSRomanStringEncoding    = 30,

    NSProprietaryStringEncoding   = 65536   /* Installation-specific encoding */
};

/*
 * Bit OR'ed into a CFStringEncoding that has no NSStringEncoding equivalent,
 * and stripped again on the way back.
 * NOTE(review): 1 << 31 shifts into the sign bit of int; kept as-is because
 * both converters depend on how this exact expression promotes.
 */
#define NSENCODING_MASK (1 << 31)
196
197 unsigned long CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding) {
198 switch (theEncoding & 0xFFF) {
199 case kCFStringEncodingUnicode:
200 if (theEncoding == kCFStringEncodingUTF16) return NSUnicodeStringEncoding;
201 else if (theEncoding == kCFStringEncodingUTF8) return NSUTF8StringEncoding;
202 break;
203
204 case kCFStringEncodingWindowsLatin1: return NSWindowsCP1252StringEncoding;
205 case kCFStringEncodingMacRoman: return NSMacOSRomanStringEncoding;
206
207 case kCFStringEncodingASCII: return NSASCIIStringEncoding;
208
209 case kCFStringEncodingDOSJapanese: return NSShiftJISStringEncoding;
210 case kCFStringEncodingWindowsCyrillic: return NSWindowsCP1251StringEncoding;
211 case kCFStringEncodingWindowsGreek: return NSWindowsCP1253StringEncoding;
212 case kCFStringEncodingWindowsLatin5: return NSWindowsCP1254StringEncoding;
213 case kCFStringEncodingWindowsLatin2: return NSWindowsCP1250StringEncoding;
214 case kCFStringEncodingISOLatin1: return NSISOLatin1StringEncoding;
215
216 case kCFStringEncodingNonLossyASCII: return NSNonLossyASCIIStringEncoding;
217 case kCFStringEncodingEUC_JP: return NSJapaneseEUCStringEncoding;
218 case kCFStringEncodingMacSymbol: return NSSymbolStringEncoding;
219 case kCFStringEncodingISOLatin2: return NSISOLatin2StringEncoding;
220 case kCFStringEncodingISO_2022_JP: return NSISO2022JPStringEncoding;
221 case kCFStringEncodingNextStepLatin: return NSNEXTSTEPStringEncoding;
222 }
223
224 return NSENCODING_MASK | theEncoding;
225 }
226
227 CFStringEncoding CFStringConvertNSStringEncodingToEncoding(unsigned long theEncoding) {
228 const uint16_t encodings[] = {
229 kCFStringEncodingASCII,
230 kCFStringEncodingNextStepLatin,
231 kCFStringEncodingEUC_JP,
232 0,
233 kCFStringEncodingISOLatin1,
234 kCFStringEncodingMacSymbol,
235 kCFStringEncodingNonLossyASCII,
236 kCFStringEncodingDOSJapanese,
237 kCFStringEncodingISOLatin2,
238 kCFStringEncodingUTF16,
239 kCFStringEncodingWindowsCyrillic,
240 kCFStringEncodingWindowsLatin1,
241 kCFStringEncodingWindowsGreek,
242 kCFStringEncodingWindowsLatin5,
243 kCFStringEncodingWindowsLatin2
244 };
245
246 if (NSUTF8StringEncoding == theEncoding) return kCFStringEncodingUTF8;
247
248 if ((theEncoding > 0) && (theEncoding <= NSWindowsCP1250StringEncoding)) return encodings[theEncoding - 1];
249
250 switch (theEncoding) {
251 case NSMacOSRomanStringEncoding: return kCFStringEncodingMacRoman;
252 case NSISO2022JPStringEncoding: return kCFStringEncodingISO_2022_JP;
253
254 default:
255 return ((theEncoding & NSENCODING_MASK) ? theEncoding & ~NSENCODING_MASK : kCFStringEncodingInvalidId);
256 }
257 }
258
259 UInt32 CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding) {
260 uint16_t codepage = __CFStringEncodingGetWindowsCodePage(theEncoding);
261
262 return ((0 == codepage) ? kCFStringEncodingInvalidId : codepage);
263 }
264
265 CFStringEncoding CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding) {
266 return __CFStringEncodingGetFromWindowsCodePage(theEncoding);
267 }
268
269 CFStringEncoding CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding) {
270 CFStringEncoding macEncoding = __CFStringEncodingGetMostCompatibleMacScript(encoding);
271
272
273 return macEncoding;
274 }
275
276 #define kCFStringCompareAllocationIncrement (128)
277
278 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
279
280 // -------------------------------------------------------------------------------------------------
281 // CompareSpecials - ignore case & diacritic differences
282 //
283 // Decomposed have 2nd-4th chars of type Mn or Mc, or in range 1160-11FF (jamo)
284 // Fullwidth & halfwidth are in range FF00-FFEF
285 // Parenthesized & circled are in range 3200-32FF
286 // -------------------------------------------------------------------------------------------------
287
/* Constants used by __CompareSpecials to pick apart 32-bit collation element orders. */
enum {
    /* Tertiary weights in [min, max] are treated as upper case and folded down by the delta. */
    kUpperCaseWeightMin     = 0x80 | 0x0F,
    kUpperCaseWeightMax     = 0x80 | 0x17,
    kUpperToLowerDelta      = 0x80 | 0x0A,  // 0x0A = 0x0F - 0x05

    /* Field masks over a collation element order. */
    kMaskPrimarySecondary   = 0xFFFFFF00,
    kMaskPrimaryOnly        = 0xFFFF0000,
    kMaskSecondaryOnly      = 0x0000FF00,
    kMaskCaseTertiary       = 0x000000FF    // 2 hi bits case, 6 lo bits tertiary
};
297
298 static SInt32 __CompareSpecials(const UCollator *collator, CFOptionFlags options, const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length) {
299 UErrorCode icuStatus = U_ZERO_ERROR;
300 SInt32 orderWidth = 0;
301 SInt32 orderCompos = 0;
302
303 UCollationElements * collElems1 = ucol_openElements(collator, (const UChar *)text1Ptr, text1Length, &icuStatus);
304 UCollationElements * collElems2 = ucol_openElements(collator, (const UChar *)text2Ptr, text2Length, &icuStatus);
305 if (U_SUCCESS(icuStatus)) {
306 int32_t startOffset1 = 0;
307 int32_t startOffset2 = 0;
308
309 while (true) {
310 int32_t elemOrder1, elemOrder2;
311 int32_t offset1, offset2;
312
313 elemOrder1 = ucol_next(collElems1, &icuStatus);
314 elemOrder2 = ucol_next(collElems2, &icuStatus);
315 if ( U_FAILURE(icuStatus) || elemOrder1 == (int32_t)UCOL_NULLORDER || elemOrder2 == (int32_t)UCOL_NULLORDER ) {
316 break;
317 }
318
319 offset1 = ucol_getOffset(collElems1);
320 offset2 = ucol_getOffset(collElems2);
321 if ( (elemOrder1 & kMaskPrimarySecondary) == (elemOrder2 & kMaskPrimarySecondary) ) {
322 if ( (elemOrder1 & kMaskPrimaryOnly) != 0 ) {
323 // keys may differ in case, width, circling, etc.
324
325 int32_t tertiary1 = (elemOrder1 & kMaskCaseTertiary);
326 int32_t tertiary2 = (elemOrder2 & kMaskCaseTertiary);
327 // fold upper to lower case
328 if (tertiary1 >= kUpperCaseWeightMin && tertiary1 <= kUpperCaseWeightMax) {
329 tertiary1 -= kUpperToLowerDelta;
330 }
331 if (tertiary2 >= kUpperCaseWeightMin && tertiary2 <= kUpperCaseWeightMax) {
332 tertiary2 -= kUpperToLowerDelta;
333 }
334 // now compare
335 if (tertiary1 != tertiary2) {
336 orderWidth = (tertiary1 < tertiary2)? -1: 1;
337 break;
338 }
339
340 } else if ( (elemOrder1 & kMaskSecondaryOnly) != 0 ) {
341 // primary weights are both zero, but secondaries are not.
342 if ( orderCompos == 0 && (options & kCFCompareNonliteral) == 0 ) {
343 // We have a code element which is a diacritic.
344 // It may have come from a composed char or a combining char.
345 // If it came from a combining char (longer element length) it sorts first.
346 // This is only an approximation to what the Mac OS 9 code did, but this is an
347 // unusual case anyway.
348 int32_t elem1Length = offset1 - startOffset1;
349 int32_t elem2Length = offset2 - startOffset2;
350 if (elem1Length != elem2Length) {
351 orderCompos = (elem1Length > elem2Length)? -1: 1;
352 }
353 }
354 }
355 }
356
357 startOffset1 = offset1;
358 startOffset2 = offset2;
359 }
360 ucol_closeElements(collElems1);
361 ucol_closeElements(collElems2);
362 }
363
364 return (orderWidth != 0)? orderWidth: orderCompos;
365 }
366
367 static SInt32 __CompareCodePoints(const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length ) {
368 const UniChar * text1P = text1Ptr;
369 const UniChar * text2P = text2Ptr;
370 UInt32 textLimit = (text1Length <= text2Length)? text1Length: text2Length;
371 UInt32 textCounter;
372 SInt32 orderResult = 0;
373
374 // Loop through either string...the first difference differentiates this.
375 for (textCounter = 0; textCounter < textLimit && *text1P == *text2P; textCounter++) {
376 text1P++;
377 text2P++;
378 }
379 if (textCounter < textLimit) {
380 // code point difference
381 orderResult = (*text1P < *text2P) ? -1 : 1;
382 } else if (text1Length != text2Length) {
383 // one string has extra stuff at end
384 orderResult = (text1Length < text2Length) ? -1 : 1;
385 }
386 return orderResult;
387 }
388
389
390 extern const CFStringRef __kCFLocaleCollatorID;
391
392 static UCollator *__CFStringCreateCollator(CFLocaleRef compareLocale) {
393 CFStringRef canonLocaleCFStr = (CFStringRef)CFLocaleGetValue(compareLocale, __kCFLocaleCollatorID);
394 char icuLocaleStr[128] = {0};
395 CFStringGetCString(canonLocaleCFStr, icuLocaleStr, sizeof(icuLocaleStr), kCFStringEncodingASCII);
396 UErrorCode icuStatus = U_ZERO_ERROR;
397 UCollator * collator = ucol_open(icuLocaleStr, &icuStatus);
398 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
399 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, UCOL_NON_IGNORABLE, &icuStatus);
400 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
401 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
402 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus);
403 return collator;
404 }
405
406 #define kCFMaxCachedDefaultCollators (8)
407 static UCollator *__CFDefaultCollators[kCFMaxCachedDefaultCollators];
408 static CFIndex __CFDefaultCollatorsCount = 0;
409 static const void *__CFDefaultCollatorLocale = NULL;
410 static CFLock_t __CFDefaultCollatorLock = CFLockInit;
411
412 static UCollator *__CFStringCopyDefaultCollator(CFLocaleRef compareLocale) {
413 CFLocaleRef currentLocale = NULL;
414 UCollator * collator = NULL;
415
416 if (compareLocale != __CFDefaultCollatorLocale) {
417 currentLocale = CFLocaleCopyCurrent();
418 if (compareLocale != currentLocale) {
419 CFRelease(currentLocale);
420 return NULL;
421 }
422 }
423
424 __CFLock(&__CFDefaultCollatorLock);
425 if ((NULL != currentLocale) && (__CFDefaultCollatorLocale != currentLocale)) {
426 while (__CFDefaultCollatorsCount > 0) ucol_close(__CFDefaultCollators[--__CFDefaultCollatorsCount]);
427 __CFDefaultCollatorLocale = CFRetain(currentLocale);
428 }
429
430 if (__CFDefaultCollatorsCount > 0) collator = __CFDefaultCollators[--__CFDefaultCollatorsCount];
431 __CFUnlock(&__CFDefaultCollatorLock);
432
433 if (NULL == collator) {
434 collator = __CFStringCreateCollator(compareLocale);
435 }
436
437 if (NULL != currentLocale) CFRelease(currentLocale);
438
439 return collator;
440 }
441
#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
/*
 * Retires the collator cached in thread-specific data.
 *
 * Clears both thread-specific slots, then either returns the collator to the
 * default pool (when the thread's locale is the pooled locale and the pool
 * has room) or closes it. The locale reference read from thread-specific data
 * is released at the end.
 */
static void __collatorFinalize(UCollator *collator) {
    CFLocaleRef locale = _CFGetTSD(__CFTSDKeyCollatorLocale);
    Boolean returnedToPool = false;

    _CFSetTSD(__CFTSDKeyCollatorUCollator, NULL, NULL);
    _CFSetTSD(__CFTSDKeyCollatorLocale, NULL, NULL);

    __CFLock(&__CFDefaultCollatorLock);
    if ((__CFDefaultCollatorLocale == locale) && (__CFDefaultCollatorsCount < kCFMaxCachedDefaultCollators)) {
        __CFDefaultCollators[__CFDefaultCollatorsCount] = collator;
        __CFDefaultCollatorsCount++;
        returnedToPool = true;
    }
    __CFUnlock(&__CFDefaultCollatorLock);

    if (!returnedToPool && (collator != NULL)) {
        ucol_close(collator);
    }
    if (locale) {
        CFRelease(locale);
    }
}
#endif
457
458 // -------------------------------------------------------------------------------------------------
459 // __CompareTextDefault
460 //
461 // A primary difference is denoted by values 2/-2 in orderP. Other differences are indicated with a -1/1.
462 // A negative value indicates that text1 sorts before text2.
463 // -------------------------------------------------------------------------------------------------
464 static OSStatus __CompareTextDefault(UCollator *collator, CFOptionFlags options, const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length, Boolean *equivalentP, SInt32 *orderP) {
465
466 // collator must have default settings restored on exit from this function
467
468 *equivalentP = true;
469 *orderP = 0;
470
471 if (options & kCFCompareNumerically) {
472 UErrorCode icuStatus = U_ZERO_ERROR;
473 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_ON, &icuStatus);
474 }
475
476 // Most string differences are Primary. Do a primary check first, then if there
477 // are no differences do a comparison with the options in the collator.
478 UCollationResult icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length);
479 if (icuResult != UCOL_EQUAL) {
480 *orderP = (icuResult == UCOL_LESS) ? -2 : 2;
481 }
482 if (*orderP == 0) {
483 UErrorCode icuStatus = U_ZERO_ERROR;
484 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &icuStatus);
485 ucol_setAttribute(collator, UCOL_STRENGTH, (options & kCFCompareDiacriticInsensitive) ? UCOL_PRIMARY : UCOL_SECONDARY, &icuStatus);
486 ucol_setAttribute(collator, UCOL_CASE_LEVEL, (options & kCFCompareCaseInsensitive) ? UCOL_OFF : UCOL_ON, &icuStatus);
487 if (!U_SUCCESS(icuStatus)) {
488 icuStatus = U_ZERO_ERROR;
489 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
490 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
491 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
492 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus);
493 return 666;
494 }
495
496 // We don't have a primary difference. Recompare with standard collator.
497 icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length);
498 if (icuResult != UCOL_EQUAL) {
499 *orderP = (icuResult == UCOL_LESS) ? -1 : 1;
500 }
501 icuStatus = U_ZERO_ERROR;
502 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
503 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
504 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
505 }
506 if (*orderP == 0 && (options & kCFCompareNonliteral) == 0) {
507 *orderP = __CompareSpecials(collator, options, text1Ptr, text1Length, text2Ptr, text2Length);
508 }
509
510 *equivalentP = (*orderP == 0);
511
512 // If strings are equivalent but we care about order and have not yet checked
513 // to the level of code point order, then do some more checks for order
514 if (*orderP == 0) {
515 UErrorCode icuStatus = U_ZERO_ERROR;
516 // First try to see if ICU can find any differences above code point level
517 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &icuStatus);
518 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_TERTIARY, &icuStatus);
519 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_ON, &icuStatus);
520 if (!U_SUCCESS(icuStatus)) {
521 icuStatus = U_ZERO_ERROR;
522 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
523 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
524 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
525 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus);
526 return 666;
527 }
528 icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length);
529 if (icuResult != UCOL_EQUAL) {
530 *orderP = (icuResult == UCOL_LESS) ? -1 : 1;
531 } else {
532 // no ICU differences above code point level, compare code points
533 *orderP = __CompareCodePoints( text1Ptr, text1Length, text2Ptr, text2Length );
534 }
535 icuStatus = U_ZERO_ERROR;
536 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
537 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
538 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
539 }
540
541 if (options & kCFCompareNumerically) {
542 UErrorCode icuStatus = U_ZERO_ERROR;
543 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus);
544 }
545 return 0; // noErr
546 }
547
548 #endif // DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
549
550 static inline CFIndex __extendLocationBackward(CFIndex location, CFStringInlineBuffer *str, const uint8_t *nonBaseBMP, const uint8_t *punctBMP) {
551 while (location > 0) {
552 UTF32Char ch = CFStringGetCharacterFromInlineBuffer(str, location);
553 UTF32Char otherChar;
554 if (CFUniCharIsSurrogateLowCharacter(ch) && CFUniCharIsSurrogateHighCharacter((otherChar = CFStringGetCharacterFromInlineBuffer(str, location - 1)))) {
555 ch = CFUniCharGetLongCharacterForSurrogatePair(ch, otherChar);
556 uint8_t planeNo = (ch >> 16);
557 if ((planeNo > 1) || (!CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, planeNo)))) break;
558 location -= 2;
559 } else {
560 if ((!CFUniCharIsMemberOfBitmap(ch, nonBaseBMP) && !CFUniCharIsMemberOfBitmap(ch, punctBMP)) || ((ch >= 0x2E80) && (ch < 0xAC00))) break;
561 --location;
562 }
563 }
564
565 return location;
566 }
567
568 static inline CFIndex __extendLocationForward(CFIndex location, CFStringInlineBuffer *str, const uint8_t *alnumBMP, const uint8_t *punctBMP, const uint8_t *controlBMP, CFIndex strMax) {
569 do {
570 UTF32Char ch = CFStringGetCharacterFromInlineBuffer(str, location);
571 UTF32Char otherChar;
572 if (CFUniCharIsSurrogateHighCharacter(ch) && CFUniCharIsSurrogateLowCharacter((otherChar = CFStringGetCharacterFromInlineBuffer(str, location + 1)))) {
573 ch = CFUniCharGetLongCharacterForSurrogatePair(ch, otherChar);
574 location += 2;
575 uint8_t planeNo = (ch >> 16);
576 if (!CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharAlphaNumericCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharControlAndFormatterCharacterSet, planeNo))) break;
577 } else {
578 ++location;
579 if ((!CFUniCharIsMemberOfBitmap(ch, alnumBMP) && !CFUniCharIsMemberOfBitmap(ch, punctBMP) && !CFUniCharIsMemberOfBitmap(ch, controlBMP)) || ((ch >= 0x2E80) && (ch < 0xAC00))) break;
580 }
581 } while (location < strMax);
582 return location;
583 }
584
585 CF_PRIVATE CFComparisonResult _CFCompareStringsWithLocale(CFStringInlineBuffer *str1, CFRange str1Range, CFStringInlineBuffer *str2, CFRange str2Range, CFOptionFlags options, const void *compareLocale) {
586 const UniChar *characters1;
587 const UniChar *characters2;
588 CFComparisonResult compResult = kCFCompareEqualTo;
589 CFRange range1 = str1Range;
590 CFRange range2 = str2Range;
591 SInt32 order;
592 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
593 Boolean isEqual;
594 bool forcedOrdering = ((options & kCFCompareForcedOrdering) ? true : false);
595
596 UCollator *collator = NULL;
597 bool defaultCollator = true;
598 #endif
599 static const uint8_t *alnumBMP = NULL;
600 static const uint8_t *nonBaseBMP = NULL;
601 static const uint8_t *punctBMP = NULL;
602 static const uint8_t *controlBMP = NULL;
603
604 if (NULL == alnumBMP) {
605 alnumBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharAlphaNumericCharacterSet, 0);
606 nonBaseBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
607 punctBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, 0);
608 controlBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharControlAndFormatterCharacterSet, 0);
609 }
610
611 // Determine the range of characters surrodiing the current index significant for localized comparison. The range is extended backward and forward as long as they are contextual. Contextual characters include all letters and punctuations. Since most control/format characters are ignorable in localized comparison, we also include them extending forward.
612
613 range1.location = str1Range.location;
614 range2.location = str2Range.location;
615
616 // go backward
617 // The characters upto the current index are already determined to be equal by the CFString's standard character folding algorithm. Extend as long as truly contextual (all letters and punctuations).
618 if (range1.location > 0) {
619 range1.location = __extendLocationBackward(range1.location - 1, str1, nonBaseBMP, punctBMP);
620 }
621
622 if (range2.location > 0) {
623 range2.location = __extendLocationBackward(range2.location - 1, str2, nonBaseBMP, punctBMP);
624 }
625
626 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
627 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
628 // First we try to use the last one used on this thread, if the locale is the same,
629 // otherwise we try to check out a default one, or then we create one.
630 UCollator *threadCollator = _CFGetTSD(__CFTSDKeyCollatorUCollator);
631 CFLocaleRef threadLocale = _CFGetTSD(__CFTSDKeyCollatorLocale);
632 if (compareLocale == threadLocale) {
633 collator = threadCollator;
634 } else {
635 #endif
636 collator = __CFStringCopyDefaultCollator((CFLocaleRef)compareLocale);
637 defaultCollator = true;
638 if (NULL == collator) {
639 collator = __CFStringCreateCollator((CFLocaleRef)compareLocale);
640 defaultCollator = false;
641 }
642 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
643 }
644 #endif
645 #endif
646
647 characters1 = CFStringGetCharactersPtrFromInlineBuffer(str1, range1);
648 characters2 = CFStringGetCharactersPtrFromInlineBuffer(str2, range2);
649
650 if ((NULL != characters1) && (NULL != characters2)) { // do fast
651 range1.length = (str1Range.location + str1Range.length) - range1.location;
652 range2.length = (str2Range.location + str2Range.length) - range2.location;
653
654 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
655 if ((NULL != collator) && (__CompareTextDefault(collator, options, characters1, range1.length, characters2, range2.length, &isEqual, &order) == 0 /* noErr */)) {
656 compResult = ((isEqual && !forcedOrdering) ? kCFCompareEqualTo : ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan));
657 } else
658 #endif
659 {
660 compResult = ((memcmp(characters1, characters2, sizeof(UniChar) * range1.length) < 0) ? kCFCompareLessThan : kCFCompareGreaterThan);
661 }
662 } else {
663 UniChar *buffer1 = NULL;
664 UniChar *buffer2 = NULL;
665 UTF16Char sBuffer1[kCFStringCompareAllocationIncrement];
666 UTF16Char sBuffer2[kCFStringCompareAllocationIncrement];
667 CFIndex buffer1Len = 0, buffer2Len = 0;
668 CFIndex str1Max = str1Range.location + str1Range.length;
669 CFIndex str2Max = str2Range.location + str2Range.length;
670 CFIndex bufferSize;
671
672 // Extend forward and compare until the result is deterministic. The result is indeterministic if the differences are weak and can be resolved by character folding. For example, comparision between "abc" and "ABC" is considered to be indeterministic.
673 do {
674 if (str1Range.location < str1Max) {
675 str1Range.location = __extendLocationForward(str1Range.location, str1, alnumBMP, punctBMP, controlBMP, str1Max);
676 range1.length = (str1Range.location - range1.location);
677 characters1 = CFStringGetCharactersPtrFromInlineBuffer(str1, range1);
678
679 if (NULL == characters1) {
680 if ((0 > buffer1Len) || (range1.length > kCFStringCompareAllocationIncrement)) {
681 if (buffer1Len < range1.length) {
682 bufferSize = range1.length + (kCFStringCompareAllocationIncrement - (range1.length % kCFStringCompareAllocationIncrement));
683 if (0 == buffer1Len) {
684 buffer1 = (UniChar *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF16Char) * bufferSize, 0);
685 } else if (buffer1Len < range1.length) {
686 buffer1 = (UniChar *)CFAllocatorReallocate(kCFAllocatorSystemDefault, buffer1, sizeof(UTF16Char) * bufferSize, 0);
687 }
688 buffer1Len = bufferSize;
689 }
690 } else {
691 buffer1 = sBuffer1;
692 }
693
694 CFStringGetCharactersFromInlineBuffer(str1, range1, buffer1);
695 characters1 = buffer1;
696 }
697 }
698
699 if (str2Range.location < str2Max) {
700 str2Range.location = __extendLocationForward(str2Range.location, str2, alnumBMP, punctBMP, controlBMP, str2Max);
701 range2.length = (str2Range.location - range2.location);
702 characters2 = CFStringGetCharactersPtrFromInlineBuffer(str2, range2);
703
704 if (NULL == characters2) {
705 if ((0 > buffer2Len) || (range2.length > kCFStringCompareAllocationIncrement)) {
706 if (buffer2Len < range2.length) {
707 bufferSize = range2.length + (kCFStringCompareAllocationIncrement - (range2.length % kCFStringCompareAllocationIncrement));
708 if (0 == buffer2Len) {
709 buffer2 = (UniChar *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF16Char) * bufferSize, 0);
710 } else if (buffer2Len < range2.length) {
711 buffer2 = (UniChar *)CFAllocatorReallocate(kCFAllocatorSystemDefault, buffer2, sizeof(UTF16Char) * bufferSize, 0);
712 }
713 buffer2Len = bufferSize;
714 }
715 } else {
716 buffer2 = sBuffer2;
717 }
718
719 CFStringGetCharactersFromInlineBuffer(str2, range2, buffer2);
720 characters2 = buffer2;
721 }
722 }
723
724 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
725 if ((NULL != collator) && (__CompareTextDefault(collator, options, characters1, range1.length, characters2, range2.length, &isEqual, &order) == 0 /* noErr */)) {
726 if (isEqual) {
727 if (forcedOrdering && (kCFCompareEqualTo == compResult) && (0 != order)) compResult = ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan);
728 order = 0;
729 }
730 } else
731 #endif
732 {
733 order = memcmp(characters1, characters2, sizeof(UTF16Char) * ((range1.length < range2.length) ? range1.length : range2.length));
734 if (0 == order) {
735 if (range1.length < range2.length) {
736 order = -2;
737 } else if (range2.length < range1.length) {
738 order = 2;
739 }
740 } else if (order < 0) {
741 --order;
742 } else if (order > 0) {
743 ++order;
744 }
745 }
746
747 if ((order < -1) || (order > 1)) break; // the result is deterministic
748
749 if (0 == order) {
750 range1.location = str1Range.location;
751 range2.location = str2Range.location;
752 }
753 } while ((str1Range.location < str1Max) || (str2Range.location < str2Max));
754
755 if (0 != order) compResult = ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan);
756
757 if (buffer1Len > 0) CFAllocatorDeallocate(kCFAllocatorSystemDefault, buffer1);
758 if (buffer2Len > 0) CFAllocatorDeallocate(kCFAllocatorSystemDefault, buffer2);
759 }
760
761 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
762 if (collator == threadCollator) {
763 // do nothing, already cached
764 } else {
765 if (threadLocale) __collatorFinalize((UCollator *)_CFGetTSD(__CFTSDKeyCollatorUCollator)); // need to dealloc collators
766
767 _CFSetTSD(__CFTSDKeyCollatorUCollator, collator, (void *)__collatorFinalize);
768 _CFSetTSD(__CFTSDKeyCollatorLocale, (void *)CFRetain(compareLocale), NULL);
769 }
770 #endif
771
772 return compResult;
773 }
774