]> git.saurik.com Git - apple/cf.git/blob - CFUniChar.h
e3df9638f5ab743965bda9dfac784792fbd94e3e
[apple/cf.git] / CFUniChar.h
1 /*
2 * Copyright (c) 2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /* CFUniChar.h
25 Copyright (c) 1998-2009, Apple Inc. All rights reserved.
26 */
27
28 #if !defined(__COREFOUNDATION_CFUNICHAR__)
29 #define __COREFOUNDATION_CFUNICHAR__ 1
30
31
32 #include <CoreFoundation/CFByteOrder.h>
33 #include <CoreFoundation/CFBase.h>
34
35 CF_EXTERN_C_BEGIN
36
/* Bit-addressing constants used by the bitmap helpers in this header:
 * a character's byte index is (theChar >> kCFUniCharBitShiftForByte) and
 * its bit within that byte is (theChar & kCFUniCharBitShiftForMask),
 * i.e. eight characters are packed into each bitmap byte.
 */
#define kCFUniCharBitShiftForByte (3)
#define kCFUniCharBitShiftForMask (7)
39
40 CF_INLINE bool CFUniCharIsSurrogateHighCharacter(UniChar character) {
41 return ((character >= 0xD800UL) && (character <= 0xDBFFUL) ? true : false);
42 }
43
44 CF_INLINE bool CFUniCharIsSurrogateLowCharacter(UniChar character) {
45 return ((character >= 0xDC00UL) && (character <= 0xDFFFUL) ? true : false);
46 }
47
48 CF_INLINE UTF32Char CFUniCharGetLongCharacterForSurrogatePair(UniChar surrogateHigh, UniChar surrogateLow) {
49 return ((surrogateHigh - 0xD800UL) << 10) + (surrogateLow - 0xDC00UL) + 0x0010000UL;
50 }
51
// The following values coincide with the TextEncodingFormat defines in TextCommon.h.
// Note that the value 1 is not assigned by this enum.
enum {
    kCFUniCharUTF16Format = 0,
    kCFUniCharUTF8Format  = 2,
    kCFUniCharUTF32Format = 3
};
58
59 CF_INLINE bool CFUniCharIsMemberOfBitmap(UTF16Char theChar, const uint8_t *bitmap) {
60 return (bitmap && (bitmap[(theChar) >> kCFUniCharBitShiftForByte] & (((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask))) ? true : false);
61 }
62
63 CF_INLINE void CFUniCharAddCharacterToBitmap(UTF16Char theChar, uint8_t *bitmap) {
64 bitmap[(theChar) >> kCFUniCharBitShiftForByte] |= (((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask));
65 }
66
67 CF_INLINE void CFUniCharRemoveCharacterFromBitmap(UTF16Char theChar, uint8_t *bitmap) {
68 bitmap[(theChar) >> kCFUniCharBitShiftForByte] &= ~(((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask));
69 }
70
// Character set identifiers, passed as the `charset` argument of the
// functions declared in this header. Values are spelled out explicitly;
// the numbering is unchanged.
enum {
    kCFUniCharControlCharacterSet                   = 1,
    kCFUniCharWhitespaceCharacterSet                = 2,
    kCFUniCharWhitespaceAndNewlineCharacterSet      = 3,
    kCFUniCharDecimalDigitCharacterSet              = 4,
    kCFUniCharLetterCharacterSet                    = 5,
    kCFUniCharLowercaseLetterCharacterSet           = 6,
    kCFUniCharUppercaseLetterCharacterSet           = 7,
    kCFUniCharNonBaseCharacterSet                   = 8,
    kCFUniCharCanonicalDecomposableCharacterSet     = 9,
    kCFUniCharDecomposableCharacterSet              = kCFUniCharCanonicalDecomposableCharacterSet, // alias, same value
    kCFUniCharAlphaNumericCharacterSet              = 10,
    kCFUniCharPunctuationCharacterSet               = 11,
    kCFUniCharIllegalCharacterSet                   = 12,
    kCFUniCharTitlecaseLetterCharacterSet           = 13,
    kCFUniCharSymbolAndOperatorCharacterSet         = 14,
    kCFUniCharNewlineCharacterSet                   = 15,

    // Internal character sets begin here.
    kCFUniCharCompatibilityDecomposableCharacterSet = 100,
    kCFUniCharHFSPlusDecomposableCharacterSet       = 101,
    kCFUniCharStrongRightToLeftCharacterSet         = 102,
    kCFUniCharHasNonSelfLowercaseCharacterSet       = 103,
    kCFUniCharHasNonSelfUppercaseCharacterSet       = 104,
    kCFUniCharHasNonSelfTitlecaseCharacterSet       = 105,
    kCFUniCharHasNonSelfCaseFoldingCharacterSet     = 106,
    kCFUniCharHasNonSelfMirrorMappingCharacterSet   = 107,
    kCFUniCharControlAndFormatterCharacterSet       = 108,
    kCFUniCharCaseIgnorableCharacterSet             = 109,
    kCFUniCharGraphemeExtendCharacterSet            = 110
};
101
102 CF_EXPORT bool CFUniCharIsMemberOf(UTF32Char theChar, uint32_t charset);
103
104 // This function returns NULL for kCFUniCharControlCharacterSet, kCFUniCharWhitespaceCharacterSet, kCFUniCharWhitespaceAndNewlineCharacterSet, & kCFUniCharIllegalCharacterSet
105 CF_EXPORT const uint8_t *CFUniCharGetBitmapPtrForPlane(uint32_t charset, uint32_t plane);
106
// Bitmap status codes, each cast to uint8_t.
// NOTE(review): presumably these are the values returned by
// CFUniCharGetBitmapForPlane -- confirm against the implementation.
enum {
    kCFUniCharBitmapFilled = (uint8_t)0,
    kCFUniCharBitmapAll    = (uint8_t)1,
    kCFUniCharBitmapEmpty  = (uint8_t)0xFF
};
112
113 CF_EXPORT uint8_t CFUniCharGetBitmapForPlane(uint32_t charset, uint32_t plane, void *bitmap, bool isInverted);
114
115 CF_EXPORT uint32_t CFUniCharGetNumberOfPlanes(uint32_t charset);
116
// Case-mapping kinds.
// NOTE(review): presumably the `ctype` argument of CFUniCharMapCaseTo -- confirm.
enum {
    kCFUniCharToLowercase = 0,
    kCFUniCharToUppercase = 1,
    kCFUniCharToTitlecase = 2,
    kCFUniCharCaseFold    = 3
};
123
// Conditional case-mapping flags; each occupies its own bit so they can be
// OR-ed together.
enum {
    kCFUniCharCaseMapFinalSigma = 0x1UL, // bit 0
    kCFUniCharCaseMapAfter_i    = 0x2UL, // bit 1
    kCFUniCharCaseMapMoreAbove  = 0x4UL  // bit 2
};
129
130 CF_EXPORT CFIndex CFUniCharMapCaseTo(UTF32Char theChar, UTF16Char *convertedChar, CFIndex maxLength, uint32_t ctype, uint32_t flags, const uint8_t *langCode);
131
132 CF_EXPORT uint32_t CFUniCharGetConditionalCaseMappingFlags(UTF32Char theChar, UTF16Char *buffer, CFIndex currentIndex, CFIndex length, uint32_t type, const uint8_t *langCode, uint32_t lastFlags);
133
// Bidirectional property codes. kCFUniCharBiDiPropertyPDF is the largest
// value; CFUniCharGetBidiPropertyForCharacter relies on that to tell a
// direct property value from a sub-table selector.
enum {
    kCFUniCharBiDiPropertyON  = 0,
    kCFUniCharBiDiPropertyL   = 1,
    kCFUniCharBiDiPropertyR   = 2,
    kCFUniCharBiDiPropertyAN  = 3,
    kCFUniCharBiDiPropertyEN  = 4,
    kCFUniCharBiDiPropertyAL  = 5,
    kCFUniCharBiDiPropertyNSM = 6,
    kCFUniCharBiDiPropertyCS  = 7,
    kCFUniCharBiDiPropertyES  = 8,
    kCFUniCharBiDiPropertyET  = 9,
    kCFUniCharBiDiPropertyBN  = 10,
    kCFUniCharBiDiPropertyS   = 11,
    kCFUniCharBiDiPropertyWS  = 12,
    kCFUniCharBiDiPropertyB   = 13,
    kCFUniCharBiDiPropertyRLO = 14,
    kCFUniCharBiDiPropertyRLE = 15,
    kCFUniCharBiDiPropertyLRO = 16,
    kCFUniCharBiDiPropertyLRE = 17,
    kCFUniCharBiDiPropertyPDF = 18
};
155
// Unicode property table selectors.
// NOTE(review): presumably the `propertyType` argument of the
// CFUniCharGetUnicodeProperty* functions -- confirm.
enum {
    kCFUniCharCombiningProperty = 0,
    kCFUniCharBidiProperty      = 1
};
160
161 // The second arg 'bitmap' has to be the pointer to a specific plane
162 CF_INLINE uint8_t CFUniCharGetBidiPropertyForCharacter(UTF16Char character, const uint8_t *bitmap) {
163 if (bitmap) {
164 uint8_t value = bitmap[(character >> 8)];
165
166 if (value > kCFUniCharBiDiPropertyPDF) {
167 bitmap = bitmap + 256 + ((value - kCFUniCharBiDiPropertyPDF - 1) * 256);
168 return bitmap[character % 256];
169 } else {
170 return value;
171 }
172 }
173 return kCFUniCharBiDiPropertyL;
174 }
175
176 CF_INLINE uint8_t CFUniCharGetCombiningPropertyForCharacter(UTF16Char character, const uint8_t *bitmap) {
177 if (bitmap) {
178 uint8_t value = bitmap[(character >> 8)];
179
180 if (value) {
181 bitmap = bitmap + 256 + ((value - 1) * 256);
182 return bitmap[character % 256];
183 }
184 }
185 return 0;
186 }
187
188 CF_EXPORT const void *CFUniCharGetUnicodePropertyDataForPlane(uint32_t propertyType, uint32_t plane);
189 CF_EXPORT uint32_t CFUniCharGetNumberOfPlanesForUnicodePropertyData(uint32_t propertyType);
190 CF_EXPORT uint32_t CFUniCharGetUnicodeProperty(UTF32Char character, uint32_t propertyType);
191
192 CF_EXPORT bool CFUniCharFillDestinationBuffer(const UTF32Char *src, CFIndex srcLength, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat);
193
194 // UTF32 support
195
196 CF_INLINE bool CFUniCharToUTF32(const UTF16Char *src, CFIndex length, UTF32Char *dst, bool allowLossy, bool isBigEndien) {
197 const UTF16Char *limit = src + length;
198 UTF32Char character;
199
200 while (src < limit) {
201 character = *(src++);
202
203 if (CFUniCharIsSurrogateHighCharacter(character)) {
204 if ((src < limit) && CFUniCharIsSurrogateLowCharacter(*src)) {
205 character = CFUniCharGetLongCharacterForSurrogatePair(character, *(src++));
206 } else {
207 if (!allowLossy) return false;
208 character = 0xFFFD; // replacement character
209 }
210 } else if (CFUniCharIsSurrogateLowCharacter(character)) {
211 if (!allowLossy) return false;
212 character = 0xFFFD; // replacement character
213 }
214
215 *(dst++) = (isBigEndien ? CFSwapInt32HostToBig(character) : CFSwapInt32HostToLittle(character));
216 }
217
218 return true;
219 }
220
221 CF_INLINE bool CFUniCharFromUTF32(const UTF32Char *src, CFIndex length, UTF16Char *dst, bool allowLossy, bool isBigEndien) {
222 const UTF32Char *limit = src + length;
223 UTF32Char character;
224
225 while (src < limit) {
226 character = (isBigEndien ? CFSwapInt32BigToHost(*(src++)) : CFSwapInt32LittleToHost(*(src++)));
227
228 if (character < 0xFFFF) { // BMP
229 if (allowLossy) {
230 if (CFUniCharIsSurrogateHighCharacter(character)) {
231 UTF32Char otherCharacter = 0xFFFD; // replacement character
232
233 if (src < limit) {
234 otherCharacter = (isBigEndien ? CFSwapInt32BigToHost(*src) : CFSwapInt32LittleToHost(*src));
235
236
237 if ((otherCharacter < 0x10000) && CFUniCharIsSurrogateLowCharacter(otherCharacter)) {
238 *(dst++) = character; ++src;
239 } else {
240 otherCharacter = 0xFFFD; // replacement character
241 }
242 }
243
244 character = otherCharacter;
245 } else if (CFUniCharIsSurrogateLowCharacter(character)) {
246 character = 0xFFFD; // replacement character
247 }
248 } else {
249 if (CFUniCharIsSurrogateHighCharacter(character) || CFUniCharIsSurrogateLowCharacter(character)) return false;
250 }
251 } else if (character < 0x110000) { // non-BMP
252 character -= 0x10000;
253 *(dst++) = (UTF16Char)((character >> 10) + 0xD800UL);
254 character = (UTF16Char)((character & 0x3FF) + 0xDC00UL);
255 } else {
256 if (!allowLossy) return false;
257 character = 0xFFFD; // replacement character
258 }
259
260 *(dst++) = character;
261 }
262 return true;
263 }
264
265 CF_EXTERN_C_END
266
267 #endif /* ! __COREFOUNDATION_CFUNICHAR__ */
268