]> git.saurik.com Git - apple/cf.git/blob - StringEncodings.subproj/CFUniChar.h
CF-368.28.tar.gz
[apple/cf.git] / StringEncodings.subproj / CFUniChar.h
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* CFUniChar.h
24 Copyright (c) 1998-2005, Apple, Inc. All rights reserved.
25 */
26
27 #if !defined(__COREFOUNDATION_CFUNICHAR__)
28 #define __COREFOUNDATION_CFUNICHAR__ 1
29
30
31 #include <CoreFoundation/CFByteOrder.h>
32 #include <CoreFoundation/CFBase.h>
33
34 #if defined(__cplusplus)
35 extern "C" {
36 #endif
37
// Character-set bitmaps hold one bit per UTF-16 code unit, eight code units per byte.

// Right shift applied to a code unit to obtain its byte index inside a bitmap.
#define kCFUniCharBitShiftForByte	(3)

// Despite the "Shift" in its name, this value is ANDed with a code unit to
// obtain the bit position (0-7) inside the selected byte.
#define kCFUniCharBitShiftForMask	(7)
40
41 CF_INLINE Boolean CFUniCharIsSurrogateHighCharacter(UniChar character) {
42 return ((character >= 0xD800UL) && (character <= 0xDBFFUL) ? true : false);
43 }
44
45 CF_INLINE Boolean CFUniCharIsSurrogateLowCharacter(UniChar character) {
46 return ((character >= 0xDC00UL) && (character <= 0xDFFFUL) ? true : false);
47 }
48
49 CF_INLINE UTF32Char CFUniCharGetLongCharacterForSurrogatePair(UniChar surrogateHigh, UniChar surrogateLow) {
50 return ((surrogateHigh - 0xD800UL) << 10) + (surrogateLow - 0xDC00UL) + 0x0010000UL;
51 }
52
// The following values coincide with the TextEncodingFormat constants defined in TextCommon.h
// (note that the value 1 is intentionally not used here).
enum {
    kCFUniCharUTF16Format = 0,
    kCFUniCharUTF8Format  = 2,
    kCFUniCharUTF32Format = 3
};
59
60 CF_INLINE bool CFUniCharIsMemberOfBitmap(UTF16Char theChar, const uint8_t *bitmap) {
61 return (bitmap && (bitmap[(theChar) >> kCFUniCharBitShiftForByte] & (((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask))) ? true : false);
62 }
63
64 CF_INLINE void CFUniCharAddCharacterToBitmap(UTF16Char theChar, uint8_t *bitmap) {
65 bitmap[(theChar) >> kCFUniCharBitShiftForByte] |= (((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask));
66 }
67
68 CF_INLINE void CFUniCharRemoveCharacterFromBitmap(UTF16Char theChar, uint8_t *bitmap) {
69 bitmap[(theChar) >> kCFUniCharBitShiftForByte] &= ~(((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask));
70 }
71
// Character-set identifiers, passed as the `charset` argument of the
// CFUniChar character-set functions declared in this header.
// Values are spelled out explicitly; numbering starts at 1.
enum {
    kCFUniCharControlCharacterSet                   = 1,
    kCFUniCharWhitespaceCharacterSet                = 2,
    kCFUniCharWhitespaceAndNewlineCharacterSet      = 3,
    kCFUniCharDecimalDigitCharacterSet              = 4,
    kCFUniCharLetterCharacterSet                    = 5,
    kCFUniCharLowercaseLetterCharacterSet           = 6,
    kCFUniCharUppercaseLetterCharacterSet           = 7,
    kCFUniCharNonBaseCharacterSet                   = 8,
    kCFUniCharCanonicalDecomposableCharacterSet     = 9,
    // Alias: shares its value with the canonical-decomposable set.
    kCFUniCharDecomposableCharacterSet              = kCFUniCharCanonicalDecomposableCharacterSet,
    kCFUniCharAlphaNumericCharacterSet              = 10,
    kCFUniCharPunctuationCharacterSet               = 11,
    kCFUniCharIllegalCharacterSet                   = 12,
    kCFUniCharTitlecaseLetterCharacterSet           = 13,
    kCFUniCharSymbolAndOperatorCharacterSet         = 14,
    kCFUniCharCompatibilityDecomposableCharacterSet = 15,
    kCFUniCharHFSPlusDecomposableCharacterSet       = 16,
    kCFUniCharStrongRightToLeftCharacterSet         = 17,
    kCFUniCharHasNonSelfLowercaseCharacterSet       = 18,
    kCFUniCharHasNonSelfUppercaseCharacterSet       = 19,
    kCFUniCharHasNonSelfTitlecaseCharacterSet       = 20,
    kCFUniCharHasNonSelfCaseFoldingCharacterSet     = 21,
    kCFUniCharHasNonSelfMirrorMappingCharacterSet   = 22,
    kCFUniCharControlAndFormatterCharacterSet       = 23,
    kCFUniCharCaseIgnorableCharacterSet             = 24
};
99
100 CF_EXPORT bool CFUniCharIsMemberOf(UTF32Char theChar, uint32_t charset);
101
102 // This function returns NULL for kCFUniCharControlCharacterSet, kCFUniCharWhitespaceCharacterSet, kCFUniCharWhitespaceAndNewlineCharacterSet, & kCFUniCharIllegalCharacterSet
103 CF_EXPORT const uint8_t *CFUniCharGetBitmapPtrForPlane(uint32_t charset, uint32_t plane);
104
// Bitmap status codes, listed here in ascending numeric order.
// NOTE(review): presumably these are the uint8_t results of
// CFUniCharGetBitmapForPlane -- confirm against the implementation.
enum {
    kCFUniCharBitmapFilled = (uint8_t)0,
    kCFUniCharBitmapAll    = (uint8_t)1,
    kCFUniCharBitmapEmpty  = (uint8_t)0xFF
};
110
111 CF_EXPORT uint8_t CFUniCharGetBitmapForPlane(uint32_t charset, uint32_t plane, void *bitmap, bool isInverted);
112
113 CF_EXPORT uint32_t CFUniCharGetNumberOfPlanes(uint32_t charset);
114
// Case-mapping kinds.
// NOTE(review): presumably the `ctype` / `type` arguments of
// CFUniCharMapCaseTo and CFUniCharGetConditionalCaseMappingFlags -- confirm.
enum {
    kCFUniCharToLowercase = 0,
    kCFUniCharToUppercase = 1,
    kCFUniCharToTitlecase = 2,
    kCFUniCharCaseFold    = 3
};
121
// Conditional case-mapping flags; each occupies its own bit so they can be ORed together.
enum {
    kCFUniCharCaseMapFinalSigma = (1 << 0),
    kCFUniCharCaseMapAfter_i    = (1 << 1),
    kCFUniCharCaseMapMoreAbove  = (1 << 2)
};
127
128 CF_EXPORT uint32_t CFUniCharMapCaseTo(UTF32Char theChar, UTF16Char *convertedChar, uint32_t maxLength, uint32_t ctype, uint32_t flags, const uint8_t *langCode);
129
130 CF_EXPORT uint32_t CFUniCharGetConditionalCaseMappingFlags(UTF32Char theChar, UTF16Char *buffer, uint32_t currentIndex, uint32_t length, uint32_t type, const uint8_t *langCode, uint32_t lastFlags);
131
// Bidirectional property values as stored in the BiDi property tables.
// kCFUniCharBiDiPropertyPDF is the largest value; CFUniCharGetBidiPropertyForCharacter
// relies on that to tell direct property values apart from sub-table selectors.
enum {
    kCFUniCharBiDiPropertyON  = 0,
    kCFUniCharBiDiPropertyL   = 1,
    kCFUniCharBiDiPropertyR   = 2,
    kCFUniCharBiDiPropertyAN  = 3,
    kCFUniCharBiDiPropertyEN  = 4,
    kCFUniCharBiDiPropertyAL  = 5,
    kCFUniCharBiDiPropertyNSM = 6,
    kCFUniCharBiDiPropertyCS  = 7,
    kCFUniCharBiDiPropertyES  = 8,
    kCFUniCharBiDiPropertyET  = 9,
    kCFUniCharBiDiPropertyBN  = 10,
    kCFUniCharBiDiPropertyS   = 11,
    kCFUniCharBiDiPropertyWS  = 12,
    kCFUniCharBiDiPropertyB   = 13,
    kCFUniCharBiDiPropertyRLO = 14,
    kCFUniCharBiDiPropertyRLE = 15,
    kCFUniCharBiDiPropertyLRO = 16,
    kCFUniCharBiDiPropertyLRE = 17,
    kCFUniCharBiDiPropertyPDF = 18
};
153
// Unicode property kinds.
// NOTE(review): presumably the `propertyType` argument of the
// CFUniCharGet...UnicodeProperty... functions -- confirm.
enum {
    kCFUniCharCombiningProperty = 0,
    kCFUniCharBidiProperty      = 1
};
158
159 // The second arg 'bitmap' has to be the pointer to a specific plane
160 CF_INLINE uint8_t CFUniCharGetBidiPropertyForCharacter(UTF16Char character, const uint8_t *bitmap) {
161 if (bitmap) {
162 uint8_t value = bitmap[(character >> 8)];
163
164 if (value > kCFUniCharBiDiPropertyPDF) {
165 bitmap = bitmap + 256 + ((value - kCFUniCharBiDiPropertyPDF - 1) * 256);
166 return bitmap[character % 256];
167 } else {
168 return value;
169 }
170 }
171 return kCFUniCharBiDiPropertyL;
172 }
173
174 CF_INLINE uint8_t CFUniCharGetCombiningPropertyForCharacter(UTF16Char character, const uint8_t *bitmap) {
175 if (bitmap) {
176 uint8_t value = bitmap[(character >> 8)];
177
178 if (value) {
179 bitmap = bitmap + 256 + ((value - 1) * 256);
180 return bitmap[character % 256];
181 }
182 }
183 return 0;
184 }
185
186 CF_EXPORT const void *CFUniCharGetUnicodePropertyDataForPlane(uint32_t propertyType, uint32_t plane);
187 CF_EXPORT uint32_t CFUniCharGetNumberOfPlanesForUnicodePropertyData(uint32_t propertyType);
188 CF_EXPORT uint32_t CFUniCharGetUnicodeProperty(UTF32Char character, uint32_t propertyType);
189
190 CF_EXPORT bool CFUniCharFillDestinationBuffer(const UTF32Char *src, uint32_t srcLength, void **dst, uint32_t dstLength, uint32_t *filledLength, uint32_t dstFormat);
191
192 // UTF32 support
193
194 CF_INLINE bool CFUniCharToUTF32(const UTF16Char *src, CFIndex length, UTF32Char *dst, bool allowLossy, bool isBigEndien) {
195 const UTF16Char *limit = src + length;
196 UTF32Char character;
197
198 while (src < limit) {
199 character = *(src++);
200
201 if (CFUniCharIsSurrogateHighCharacter(character)) {
202 if ((src < limit) && CFUniCharIsSurrogateLowCharacter(*src)) {
203 character = CFUniCharGetLongCharacterForSurrogatePair(character, *(src++));
204 } else {
205 if (!allowLossy) return false;
206 character = 0xFFFD; // replacement character
207 }
208 } else if (CFUniCharIsSurrogateLowCharacter(character)) {
209 if (!allowLossy) return false;
210 character = 0xFFFD; // replacement character
211 }
212
213 *(dst++) = (isBigEndien ? CFSwapInt32HostToBig(character) : CFSwapInt32HostToLittle(character));
214 }
215
216 return true;
217 }
218
219 CF_INLINE bool CFUniCharFromUTF32(const UTF32Char *src, CFIndex length, UTF16Char *dst, bool allowLossy, bool isBigEndien) {
220 const UTF32Char *limit = src + length;
221 UTF32Char character;
222
223 while (src < limit) {
224 character = (isBigEndien ? CFSwapInt32BigToHost(*(src++)) : CFSwapInt32LittleToHost(*(src++)));
225
226 if (character < 0xFFFF) { // BMP
227 if (allowLossy) {
228 if (CFUniCharIsSurrogateHighCharacter(character)) {
229 UTF32Char otherCharacter = 0xFFFD; // replacement character
230
231 if (src < limit) {
232 otherCharacter = (isBigEndien ? CFSwapInt32BigToHost(*src) : CFSwapInt32LittleToHost(*src));
233
234
235 if ((otherCharacter < 0x10000) && CFUniCharIsSurrogateLowCharacter(otherCharacter)) {
236 *(dst++) = character; ++src;
237 } else {
238 otherCharacter = 0xFFFD; // replacement character
239 }
240 }
241
242 character = otherCharacter;
243 } else if (CFUniCharIsSurrogateLowCharacter(character)) {
244 character = 0xFFFD; // replacement character
245 }
246 } else {
247 if (CFUniCharIsSurrogateHighCharacter(character) || CFUniCharIsSurrogateLowCharacter(character)) return false;
248 }
249 } else if (character < 0x110000) { // non-BMP
250 character -= 0x10000;
251 *(dst++) = (UTF16Char)((character >> 10) + 0xD800UL);
252 character = (UTF16Char)((character & 0x3FF) + 0xDC00UL);
253 } else {
254 if (!allowLossy) return false;
255 character = 0xFFFD; // replacement character
256 }
257
258 *(dst++) = character;
259 }
260 return true;
261 }
262
263 #if defined(__cplusplus)
264 }
265 #endif
266
267 #endif /* ! __COREFOUNDATION_CFUNICHAR__ */
268