]>
Commit | Line | Data |
---|---|---|
1c79356b | 1 | /* |
0b4e3aa0 | 2 | * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. |
1c79356b A |
3 | * |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * The contents of this file constitute Original Code as defined in and | |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
11 | * | |
12 | * This Original Code and all software distributed under the License are | |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the | |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
19 | * | |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
1c79356b A |
22 | #include <sys/param.h> |
23 | #include <sys/utfconv.h> | |
24 | #include <sys/errno.h> | |
25 | #include <architecture/byte_order.h> | |
26 | ||
1c79356b | 27 | /* |
765c9de3 | 28 | * UTF-8 (Unicode Transformation Format) |
1c79356b | 29 | * |
765c9de3 A |
30 | * UTF-8 is the Unicode Transformation Format that serializes a Unicode |
31 | * character as a sequence of one to four bytes. Only the shortest form | |
32 | * required to represent the significant Unicode bits is legal. | |
1c79356b A |
33 | * |
34 | * UTF-8 Multibyte Codes | |
35 | * | |
765c9de3 A |
36 | * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary) |
37 | * ----------------------------------------------------------------------------- | |
38 | * 1 7 0x0000 0x007F 0xxxxxxx | |
39 | * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx | |
40 | * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx | |
41 | * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
42 | * ----------------------------------------------------------------------------- | |
1c79356b A |
43 | */ |
44 | ||
45 | ||
765c9de3 A |
/*
 * UNICODE_TO_UTF8_LEN - number of UTF-8 bytes for one 16-bit Unicode unit.
 *
 * Each half of a surrogate pair (0xD800 - 0xDFFF) counts as 2 bytes, so a
 * complete pair adds up to the 4 bytes of its UTF-8 form.
 *
 * NOTE: the argument is evaluated more than once.
 */
#define UNICODE_TO_UTF8_LEN(c)				\
	((c) < 0x0080 ? 1 :				\
	 (c) < 0x0800 ? 2 :				\
	 ((c) & 0xf800) == 0xd800 ? 2 : 3)

/* Stand-in stored in UTF-8 for an embedded Unicode NULL */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10
#define SP_HALF_BASE	0x0010000UL
#define SP_HALF_MASK	0x3FFUL

#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	0xDBFFUL
#define SP_LOW_FIRST	0xDC00UL
#define SP_LOW_LAST	0xDFFFUL


/* Local helpers, defined further down in this file */
static u_int16_t ucs_decompose(u_int16_t, u_int16_t *);

static u_int16_t ucs_combine(u_int16_t base, u_int16_t comb);


/*
 * Number of bytes that follow a given UTF-8 lead byte, indexed by the
 * lead byte's upper five bits (byte >> 3).  -1 marks a byte value that
 * cannot start a sequence.
 */
char utf_extrabytes[32] = {
	/* 0x00 - 0x3F */   0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x40 - 0x7F */   0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x80 - 0xBF */  -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 - 0xDF */   1,  1,  1,  1,
	/* 0xE0 - 0xEF */   2,  2,
	/* 0xF0 - 0xF7 */   3,
	/* 0xF8 - 0xFF */  -1
};
71 | ||
72 | ||
1c79356b | 73 | /* |
765c9de3 | 74 | * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename |
1c79356b A |
75 | * |
76 | * NOTES: | |
77 | * If '/' chars are allowed on disk then an alternate | |
78 | * (replacement) char must be provided in altslash. | |
79 | * | |
80 | * input flags: | |
765c9de3 | 81 | * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime |
1c79356b A |
82 | */ |
83 | size_t | |
765c9de3 A |
84 | utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, |
85 | int flags) | |
1c79356b A |
86 | { |
87 | u_int16_t ucs_ch; | |
88 | int charcnt; | |
89 | int swapbytes = (flags & UTF_REVERSE_ENDIAN); | |
90 | size_t len; | |
91 | ||
92 | charcnt = ucslen / 2; | |
93 | len = 0; | |
94 | ||
95 | while (charcnt-- > 0) { | |
96 | ucs_ch = *ucsp++; | |
97 | ||
98 | if (swapbytes) | |
99 | ucs_ch = NXSwapShort(ucs_ch); | |
0b4e3aa0 A |
100 | if (ucs_ch == '/') |
101 | ucs_ch = altslash ? altslash : '_'; | |
102 | else if (ucs_ch == '\0') | |
103 | ucs_ch = UCS_ALT_NULL; | |
1c79356b | 104 | |
765c9de3 | 105 | len += UNICODE_TO_UTF8_LEN(ucs_ch); |
1c79356b A |
106 | } |
107 | ||
108 | return (len); | |
109 | } | |
110 | ||
111 | ||
112 | /* | |
765c9de3 | 113 | * utf8_encodestr - Encodes a Unicode string to UTF-8 |
1c79356b A |
114 | * |
115 | * NOTES: | |
0b4e3aa0 | 116 | * The resulting UTF-8 string is NULL terminated. |
1c79356b A |
117 | * |
118 | * If '/' chars are allowed on disk then an alternate | |
119 | * (replacement) char must be provided in altslash. | |
120 | * | |
121 | * input flags: | |
765c9de3 | 122 | * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime |
1c79356b | 123 | * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output |
0b4e3aa0 A |
124 | * |
125 | * result: | |
126 | * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded | |
127 | * EINVAL: Illegal char found; char was replaced by an '_'. | |
1c79356b | 128 | */ |
765c9de3 A |
129 | int |
130 | utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, | |
131 | size_t * utf8len, size_t buflen, u_int16_t altslash, int flags) | |
1c79356b A |
132 | { |
133 | u_int8_t * bufstart; | |
134 | u_int8_t * bufend; | |
135 | u_int16_t ucs_ch; | |
0b4e3aa0 | 136 | u_int16_t extra[2] = {0}; |
1c79356b A |
137 | int charcnt; |
138 | int swapbytes = (flags & UTF_REVERSE_ENDIAN); | |
0b4e3aa0 A |
139 | int nullterm = ((flags & UTF_NO_NULL_TERM) == 0); |
140 | int decompose = (flags & UTF_DECOMPOSED); | |
1c79356b A |
141 | int result = 0; |
142 | ||
143 | bufstart = utf8p; | |
144 | bufend = bufstart + buflen; | |
145 | if (nullterm) | |
146 | --bufend; | |
147 | charcnt = ucslen / 2; | |
148 | ||
149 | while (charcnt-- > 0) { | |
0b4e3aa0 A |
150 | if (!decompose) |
151 | ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++; | |
152 | else if (extra[0]) { | |
153 | ucs_ch = extra[0]; extra[0] = 0; | |
154 | } else if (extra[1]) { | |
155 | ucs_ch = extra[1]; extra[1] = 0; | |
156 | } else { | |
157 | ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++; | |
158 | ucs_ch = ucs_decompose(ucs_ch, &extra[0]); | |
159 | if (extra[0]) | |
160 | charcnt++; | |
161 | if (extra[1]) | |
162 | charcnt++; | |
163 | } | |
1c79356b | 164 | |
0b4e3aa0 A |
165 | /* Slash and NULL are not permitted */ |
166 | if (ucs_ch == '/') { | |
167 | if (altslash) | |
168 | ucs_ch = altslash; | |
169 | else { | |
170 | ucs_ch = '_'; | |
171 | result = EINVAL; | |
172 | } | |
173 | } else if (ucs_ch == '\0') { | |
174 | ucs_ch = UCS_ALT_NULL; | |
175 | } | |
1c79356b | 176 | |
0b4e3aa0 | 177 | if (ucs_ch < 0x0080) { |
1c79356b A |
178 | if (utf8p >= bufend) { |
179 | result = ENAMETOOLONG; | |
180 | break; | |
765c9de3 | 181 | } |
1c79356b A |
182 | *utf8p++ = ucs_ch; |
183 | ||
184 | } else if (ucs_ch < 0x800) { | |
185 | if ((utf8p + 1) >= bufend) { | |
186 | result = ENAMETOOLONG; | |
187 | break; | |
188 | } | |
765c9de3 A |
189 | *utf8p++ = 0xc0 | (ucs_ch >> 6); |
190 | *utf8p++ = 0x80 | (0x3f & ucs_ch); | |
1c79356b A |
191 | |
192 | } else { | |
765c9de3 A |
193 | /* Combine valid surrogate pairs */ |
194 | if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST | |
195 | && charcnt > 0) { | |
196 | u_int16_t ch2; | |
197 | u_int32_t pair; | |
198 | ||
199 | ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp; | |
200 | if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) { | |
201 | pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT) | |
202 | + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE; | |
203 | if ((utf8p + 3) >= bufend) { | |
204 | result = ENAMETOOLONG; | |
205 | break; | |
206 | } | |
207 | --charcnt; | |
208 | ++ucsp; | |
209 | *utf8p++ = 0xf0 | (pair >> 18); | |
210 | *utf8p++ = 0x80 | (0x3f & (pair >> 12)); | |
211 | *utf8p++ = 0x80 | (0x3f & (pair >> 6)); | |
212 | *utf8p++ = 0x80 | (0x3f & pair); | |
213 | continue; | |
214 | } | |
215 | } | |
1c79356b A |
216 | if ((utf8p + 2) >= bufend) { |
217 | result = ENAMETOOLONG; | |
218 | break; | |
219 | } | |
765c9de3 A |
220 | *utf8p++ = 0xe0 | (ucs_ch >> 12); |
221 | *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6)); | |
222 | *utf8p++ = 0x80 | (0x3f & ucs_ch); | |
1c79356b A |
223 | } |
224 | } | |
225 | ||
226 | *utf8len = utf8p - bufstart; | |
227 | if (nullterm) | |
228 | *utf8p++ = '\0'; | |
229 | ||
230 | return (result); | |
231 | } | |
232 | ||
233 | ||
234 | /* | |
765c9de3 | 235 | * utf8_decodestr - Decodes a UTF-8 string back to Unicode |
1c79356b A |
236 | * |
237 | * NOTES: | |
238 | * The input UTF-8 string does not need to be null terminated | |
239 | * if utf8len is set. | |
240 | * | |
241 | * If '/' chars are allowed on disk then an alternate | |
242 | * (replacement) char must be provided in altslash. | |
243 | * | |
244 | * input flags: | |
765c9de3 A |
245 | * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime |
246 | * UTF_DECOMPOSED: Unicode output string must be fully decompsed | |
0b4e3aa0 A |
247 | * |
248 | * result: | |
249 | * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded. | |
250 | * EINVAL: Illegal UTF-8 sequence found. | |
1c79356b A |
251 | */ |
252 | int | |
765c9de3 A |
253 | utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, |
254 | size_t *ucslen, size_t buflen, u_int16_t altslash, int flags) | |
1c79356b A |
255 | { |
256 | u_int16_t* bufstart; | |
257 | u_int16_t* bufend; | |
258 | u_int16_t ucs_ch; | |
259 | u_int8_t byte; | |
260 | int result = 0; | |
0b4e3aa0 | 261 | int decompose, precompose, swapbytes; |
1c79356b | 262 | |
0b4e3aa0 A |
263 | decompose = (flags & UTF_DECOMPOSED); |
264 | precompose = (flags & UTF_PRECOMPOSED); | |
265 | swapbytes = (flags & UTF_REVERSE_ENDIAN); | |
1c79356b A |
266 | |
267 | bufstart = ucsp; | |
268 | bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen); | |
269 | ||
270 | while (utf8len-- > 0 && (byte = *utf8p++) != '\0') { | |
765c9de3 A |
271 | if (ucsp >= bufend) |
272 | goto toolong; | |
1c79356b A |
273 | |
274 | /* check for ascii */ | |
275 | if (byte < 0x80) { | |
765c9de3 | 276 | ucs_ch = byte; /* 1st byte */ |
1c79356b | 277 | } else { |
765c9de3 A |
278 | u_int32_t ch; |
279 | int extrabytes = utf_extrabytes[byte >> 3]; | |
280 | ||
281 | if (utf8len < extrabytes) | |
282 | goto invalid; | |
283 | utf8len -= extrabytes; | |
284 | ||
285 | switch (extrabytes) { | |
286 | case 1: ch = byte; /* 1st byte */ | |
287 | ch <<= 6; | |
288 | ch += *utf8p++; /* 2nd byte */ | |
289 | ch -= 0x00003080UL; | |
290 | if (ch < 0x0080) | |
291 | goto invalid; | |
292 | ucs_ch = ch; | |
293 | break; | |
294 | ||
295 | case 2: ch = byte; /* 1st byte */ | |
296 | ch <<= 6; | |
297 | ch += *utf8p++; /* 2nd byte */ | |
298 | ch <<= 6; | |
299 | ch += *utf8p++; /* 3rd byte */ | |
300 | ch -= 0x000E2080UL; | |
301 | if (ch < 0x0800) | |
302 | goto invalid; | |
303 | ucs_ch = ch; | |
304 | break; | |
305 | ||
306 | case 3: ch = byte; /* 1st byte */ | |
307 | ch <<= 6; | |
308 | ch += *utf8p++; /* 2nd byte */ | |
309 | ch <<= 6; | |
310 | ch += *utf8p++; /* 3rd byte */ | |
311 | ch <<= 6; | |
312 | ch += *utf8p++; /* 4th byte */ | |
313 | ch -= 0x03C82080UL + SP_HALF_BASE; | |
314 | ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST; | |
315 | *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch; | |
316 | if (ucsp >= bufend) | |
317 | goto toolong; | |
318 | ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST; | |
319 | *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch; | |
320 | continue; | |
1c79356b | 321 | |
1c79356b | 322 | default: |
765c9de3 | 323 | goto invalid; |
1c79356b | 324 | } |
1c79356b | 325 | if (decompose) { |
0b4e3aa0 | 326 | u_int16_t comb_ch[2]; |
1c79356b | 327 | |
0b4e3aa0 | 328 | ucs_ch = ucs_decompose(ucs_ch, &comb_ch[0]); |
1c79356b | 329 | |
0b4e3aa0 A |
330 | if (comb_ch[0]) { |
331 | *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch; | |
765c9de3 A |
332 | if (ucsp >= bufend) |
333 | goto toolong; | |
0b4e3aa0 A |
334 | ucs_ch = comb_ch[0]; |
335 | if (comb_ch[1]) { | |
336 | *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch; | |
765c9de3 A |
337 | if (ucsp >= bufend) |
338 | goto toolong; | |
0b4e3aa0 A |
339 | ucs_ch = comb_ch[1]; |
340 | } | |
341 | } | |
342 | } else if (precompose && (ucsp != bufstart)) { | |
343 | u_int16_t composite, base; | |
344 | ||
345 | base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1); | |
346 | composite = ucs_combine(base, ucs_ch); | |
347 | if (composite) { | |
348 | --ucsp; | |
349 | ucs_ch = composite; | |
1c79356b A |
350 | } |
351 | } | |
0b4e3aa0 A |
352 | if (ucs_ch == UCS_ALT_NULL) |
353 | ucs_ch = '\0'; | |
1c79356b | 354 | } |
1c79356b A |
355 | if (ucs_ch == altslash) |
356 | ucs_ch = '/'; | |
1c79356b | 357 | |
765c9de3 | 358 | *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch; |
1c79356b | 359 | } |
765c9de3 A |
360 | |
361 | exit: | |
1c79356b A |
362 | *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart; |
363 | ||
364 | return (result); | |
765c9de3 A |
365 | |
366 | invalid: | |
367 | result = EINVAL; | |
368 | goto exit; | |
369 | ||
370 | toolong: | |
371 | result = ENAMETOOLONG; | |
372 | goto exit; | |
1c79356b A |
373 | } |
374 | ||
375 | ||
/*
 * Lookup tables for Unicode chars 0x00C0 thru 0x01DF
 * (index = char - 0x00C0).
 *
 * primary_char yields the first decomposed char, or 0x00 when
 * the char has no decomposition.  The combining char is taken
 * from the combining_char table at the same index, plus 0x0300.
 * When the primary char is itself decomposable (an entry of
 * 0x00C0 or above) it is looked up once more, which yields a
 * second combining char.
 */

static const unsigned char primary_char[8*36] = {
	/* 0x00C0 */ 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x00, 0x43,
	/* 0x00C8 */ 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
	/* 0x00D0 */ 0x00, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0x00,
	/* 0x00D8 */ 0x00, 0x55, 0x55, 0x55, 0x55, 0x59, 0x00, 0x00,
	/* 0x00E0 */ 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x00, 0x63,
	/* 0x00E8 */ 0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,
	/* 0x00F0 */ 0x00, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0x00,
	/* 0x00F8 */ 0x00, 0x75, 0x75, 0x75, 0x75, 0x79, 0x00, 0x79,

	/* 0x0100 */ 0x41, 0x61, 0x41, 0x61, 0x41, 0x61, 0x43, 0x63,
	/* 0x0108 */ 0x43, 0x63, 0x43, 0x63, 0x43, 0x63, 0x44, 0x64,
	/* 0x0110 */ 0x00, 0x00, 0x45, 0x65, 0x45, 0x65, 0x45, 0x65,
	/* 0x0118 */ 0x45, 0x65, 0x45, 0x65, 0x47, 0x67, 0x47, 0x67,
	/* 0x0120 */ 0x47, 0x67, 0x47, 0x67, 0x48, 0x68, 0x00, 0x00,
	/* 0x0128 */ 0x49, 0x69, 0x49, 0x69, 0x49, 0x69, 0x49, 0x69,
	/* 0x0130 */ 0x49, 0x00, 0x00, 0x00, 0x4A, 0x6A, 0x4B, 0x6B,
	/* 0x0138 */ 0x00, 0x4C, 0x6C, 0x4C, 0x6C, 0x4C, 0x6C, 0x00,

	/* 0x0140 */ 0x00, 0x00, 0x00, 0x4E, 0x6E, 0x4E, 0x6E, 0x4E,
	/* 0x0148 */ 0x6E, 0x00, 0x00, 0x00, 0x4F, 0x6F, 0x4F, 0x6F,
	/* 0x0150 */ 0x4F, 0x6F, 0x00, 0x00, 0x52, 0x72, 0x52, 0x72,
	/* 0x0158 */ 0x52, 0x72, 0x53, 0x73, 0x53, 0x73, 0x53, 0x73,
	/* 0x0160 */ 0x53, 0x73, 0x54, 0x74, 0x54, 0x74, 0x00, 0x00,
	/* 0x0168 */ 0x55, 0x75, 0x55, 0x75, 0x55, 0x75, 0x55, 0x75,
	/* 0x0170 */ 0x55, 0x75, 0x55, 0x75, 0x57, 0x77, 0x59, 0x79,
	/* 0x0178 */ 0x59, 0x5A, 0x7A, 0x5A, 0x7A, 0x5A, 0x7A, 0x00,

	/* 0x0180 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0188 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0190 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0198 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01A0 */ 0x4F, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01A8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55,
	/* 0x01B0 */ 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01B8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,

	/* 0x01C0 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01C8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x61, 0x49,
	/* 0x01D0 */ 0x69, 0x4F, 0x6F, 0x55, 0x75, 0xDC, 0xFC, 0xDC,
	/* 0x01D8 */ 0xFC, 0xDC, 0xFC, 0xDC, 0xFC, 0x00, 0xC4, 0xE4
};
457 | ||
/*
 * Combining char (minus 0x0300) for Unicode chars 0x00C0 thru 0x01DF
 * (index = char - 0x00C0).  An entry is only meaningful when the
 * matching primary_char entry is non-zero.
 */
static const unsigned char combining_char[8*36] = {
	/* 0x00C0 */ 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
	/* 0x00C8 */ 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
	/* 0x00D0 */ 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
	/* 0x00D8 */ 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,
	/* 0x00E0 */ 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
	/* 0x00E8 */ 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
	/* 0x00F0 */ 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
	/* 0x00F8 */ 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08,

	/* 0x0100 */ 0x04, 0x04, 0x06, 0x06, 0x28, 0x28, 0x01, 0x01,
	/* 0x0108 */ 0x02, 0x02, 0x07, 0x07, 0x0C, 0x0C, 0x0C, 0x0C,
	/* 0x0110 */ 0x00, 0x00, 0x04, 0x04, 0x06, 0x06, 0x07, 0x07,
	/* 0x0118 */ 0x28, 0x28, 0x0C, 0x0C, 0x02, 0x02, 0x06, 0x06,
	/* 0x0120 */ 0x07, 0x07, 0x27, 0x27, 0x02, 0x02, 0x00, 0x00,
	/* 0x0128 */ 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x28, 0x28,
	/* 0x0130 */ 0x07, 0x00, 0x00, 0x00, 0x02, 0x02, 0x27, 0x27,
	/* 0x0138 */ 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C, 0x0C, 0x00,

	/* 0x0140 */ 0x00, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C,
	/* 0x0148 */ 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04, 0x06, 0x06,
	/* 0x0150 */ 0x0B, 0x0B, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27,
	/* 0x0158 */ 0x0C, 0x0C, 0x01, 0x01, 0x02, 0x02, 0x27, 0x27,
	/* 0x0160 */ 0x0C, 0x0C, 0x27, 0x27, 0x0C, 0x0C, 0x00, 0x00,
	/* 0x0168 */ 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x0A, 0x0A,
	/* 0x0170 */ 0x0B, 0x0B, 0x28, 0x28, 0x02, 0x02, 0x02, 0x02,
	/* 0x0178 */ 0x08, 0x01, 0x01, 0x07, 0x07, 0x0C, 0x0C, 0x00,

	/* 0x0180 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0188 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0190 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0198 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01A0 */ 0x1B, 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01A8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B,
	/* 0x01B0 */ 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01B8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,

	/* 0x01C0 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01C8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x0C,
	/* 0x01D0 */ 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x04, 0x04, 0x01,
	/* 0x01D8 */ 0x01, 0x0C, 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04
};
531 | ||
532 | ||
0b4e3aa0 A |
/*
 * Decomposition bitmaps: one bit per code point, most significant
 * bit first, 32 code points per table entry.  A set bit marks a
 * char that ucs_decompose can take apart.
 */

/* CYRILLIC codepoints 0x0400 ~ 0x04FF */
static const unsigned long __CyrillicDecompBitmap[] = {
	0x510A0040, 0x00000040, 0x0000510A, 0x00000000,	/* 0x0400 */
	0x00000000, 0x00000000, 0x00000000, 0x00000000,	/* 0x0480 */
};

/* CJK codepoints 0x3000 ~ 0x30FF */
static const unsigned long __CJKDecompBitmap[] = {
	0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,	/* 0x3000 */
	0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,	/* 0x3080 */
};

/*
 * Test the bit for unicodeVal, an offset from the start of the
 * table's range.  The mask is built from an unsigned long so that
 * selecting bit 31 does not overflow a signed int.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	(table[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
546 | ||
547 | /* | |
765c9de3 | 548 | * ucs_decompose - decompose a composed Unicode char |
1c79356b A |
549 | * |
550 | * Composed Unicode characters are forbidden on | |
551 | * HFS Plus volumes. ucs_decompose will convert a | |
552 | * composed character into its correct decomposed | |
553 | * sequence. | |
554 | * | |
0b4e3aa0 | 555 | * Currently only Tier-1 and Tier-2 languages |
1c79356b A |
556 | * are handled. Other composed characters are |
557 | * passed unchanged. | |
558 | */ | |
559 | static u_int16_t | |
560 | ucs_decompose(register u_int16_t ch, u_int16_t *cmb) | |
561 | { | |
562 | u_int16_t base; | |
563 | ||
0b4e3aa0 A |
564 | cmb[0] = 0; |
565 | cmb[1] = 0; | |
1c79356b | 566 | |
0b4e3aa0 A |
567 | if (ch < 0x00C0) { |
568 | base = ch; | |
569 | } else if (ch <= 0x01DF) { | |
1c79356b | 570 | |
0b4e3aa0 A |
571 | base = (u_int16_t) primary_char[ch - 0x00C0]; |
572 | ||
573 | if (base == 0) | |
574 | base = ch; | |
575 | else { | |
576 | if ((base < 0x00C0) || (primary_char[base - 0x00C0] == 0)) | |
577 | cmb[0] = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch - 0x00C0]; | |
578 | else { | |
579 | u_int16_t tch = base; | |
580 | ||
581 | base = (u_int16_t)primary_char[tch - 0x00C0]; | |
582 | cmb[0] = (u_int16_t)0x0300 + (u_int16_t)combining_char[tch - 0x00C0]; | |
583 | cmb[1] = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch - 0x00C0]; | |
584 | } | |
585 | } | |
586 | } else if ((ch >= 0x0400) && (ch <= 0x04FF) && | |
587 | IS_DECOMPOSABLE(__CyrillicDecompBitmap, ch - 0x0400)) { | |
588 | ||
589 | /* Handle CYRILLIC LETTERs */ | |
590 | switch(ch) { | |
591 | case 0x0401: base = 0x0415; cmb[0] = 0x0308; break; /* */ | |
765c9de3 A |
592 | case 0x0403: base = 0x0413; cmb[0] = 0x0301; break; /* */ |
593 | case 0x0407: base = 0x0406; cmb[0] = 0x0308; break; /* */ | |
594 | case 0x040C: base = 0x041A; cmb[0] = 0x0301; break; /* */ | |
595 | case 0x040E: base = 0x0423; cmb[0] = 0x0306; break; /* */ | |
0b4e3aa0 A |
596 | case 0x0419: base = 0x0418; cmb[0] = 0x0306; break; /* */ |
597 | case 0x0439: base = 0x0438; cmb[0] = 0x0306; break; /* */ | |
598 | case 0x0451: base = 0x0435; cmb[0] = 0x0308; break; /* */ | |
765c9de3 A |
599 | case 0x0453: base = 0x0433; cmb[0] = 0x0301; break; /* */ |
600 | case 0x0457: base = 0x0456; cmb[0] = 0x0308; break; /* */ | |
601 | case 0x045C: base = 0x043A; cmb[0] = 0x0301; break; /* */ | |
602 | case 0x045E: base = 0x0443; cmb[0] = 0x0306; break; /* */ | |
0b4e3aa0 A |
603 | |
604 | default: | |
605 | /* Should not be hit from bit map table */ | |
606 | base = ch; | |
1c79356b | 607 | } |
0b4e3aa0 A |
608 | } else if (ch == 0x1E3F) { |
609 | base = 0x006D; cmb[0] = 0x0301; /* LATIN SMALL LETTER M WITH ACUTE */ | |
1c79356b A |
610 | } else if ((ch > 0x3000) && (ch < 0x3100) && |
611 | IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000)) { | |
0b4e3aa0 | 612 | |
1c79356b A |
613 | /* Handle HIRAGANA LETTERs */ |
614 | switch(ch) { | |
0b4e3aa0 A |
615 | case 0x3071: base = 0x306F; cmb[0] = 0x309A; break; /* PA */ |
616 | case 0x3074: base = 0x3072; cmb[0] = 0x309A; break; /* PI */ | |
617 | case 0x3077: base = 0x3075; cmb[0] = 0x309A; break; /* PU */ | |
618 | case 0x307A: base = 0x3078; cmb[0] = 0x309A; break; /* PE */ | |
619 | ||
620 | case 0x307D: base = 0x307B; cmb[0] = 0x309A; break; /* PO */ | |
621 | case 0x3094: base = 0x3046; cmb[0] = 0x3099; break; /* VU */ | |
622 | case 0x30D1: base = 0x30CF; cmb[0] = 0x309A; break; /* PA */ | |
623 | case 0x30D4: base = 0x30D2; cmb[0] = 0x309A; break; /* PI */ | |
624 | ||
625 | case 0x30D7: base = 0x30D5; cmb[0] = 0x309A; break; /* PU */ | |
626 | case 0x30DA: base = 0x30D8; cmb[0] = 0x309A; break; /* PE */ | |
627 | case 0x30DD: base = 0x30DB; cmb[0] = 0x309A; break; /* PO */ | |
628 | case 0x30F4: base = 0x30A6; cmb[0] = 0x3099; break; /* VU */ | |
629 | ||
630 | case 0x30F7: base = 0x30EF; cmb[0] = 0x3099; break; /* VA */ | |
631 | case 0x30F8: base = 0x30F0; cmb[0] = 0x3099; break; /* VI */ | |
632 | case 0x30F9: base = 0x30F1; cmb[0] = 0x3099; break; /* VE */ | |
633 | case 0x30FA: base = 0x30F2; cmb[0] = 0x3099; break; /* VO */ | |
1c79356b A |
634 | |
635 | default: | |
636 | /* the rest (41 of them) have a simple conversion */ | |
637 | base = ch - 1; | |
0b4e3aa0 | 638 | cmb[0] = 0x3099; |
1c79356b | 639 | } |
0b4e3aa0 A |
640 | } else if ((ch >= 0xAC00) && (ch < 0xD7A4)) { |
641 | /* Hangul */ | |
642 | ch -= 0xAC00; | |
643 | base = 0x1100 + (ch / (21*28)); | |
644 | cmb[0] = 0x1161 + (ch % (21*28)) / 28; | |
645 | ||
646 | if (ch % 28) | |
647 | cmb[1] = 0x11A7 + (ch % 28); | |
1c79356b A |
648 | } else { |
649 | base = ch; | |
650 | } | |
651 | ||
652 | return (base); | |
653 | } | |
654 | ||
0b4e3aa0 A |
655 | |
/*
 * Starting offset into composite_tbl for each combining diacritic
 * 0x0300 - 0x032F (index = comb - 0x0300).  Each diacritic owns a
 * run of 58 entries; -1 means the diacritic has no run.
 */
static const short diacrit_tbl[8*6] = {
	/* 0x300 */   0,  58, 116, 174, 232,  -1, 290, 348,
	/* 0x308 */ 406,  -1, 464, 522, 580,  -1,  -1,  -1,
	/* 0x310 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
	/* 0x318 */  -1,  -1,  -1, 638,  -1,  -1,  -1,  -1,
	/* 0x320 */  -1,  -1,  -1,  -1,  -1,  -1,  -1, 696,
	/* 0x328 */ 754,  -1,  -1,  -1,  -1,  -1,  -1,  -1
};
664 | ||
/*
 * Precomposed char for each (diacritic, base letter) pair, or 0 when
 * the pair has no precomposed form.  Every diacritic owns 58 entries
 * covering the base chars 'A' (0x41) thru 'z' (0x7A), laid out as:
 *
 *	A  B  C  D  E  F  G  H  I  J  K  L  M
 *	N  O  P  Q  R  S  T  U  V  W  X  Y  Z
 *	[  \  ]  ^  _  `
 *	a  b  c  d  e  f  g  h  i  j  k  l  m
 *	n  o  p  q  r  s  t  u  v  w  x  y  z
 */
static const u_int16_t composite_tbl[58*14] = {
	/* ---- 0x300 - grave accent ---- */
	0x00C0,      0,      0,      0, 0x00C8,      0,      0,      0, 0x00CC,      0,      0,      0,      0,
	     0, 0x00D2,      0,      0,      0,      0,      0, 0x00D9,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	0x00E0,      0,      0,      0, 0x00E8,      0,      0,      0, 0x00EC,      0,      0,      0,      0,
	     0, 0x00F2,      0,      0,      0,      0,      0, 0x00F9,      0,      0,      0,      0,      0,

	/* ---- 0x301 - acute accent ---- */
	0x00C1,      0, 0x0106,      0, 0x00C9,      0,      0,      0, 0x00CD,      0,      0, 0x0139,      0,
	0x0143, 0x00D3,      0,      0, 0x0154, 0x015A,      0, 0x00DA,      0,      0,      0, 0x00DD, 0x0179,
	     0,      0,      0,      0,      0,      0,
	0x00E1,      0, 0x0107,      0, 0x00E9,      0,      0,      0, 0x00ED,      0,      0, 0x013A, 0x1E3F,
	0x0144, 0x00F3,      0,      0, 0x0155, 0x015B,      0, 0x00FA,      0,      0,      0, 0x00FD, 0x017A,

	/* ---- 0x302 - circumflex accent ---- */
	0x00C2,      0, 0x0108,      0, 0x00CA,      0, 0x011C, 0x0124, 0x00CE, 0x0134,      0,      0,      0,
	     0, 0x00D4,      0,      0,      0, 0x015C,      0, 0x00DB,      0, 0x0174,      0, 0x0176,      0,
	     0,      0,      0,      0,      0,      0,
	0x00E2,      0, 0x0109,      0, 0x00EA,      0, 0x011D, 0x0125, 0x00EE, 0x0135,      0,      0,      0,
	     0, 0x00F4,      0,      0,      0, 0x015D,      0, 0x00FB,      0, 0x0175,      0, 0x0177,      0,

	/* ---- 0x303 - tilde ---- */
	0x00C3,      0,      0,      0,      0,      0,      0,      0, 0x0128,      0,      0,      0,      0,
	0x00D1, 0x00D5,      0,      0,      0,      0,      0, 0x0168,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	0x00E3,      0,      0,      0,      0,      0,      0,      0, 0x0129,      0,      0,      0,      0,
	0x00F1, 0x00F5,      0,      0,      0,      0,      0, 0x0169,      0,      0,      0,      0,      0,

	/* ---- 0x304 - macron ---- */
	0x0100,      0,      0,      0, 0x0112,      0,      0,      0, 0x012A,      0,      0,      0,      0,
	     0, 0x014C,      0,      0,      0,      0,      0, 0x016A,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	0x0101,      0,      0,      0, 0x0113,      0,      0,      0, 0x012B,      0,      0,      0,      0,
	     0, 0x014D,      0,      0,      0,      0,      0, 0x016B,      0,      0,      0,      0,      0,

	/* ---- 0x306 - breve ---- */
	0x0102,      0,      0,      0, 0x0114,      0, 0x011E,      0, 0x012C,      0,      0,      0,      0,
	     0, 0x014E,      0,      0,      0,      0,      0, 0x016C,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	0x0103,      0,      0,      0, 0x0115,      0, 0x011F,      0, 0x012D,      0,      0,      0,      0,
	     0, 0x014F,      0,      0,      0,      0,      0, 0x016D,      0,      0,      0,      0,      0,

	/* ---- 0x307 - dot above ---- */
	     0,      0, 0x010A,      0, 0x0116,      0, 0x0120,      0, 0x0130,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0, 0x017B,
	     0,      0,      0,      0,      0,      0,
	     0,      0, 0x010B,      0, 0x0117,      0, 0x0121,      0,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0, 0x017C,

	/* ---- 0x308 - diaeresis ---- */
	0x00C4,      0,      0,      0, 0x00CB,      0,      0,      0, 0x00CF,      0,      0,      0,      0,
	     0, 0x00D6,      0,      0,      0,      0,      0, 0x00DC,      0,      0,      0, 0x0178,      0,
	     0,      0,      0,      0,      0,      0,
	0x00E4,      0,      0,      0, 0x00EB,      0,      0,      0, 0x00EF,      0,      0,      0,      0,
	     0, 0x00F6,      0,      0,      0,      0,      0, 0x00FC,      0,      0,      0, 0x00FF,      0,

	/* ---- 0x30A - ring above ---- */
	0x00C5,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,      0, 0x016E,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	0x00E5,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,      0, 0x016F,      0,      0,      0,      0,      0,

	/* ---- 0x30B - double acute accent ---- */
	     0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,
	     0, 0x0150,      0,      0,      0,      0,      0, 0x0170,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,
	     0, 0x0151,      0,      0,      0,      0,      0, 0x0171,      0,      0,      0,      0,      0,

	/* ---- 0x30C - caron ---- */
	0x01CD,      0, 0x010C, 0x010E, 0x011A,      0,      0,      0, 0x01CF,      0,      0, 0x013D,      0,
	0x0147, 0x01D1,      0,      0, 0x0158, 0x0160, 0x0164, 0x01D3,      0,      0,      0,      0, 0x017D,
	     0,      0,      0,      0,      0,      0,
	0x01CE,      0, 0x010D, 0x010F, 0x011B,      0,      0,      0, 0x01D0,      0,      0, 0x013E,      0,
	0x0148, 0x01D2,      0,      0, 0x0159, 0x0161, 0x0165, 0x01D4,      0,      0,      0,      0, 0x017E,

	/* ---- 0x31B - horn ---- */
	     0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,
	     0, 0x01A0,      0,      0,      0,      0,      0, 0x01AF,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0,
	     0, 0x01A1,      0,      0,      0,      0,      0, 0x01B0,      0,      0,      0,      0,      0,

	/* ---- 0x327 - cedilla ---- */
	     0,      0, 0x00C7,      0,      0,      0, 0x0122,      0,      0,      0, 0x0136, 0x013B,      0,
	0x0145,      0,      0,      0, 0x0156, 0x015E, 0x0162,      0,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	     0,      0, 0x00E7,      0,      0,      0, 0x0123,      0,      0,      0, 0x0137, 0x013C,      0,
	0x0146,      0,      0,      0, 0x0157, 0x015F, 0x0163,      0,      0,      0,      0,      0,      0,

	/* ---- 0x328 - ogonek ---- */
	0x0104,      0,      0,      0, 0x0118,      0,      0,      0, 0x012E,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,      0, 0x0172,      0,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,
	0x0105,      0,      0,      0, 0x0119,      0,      0,      0, 0x012F,      0,      0,      0,      0,
	     0,      0,      0,      0,      0,      0,      0, 0x0173,      0,      0,      0,      0,      0,
};
787 | ||
788 | ||
/*
 * CJK codepoints 0x3000 ~ 0x30FF
 *
 * One bit per code point, most significant bit first, 32 code
 * points per table entry.  A set bit marks a base char that
 * ucs_combine may join with a following 0x3099 / 0x309A mark.
 */
static const unsigned long __CJKCombBitmap[] = {
	0x00000000, 0x00000000, 0x02155555, 0x4A812490,	/* 0x3000 */
	0x00000004, 0x02155555, 0x4A812490, 0x0001E004,	/* 0x3080 */
};

/*
 * Test the bit for unicodeVal, an offset from the start of the
 * table's range.  The mask is built from an unsigned long so that
 * selecting bit 31 does not overflow a signed int.
 */
#define CAN_COMBINE(table,unicodeVal) \
	(table[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
796 | ||
797 | ||
798 | /* | |
765c9de3 | 799 | * ucs_combine - generate a precomposed Unicode char |
0b4e3aa0 A |
800 | * |
801 | * Precomposed Unicode characters are required for some volume | |
802 | * formats and network protocols. ucs_combine will combine a | |
803 | * decomposed character sequence into a single precomposed | |
804 | * (composite) character. | |
805 | * | |
806 | * Currently only decomcomposed sequences from Apple's Tier 1 | |
807 | * and Tier 2 languages are handled. | |
808 | * | |
809 | * INPUT: | |
810 | * base - base character | |
811 | * comb - combining character | |
812 | * OUTPUT: | |
813 | * result - precomposed char or zero if not combinable | |
814 | */ | |
815 | static u_int16_t | |
816 | ucs_combine(u_int16_t base, u_int16_t comb) | |
817 | { | |
818 | /* Get out early if we can */ | |
819 | if (comb < 0x0300) | |
820 | return (0); | |
821 | ||
822 | /* Try ordinary diacritics (0x300 - 0x32F) */ | |
823 | if (comb <= 0x032F) { | |
824 | int index; | |
825 | ||
826 | if (base >= 'A' && base <= 'z') { | |
827 | index = diacrit_tbl[comb - 0x0300]; | |
828 | if (index < 0 ) return (0); | |
829 | ||
830 | return (composite_tbl[index + (base - 'A')]); | |
831 | } | |
832 | ||
833 | /* Handle Cyrillic and some 3 char latin sequences */ | |
834 | switch (comb) { | |
835 | case 0x0300: | |
836 | switch (base) { | |
837 | case 0x00DC: return (0x01DB); | |
838 | case 0x00FC: return (0x01DC); | |
839 | } break; | |
840 | case 0x0301: | |
841 | switch (base) { | |
842 | case 0x00DC: return (0x01D7); | |
843 | case 0x00FC: return (0x01D8); | |
765c9de3 A |
844 | case 0x0413: return (0x0403); |
845 | case 0x041A: return (0x040C); | |
846 | case 0x0433: return (0x0453); | |
847 | case 0x043A: return (0x045C); | |
0b4e3aa0 A |
848 | } break; |
849 | case 0x0304: | |
850 | switch (base) { | |
851 | case 0x00DC: return (0x01D5); | |
852 | case 0x00FC: return (0x01D6); | |
853 | case 0x00C4: return (0x01DE); | |
854 | case 0x00E4: return (0x01DF); | |
855 | } break; | |
856 | case 0x0306: | |
857 | switch (base) { | |
858 | case 0x0418: return (0x0419); | |
765c9de3 | 859 | case 0x0423: return (0x040E); |
0b4e3aa0 | 860 | case 0x0438: return (0x0439); |
765c9de3 | 861 | case 0x0443: return (0x045E); |
0b4e3aa0 A |
862 | } break; |
863 | case 0x0308: | |
864 | switch (base) { | |
765c9de3 | 865 | case 0x0406: return (0x0407); |
0b4e3aa0 A |
866 | case 0x0415: return (0x0401); |
867 | case 0x0435: return (0x0451); | |
765c9de3 | 868 | case 0x0456: return (0x0457); |
0b4e3aa0 A |
869 | } break; |
870 | case 0x030C: | |
871 | switch (base) { | |
872 | case 0x00DC: return (0x01D9); | |
873 | case 0x00FC: return (0x01DA); | |
874 | } break; | |
875 | } | |
876 | return (0); | |
877 | } | |
878 | ||
879 | /* Now try HANGUL */ | |
880 | if (comb < 0x1161) | |
881 | return (0); | |
882 | ||
883 | /* 2 char Hangul sequences */ | |
884 | if ((comb <= 0x1175) && (base >= 0x1100 && base <= 0x1112)) | |
885 | return (0xAC00 + ((base - 0x1100)*(21*28)) + ((comb - 0x1161)*28)); | |
886 | ||
887 | /* 3 char Hangul sequences */ | |
888 | if ((comb >= 0x11A8 && comb <= 0x11C2) && | |
889 | (base >= 0xAC00 && base <= 0xD788)) { | |
890 | if ((base - 0xAC00) % 28) | |
891 | return (0); | |
892 | else | |
893 | return (base + (comb - 0x11A7)); | |
894 | } | |
895 | ||
896 | /* Now try HIRAGANA and KATAKANA */ | |
897 | if ((comb == 0x3099 || comb == 0x309A) && | |
898 | (base > 0x3000 && base < 0x3100) && | |
899 | CAN_COMBINE(__CJKCombBitmap, base - 0x3000)) { | |
900 | if (comb == 0x309A) { | |
901 | switch(base) { | |
902 | case 0x306F: return (0x3071); /* PA */ | |
903 | case 0x3072: return (0x3074); /* PI */ | |
904 | case 0x3075: return (0x3077); /* PU */ | |
905 | case 0x3078: return (0x307A); /* PE */ | |
906 | case 0x307B: return (0x307D); /* PO */ | |
907 | case 0x30CF: return (0x30D1); /* PA */ | |
908 | case 0x30D2: return (0x30D4); /* PI */ | |
909 | case 0x30D5: return (0x30D7); /* PU */ | |
910 | case 0x30D8: return (0x30DA); /* PE */ | |
911 | case 0x30DB: return (0x30DD); /* PO */ | |
912 | default: return (0); | |
913 | } | |
914 | } else /* 0x3099 */ { | |
915 | switch (base) { | |
916 | case 0x3046: return (0x3094); /* VU */ | |
917 | case 0x30A6: return (0x30F4); /* VU */ | |
918 | case 0x30EF: return (0x30F7); /* VA */ | |
919 | case 0x30F0: return (0x30F8); /* VI */ | |
920 | case 0x30F1: return (0x30F9); /* VE */ | |
921 | case 0x30F2: return (0x30FA); /* VO */ | |
922 | default: return (base + 1); /* 41 code points here */ | |
923 | } | |
924 | } | |
925 | } | |
926 | ||
927 | return (0); | |
928 | } | |
929 |