]>
Commit | Line | Data |
---|---|---|
1c79356b A |
1 | /* |
2 | * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * The contents of this file constitute Original Code as defined in and | |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
11 | * | |
12 | * This Original Code and all software distributed under the License are | |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the | |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
19 | * | |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
22 | ||
23 | #include <sys/param.h> | |
24 | #include <sys/utfconv.h> | |
25 | #include <sys/errno.h> | |
26 | #include <architecture/byte_order.h> | |
27 | ||
28 | ||
29 | /* | |
30 | * UTF-8 (UCS Transformation Format) | |
31 | * | |
32 | * The following subset of UTF-8 is used to encode UCS-2 filenames. It | |
33 | * requires a maximum of 3 bytes per UCS-2 character. Only the | |
34 | * shortest encoding required to represent the significant UCS-2 bits | |
35 | * is legal. | |
36 | * | |
37 | * UTF-8 Multibyte Codes | |
38 | * | |
39 | * Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary) | |
40 | * ------------------------------------------------------------------- | |
41 | * 1 7 0x0000 0x007F 0xxxxxxx | |
42 | * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx | |
43 | * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx | |
44 | * ------------------------------------------------------------------- | |
45 | */ | |
46 | ||
47 | ||
/*
 * UCS_TO_UTF_LEN - number of UTF-8 bytes needed to encode UCS-2 value c
 *
 * Yields 1 for c < 0x0080, 2 for c < 0x0800 and 3 otherwise.
 * NOTE: c may be evaluated twice; do not pass an expression
 * with side effects.
 */
#define UCS_TO_UTF_LEN(c)	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : 3))


/*
 * Forward declaration. Takes a UCS-2 character and a pointer that
 * receives the combining character (0 if there is none); returns
 * the base character.
 */
static u_int16_t ucs_decompose(u_int16_t ch, u_int16_t *cmb);
52 | ||
53 | ||
54 | /* | |
55 | * utf8_encodelen - Calculates the UTF-8 encoding length for a UCS-2 filename | |
56 | * | |
57 | * NOTES: | |
58 | * If '/' chars are allowed on disk then an alternate | |
59 | * (replacement) char must be provided in altslash. | |
60 | * | |
61 | * input flags: | |
62 | * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime | |
63 | */ | |
64 | size_t | |
65 | utf8_encodelen(ucsp, ucslen, altslash, flags) | |
66 | const u_int16_t * ucsp; | |
67 | size_t ucslen; | |
68 | u_int16_t altslash; | |
69 | int flags; | |
70 | { | |
71 | u_int16_t ucs_ch; | |
72 | int charcnt; | |
73 | int swapbytes = (flags & UTF_REVERSE_ENDIAN); | |
74 | size_t len; | |
75 | ||
76 | charcnt = ucslen / 2; | |
77 | len = 0; | |
78 | ||
79 | while (charcnt-- > 0) { | |
80 | ucs_ch = *ucsp++; | |
81 | ||
82 | if (swapbytes) | |
83 | ucs_ch = NXSwapShort(ucs_ch); | |
84 | if (altslash && ucs_ch == '/') | |
85 | ucs_ch = altslash; | |
86 | if (ucs_ch == '\0') | |
87 | ucs_ch = 0xc080; | |
88 | ||
89 | len += UCS_TO_UTF_LEN(ucs_ch); | |
90 | } | |
91 | ||
92 | return (len); | |
93 | } | |
94 | ||
95 | ||
96 | /* | |
97 | * utf8_encodestr - Encodes a UCS-2 (Unicode) string to UTF-8 | |
98 | * | |
99 | * NOTES: | |
100 | * The resulting UTF-8 string is not null terminated. | |
101 | * | |
102 | * If '/' chars are allowed on disk then an alternate | |
103 | * (replacement) char must be provided in altslash. | |
104 | * | |
105 | * input flags: | |
106 | * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime | |
107 | * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output | |
108 | */ | |
109 | int utf8_encodestr(ucsp, ucslen, utf8p, utf8len, buflen, altslash, flags) | |
110 | const u_int16_t * ucsp; | |
111 | size_t ucslen; | |
112 | u_int8_t * utf8p; | |
113 | size_t * utf8len; | |
114 | size_t buflen; | |
115 | u_int16_t altslash; | |
116 | int flags; | |
117 | { | |
118 | u_int8_t * bufstart; | |
119 | u_int8_t * bufend; | |
120 | u_int16_t ucs_ch; | |
121 | int charcnt; | |
122 | int swapbytes = (flags & UTF_REVERSE_ENDIAN); | |
123 | int nullterm = ((flags & UTF_NO_NULL_TERM) == 0); | |
124 | int result = 0; | |
125 | ||
126 | bufstart = utf8p; | |
127 | bufend = bufstart + buflen; | |
128 | if (nullterm) | |
129 | --bufend; | |
130 | charcnt = ucslen / 2; | |
131 | ||
132 | while (charcnt-- > 0) { | |
133 | ucs_ch = *ucsp++; | |
134 | ||
135 | if (swapbytes) | |
136 | ucs_ch = NXSwapShort(ucs_ch); | |
137 | if (altslash && ucs_ch == '/') | |
138 | ucs_ch = altslash; | |
139 | ||
140 | if ((ucs_ch < 0x0080) && (ucs_ch != '\0')) { | |
141 | if (utf8p >= bufend) { | |
142 | result = ENAMETOOLONG; | |
143 | break; | |
144 | } | |
145 | *utf8p++ = ucs_ch; | |
146 | ||
147 | } else if (ucs_ch < 0x800) { | |
148 | if ((utf8p + 1) >= bufend) { | |
149 | result = ENAMETOOLONG; | |
150 | break; | |
151 | } | |
152 | /* NOTE: NULL maps to 0xC080 */ | |
153 | *utf8p++ = (ucs_ch >> 6) | 0xc0; | |
154 | *utf8p++ = (ucs_ch & 0x3f) | 0x80; | |
155 | ||
156 | } else { | |
157 | if ((utf8p + 2) >= bufend) { | |
158 | result = ENAMETOOLONG; | |
159 | break; | |
160 | } | |
161 | *utf8p++ = (ucs_ch >> 12) | 0xe0; | |
162 | *utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80; | |
163 | *utf8p++ = ((ucs_ch) & 0x3f) | 0x80; | |
164 | } | |
165 | } | |
166 | ||
167 | *utf8len = utf8p - bufstart; | |
168 | if (nullterm) | |
169 | *utf8p++ = '\0'; | |
170 | ||
171 | return (result); | |
172 | } | |
173 | ||
174 | ||
175 | /* | |
176 | * utf8_decodestr - Decodes a UTF-8 string back to UCS-2 (Unicode) | |
177 | * | |
178 | * NOTES: | |
179 | * The input UTF-8 string does not need to be null terminated | |
180 | * if utf8len is set. | |
181 | * | |
182 | * If '/' chars are allowed on disk then an alternate | |
183 | * (replacement) char must be provided in altslash. | |
184 | * | |
185 | * input flags: | |
186 | * UTF_REV_ENDIAN: UCS-2 byteorder is oposite current runtime | |
187 | * UTF_DECOMPOSED: UCS-2 output string must be fully decompsed | |
188 | */ | |
189 | int | |
190 | utf8_decodestr(utf8p, utf8len, ucsp, ucslen, buflen, altslash, flags) | |
191 | const u_int8_t* utf8p; | |
192 | size_t utf8len; | |
193 | u_int16_t* ucsp; | |
194 | size_t *ucslen; | |
195 | size_t buflen; | |
196 | u_int16_t altslash; | |
197 | int flags; | |
198 | { | |
199 | u_int16_t* bufstart; | |
200 | u_int16_t* bufend; | |
201 | u_int16_t ucs_ch; | |
202 | u_int8_t byte; | |
203 | int result = 0; | |
204 | int decompose, swapbytes; | |
205 | ||
206 | decompose = (flags & UTF_DECOMPOSED); | |
207 | swapbytes = (flags & UTF_REVERSE_ENDIAN); | |
208 | ||
209 | bufstart = ucsp; | |
210 | bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen); | |
211 | ||
212 | while (utf8len-- > 0 && (byte = *utf8p++) != '\0') { | |
213 | if (ucsp >= bufend) { | |
214 | result = ENAMETOOLONG; | |
215 | goto stop; | |
216 | } | |
217 | ||
218 | /* check for ascii */ | |
219 | if (byte < 0x80) { | |
220 | ucs_ch = byte; | |
221 | } else { | |
222 | switch (byte & 0xf0) { | |
223 | /* 2 byte sequence*/ | |
224 | case 0xc0: | |
225 | case 0xd0: | |
226 | /* extract bits 6 - 10 from first byte */ | |
227 | ucs_ch = (byte & 0x1F) << 6; | |
228 | if ((ucs_ch < 0x0080) && (*utf8p != 0x80)) { | |
229 | result = EINVAL; /* seq not minimal */ | |
230 | goto stop; | |
231 | } | |
232 | break; | |
233 | /* 3 byte sequence*/ | |
234 | case 0xe0: | |
235 | /* extract bits 12 - 15 from first byte */ | |
236 | ucs_ch = (byte & 0x0F) << 6; | |
237 | ||
238 | /* extract bits 6 - 11 from second byte */ | |
239 | if (((byte = *utf8p++) & 0xc0) != 0x80) { | |
240 | result = EINVAL; | |
241 | goto stop; | |
242 | } | |
243 | utf8len--; | |
244 | ||
245 | ucs_ch += (byte & 0x3F); | |
246 | ucs_ch <<= 6; | |
247 | ||
248 | if (ucs_ch < 0x0800) { | |
249 | result = EINVAL; /* seq not minimal */ | |
250 | goto stop; | |
251 | } | |
252 | break; | |
253 | default: | |
254 | result = EINVAL; | |
255 | goto stop; | |
256 | } | |
257 | ||
258 | /* extract bits 0 - 5 from final byte */ | |
259 | if (((byte = *utf8p++) & 0xc0) != 0x80) { | |
260 | result = EINVAL; | |
261 | goto stop; | |
262 | } | |
263 | utf8len--; | |
264 | ucs_ch += (byte & 0x3F); | |
265 | ||
266 | if (decompose) { | |
267 | u_int16_t comb_ch; | |
268 | ||
269 | ucs_ch = ucs_decompose(ucs_ch, &comb_ch); | |
270 | ||
271 | if (comb_ch) { | |
272 | if (swapbytes) | |
273 | *ucsp++ = NXSwapShort(ucs_ch); | |
274 | else | |
275 | *ucsp++ = ucs_ch; | |
276 | ||
277 | if (ucsp >= bufend) { | |
278 | result = ENAMETOOLONG; | |
279 | goto stop; | |
280 | } | |
281 | ||
282 | ucs_ch = comb_ch; | |
283 | } | |
284 | } | |
285 | } | |
286 | ||
287 | if (ucs_ch == altslash) | |
288 | ucs_ch = '/'; | |
289 | if (swapbytes) | |
290 | ucs_ch = NXSwapShort(ucs_ch); | |
291 | ||
292 | *ucsp++ = ucs_ch; | |
293 | } | |
294 | stop: | |
295 | *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart; | |
296 | ||
297 | return (result); | |
298 | } | |
299 | ||
300 | ||
/*
 * Lookup tables for Unicode chars 0x00C0 thru 0x00FF
 * (indexed by char - 0x00C0).
 *
 * primary_char yields the first decomposed char. If this
 * char is an alpha char (<= 'z') then get the combining char
 * from the combining_char table and add 0x0300 to it.
 * Entries whose primary_char is the character itself do not
 * decompose; their combining_char entry (0xFF) is never used.
 */

static const unsigned char primary_char[64] = {
	0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,

	0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,

	0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,

	0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,

	0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,

	0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,

	0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,

	0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,
};

static const unsigned char combining_char[64] = {
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,

	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,

	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,

	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,

	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,

	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,

	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,

	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08
};


/*
 * CJK codepoints 0x3000 ~ 0x30FF
 *
 * One bit per code point, 32 code points per word, most
 * significant bit first. A set bit marks a composed character.
 */
static const u_int32_t __CJKDecompBitmap[] = {
	0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,	/* 0x3000 */
	0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,	/* 0x3080 */
};

/*
 * Non-zero if bit unicodeVal (an offset into the table's range) is set.
 * The mask is built from an unsigned 32-bit one so that selecting the
 * top bit (shift count 31) is well defined.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & \
	 ((u_int32_t)1 << (31 - ((unicodeVal) % 32))))

/*
 * ucs_decompose - decompose a composed UCS-2 char
 *
 * Composed Unicode characters are forbidden on
 * HFS Plus volumes. ucs_decompose will convert a
 * composed character into its correct decomposed
 * sequence.
 *
 * Currently only MacRoman and MacJapanese chars
 * are handled. Other composed characters are
 * passed unchanged.
 *
 * ch    character to decompose
 * cmb   receives the combining character, or 0 when ch
 *       does not decompose
 *
 * Returns the base character (ch itself when it does not
 * decompose).
 */
static u_int16_t
ucs_decompose(u_int16_t ch, u_int16_t *cmb)
{
	u_int16_t base;

	*cmb = 0;

	if ((ch <= 0x00FF) && (ch >= 0x00C0)) {
		ch -= 0x00C0;	/* table index */

		base = (u_int16_t) primary_char[ch];

		/* only alpha bases carry a combining mark */
		if (base <= 'z') {
			*cmb = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch];
		}
	} else if ((ch > 0x3000) && (ch < 0x3100) &&
	           IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000)) {

		/*
		 * Handle HIRAGANA and KATAKANA LETTERs.
		 * 0x3099 is the combining voiced sound mark and
		 * 0x309A the combining semi-voiced sound mark.
		 */
		switch(ch) {
		case 0x3071: base = 0x306F; *cmb = 0x309A; break;	/* PA */
		case 0x3074: base = 0x3072; *cmb = 0x309A; break;	/* PI */
		case 0x3077: base = 0x3075; *cmb = 0x309A; break;	/* PU */
		case 0x307A: base = 0x3078; *cmb = 0x309A; break;	/* PE */

		case 0x307D: base = 0x307B; *cmb = 0x309A; break;	/* PO */
		case 0x3094: base = 0x3046; *cmb = 0x3099; break;	/* VU */
		case 0x30D1: base = 0x30CF; *cmb = 0x309A; break;	/* PA */
		case 0x30D4: base = 0x30D2; *cmb = 0x309A; break;	/* PI */

		case 0x30D7: base = 0x30D5; *cmb = 0x309A; break;	/* PU */
		case 0x30DA: base = 0x30D8; *cmb = 0x309A; break;	/* PE */
		case 0x30DD: base = 0x30DB; *cmb = 0x309A; break;	/* PO */
		case 0x30F4: base = 0x30A6; *cmb = 0x3099; break;	/* VU */

		case 0x30F7: base = 0x30EF; *cmb = 0x3099; break;	/* VA */
		case 0x30F8: base = 0x30F0; *cmb = 0x3099; break;	/* VI */
		case 0x30F9: base = 0x30F1; *cmb = 0x3099; break;	/* VE */
		case 0x30FA: base = 0x30F2; *cmb = 0x3099; break;	/* VO */

		default:
			/*
			 * every other flagged char is the voiced form
			 * of the code point immediately before it
			 */
			base = ch - 1;
			*cmb = 0x3099;
		}
	} else {
		base = ch;
	}

	return (base);
}
416 |