]> git.saurik.com Git - apple/xnu.git/blame - bsd/hfs/hfs_encodings.c
xnu-517.tar.gz
[apple/xnu.git] / bsd / hfs / hfs_encodings.c
CommitLineData
1c79356b 1/*
9bccf70c 2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
1c79356b
A
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
43866e37 6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
1c79356b 7 *
43866e37
A
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
1c79356b
A
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
43866e37
A
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
1c79356b
A
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26#include <sys/param.h>
27#include <sys/systm.h>
28#include <sys/kernel.h>
29#include <sys/lock.h>
30#include <sys/malloc.h>
31#include <sys/queue.h>
32#include <sys/utfconv.h>
33
34#include "hfs.h"
35
36
37/* hfs encoding converter list */
38SLIST_HEAD(encodinglst, hfs_encoding) hfs_encoding_list = {0};
39decl_simple_lock_data(,hfs_encoding_list_slock);
40
41
42/* hfs encoding converter entry */
43struct hfs_encoding {
44 SLIST_ENTRY(hfs_encoding) link;
45 int refcount;
46 int kmod_id;
47 UInt32 encoding;
48 hfs_to_unicode_func_t get_unicode_func;
49 unicode_to_hfs_func_t get_hfsname_func;
50};
51
52/* XXX We should use an "official" interface! */
53extern kern_return_t kmod_destroy(host_priv_t host_priv, kmod_t id);
54extern struct host realhost;
55
56#define MAX_HFS_UNICODE_CHARS (15*5)
57
55e303ae 58int mac_roman_to_unicode(const Str31 hfs_str, UniChar *uni_str, UInt32 maxCharLen, UInt32 *usedCharLen);
1c79356b
A
59
60static int unicode_to_mac_roman(UniChar *uni_str, UInt32 unicodeChars, Str31 hfs_str);
61
62
63void
64hfs_converterinit(void)
65{
66 SLIST_INIT(&hfs_encoding_list);
67 simple_lock_init(&hfs_encoding_list_slock);
68
69 /*
70 * add resident MacRoman converter and take a reference
71 * since its always "loaded".
72 */
73 hfs_addconverter(0, kTextEncodingMacRoman, mac_roman_to_unicode, unicode_to_mac_roman);
74 SLIST_FIRST(&hfs_encoding_list)->refcount++;
75}
76
77
78/*
79 * hfs_addconverter - add an HFS encoding converter
80 *
81 * This is called exclusivly by kernel loadable modules
82 * (like HFS_Japanese.kmod) to register hfs encoding
83 * conversion routines.
84 *
85 */
86int
87hfs_addconverter(int id, UInt32 encoding, hfs_to_unicode_func_t get_unicode, unicode_to_hfs_func_t get_hfsname)
88{
89 struct hfs_encoding *encp;
90
91 MALLOC(encp, struct hfs_encoding *, sizeof(struct hfs_encoding), M_TEMP, M_WAITOK);
92
93 simple_lock(&hfs_encoding_list_slock);
94
95 encp->link.sle_next = NULL;
96 encp->refcount = 0;
97 encp->encoding = encoding;
98 encp->get_unicode_func = get_unicode;
99 encp->get_hfsname_func = get_hfsname;
100 encp->kmod_id = id;
101 SLIST_INSERT_HEAD(&hfs_encoding_list, encp, link);
102
103 simple_unlock(&hfs_encoding_list_slock);
104 return (0);
105}
106
107
108/*
109 * hfs_remconverter - remove an HFS encoding converter
110 *
111 * Can be called by a kernel loadable module's finalize
112 * routine to remove an encoding converter so that the
113 * module (i.e. the code) can be unloaded.
114 *
115 * However, in the normal case, the removing and unloading
116 * of these converters is done in hfs_relconverter.
117 * The call is initiated from within the kernel during the unmounting of an hfs voulume.
118 */
119int
120hfs_remconverter(int id, UInt32 encoding)
121{
122 struct hfs_encoding *encp;
123 int busy = 0;
124
125 simple_lock(&hfs_encoding_list_slock);
126 SLIST_FOREACH(encp, &hfs_encoding_list, link) {
127 if (encp->encoding == encoding && encp->kmod_id == id) {
128 encp->refcount--;
129
130 /* if converter is no longer in use, release it */
131 if (encp->refcount <= 0 && encp->kmod_id != 0) {
132 SLIST_REMOVE(&hfs_encoding_list, encp, hfs_encoding, link);
133 FREE(encp, M_TEMP);
134 } else {
135 busy = 1;
136 }
137 break;
138 }
139 }
140 simple_unlock(&hfs_encoding_list_slock);
141
142 return (busy);
143}
144
145
146/*
147 * hfs_getconverter - get HFS encoding converters
148 *
149 * Normally called during the mounting of an hfs voulume.
150 */
151int
152hfs_getconverter(UInt32 encoding, hfs_to_unicode_func_t *get_unicode, unicode_to_hfs_func_t *get_hfsname)
153{
154 struct hfs_encoding *encp;
155 int found = 0;
156
157 simple_lock(&hfs_encoding_list_slock);
158 SLIST_FOREACH(encp, &hfs_encoding_list, link) {
159 if (encp->encoding == encoding) {
160 found = 1;
161 *get_unicode = encp->get_unicode_func;
162 *get_hfsname = encp->get_hfsname_func;
163 ++encp->refcount;
164 break;
165 }
166 }
167 simple_unlock(&hfs_encoding_list_slock);
168
169 if (!found) {
170 *get_unicode = NULL;
171 *get_hfsname = NULL;
172 return (EINVAL);
173 }
174
175 return (0);
176}
177
178
179/*
180 * hfs_relconverter - release interest in an HFS encoding converter
181 *
182 * Normally called during the unmounting of an hfs voulume.
183 */
184int
185hfs_relconverter(UInt32 encoding)
186{
187 struct hfs_encoding *encp;
188 int found = 0;
189
190 simple_lock(&hfs_encoding_list_slock);
191 SLIST_FOREACH(encp, &hfs_encoding_list, link) {
192 if (encp->encoding == encoding) {
193 found = 1;
194 encp->refcount--;
195
196 /* if converter is no longer in use, release it */
197 if (encp->refcount <= 0 && encp->kmod_id != 0) {
198 int id = encp->kmod_id;
199
200 SLIST_REMOVE(&hfs_encoding_list, encp, hfs_encoding, link);
201 FREE(encp, M_TEMP);
202 encp = NULL;
203
204 simple_unlock(&hfs_encoding_list_slock);
55e303ae 205 kmod_destroy((host_priv_t) host_priv_self(), id);
1c79356b
A
206 simple_lock(&hfs_encoding_list_slock);
207 }
208 break;
209 }
210 }
211 simple_unlock(&hfs_encoding_list_slock);
212
213 return (found ? 0 : EINVAL);
214}
215
216
217/*
218 * Convert HFS encoded string into UTF-8
219 *
220 * Unicode output is fully decomposed
221 * '/' chars are converted to ':'
222 */
223int
224hfs_to_utf8(ExtendedVCB *vcb, Str31 hfs_str, ByteCount maxDstLen, ByteCount *actualDstLen, unsigned char* dstStr)
225{
226 int error;
227 UniChar uniStr[MAX_HFS_UNICODE_CHARS];
228 ItemCount uniCount;
229 size_t utf8len;
230 hfs_to_unicode_func_t hfs_get_unicode = VCBTOHFS(vcb)->hfs_get_unicode;
231
232 error = hfs_get_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount);
233
234 if (uniCount == 0)
235 error = EINVAL;
236
237 if (error == 0) {
238 error = utf8_encodestr(uniStr, uniCount * sizeof(UniChar), dstStr, &utf8len, maxDstLen , ':', 0);
239 if (error == ENAMETOOLONG)
240 *actualDstLen = utf8_encodelen(uniStr, uniCount * sizeof(UniChar), ':', 0);
241 else
242 *actualDstLen = utf8len;
243 }
244
245 return error;
246}
247
248
249/*
250 * When an HFS name cannot be encoded with the current
251 * volume encoding then MacRoman is used as a fallback.
252 */
253int
254mac_roman_to_utf8(Str31 hfs_str, ByteCount maxDstLen, ByteCount *actualDstLen, unsigned char* dstStr)
255{
256 int error;
257 UniChar uniStr[MAX_HFS_UNICODE_CHARS];
258 ItemCount uniCount;
259 size_t utf8len;
260
261 error = mac_roman_to_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount);
262
263 if (uniCount == 0)
264 error = EINVAL;
265
266 if (error == 0) {
267 error = utf8_encodestr(uniStr, uniCount * sizeof(UniChar), dstStr, &utf8len, maxDstLen , ':', 0);
268 if (error == ENAMETOOLONG)
269 *actualDstLen = utf8_encodelen(uniStr, uniCount * sizeof(UniChar), ':', 0);
270 else
271 *actualDstLen = utf8len;
272 }
273
274 return error;
275}
276
277
9bccf70c
A
278/*
279 * Convert Unicode string into HFS encoding
280 *
281 * ':' chars are converted to '/'
282 * Assumes input represents fully decomposed Unicode
283 */
284int
285unicode_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, u_int16_t* srcStr, Str31 dstStr, int retry)
286{
287 int error;
288 unicode_to_hfs_func_t hfs_get_hfsname = VCBTOHFS(vcb)->hfs_get_hfsname;
289
290 error = hfs_get_hfsname(srcStr, srcLen/sizeof(UniChar), dstStr);
291 if (error && retry) {
292 error = unicode_to_mac_roman(srcStr, srcLen/sizeof(UniChar), dstStr);
293 }
294 return error;
295}
296
1c79356b
A
297/*
298 * Convert UTF-8 string into HFS encoding
299 *
300 * ':' chars are converted to '/'
301 * Assumes input represents fully decomposed Unicode
302 */
303int
9bccf70c 304utf8_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, const unsigned char* srcStr, Str31 dstStr/*, int retry*/)
1c79356b
A
305{
306 int error;
307 UniChar uniStr[MAX_HFS_UNICODE_CHARS];
308 size_t ucslen;
1c79356b
A
309
310 error = utf8_decodestr(srcStr, srcLen, uniStr, &ucslen, sizeof(uniStr), ':', 0);
311 if (error == 0)
9bccf70c 312 error = unicode_to_hfs(vcb, ucslen, uniStr, dstStr, 1);
1c79356b
A
313
314 return error;
315}
316
317int
318utf8_to_mac_roman(ByteCount srcLen, const unsigned char* srcStr, Str31 dstStr)
319{
320 int error;
321 UniChar uniStr[MAX_HFS_UNICODE_CHARS];
322 size_t ucslen;
323
324 error = utf8_decodestr(srcStr, srcLen, uniStr, &ucslen, sizeof(uniStr), ':', 0);
325 if (error == 0)
326 error = unicode_to_mac_roman(uniStr, ucslen/sizeof(UniChar), dstStr);
327
328 return error;
329}
330
/*
 * HFS MacRoman to/from Unicode conversions are built into the kernel.
 * All other HFS encodings are loadable.
 */
335
336/* 0x00A0 - 0x00FF = Latin 1 Supplement (30 total) */
337static UInt8 gLatin1Table[] = {
338 /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
339 /* 0x00A0 */ 0xCA, 0xC1, 0xA2, 0xA3, 0xDB, 0xB4, '?', 0xA4, 0xAC, 0xA9, 0xBB, 0xC7, 0xC2, '?', 0xA8, 0xF8,
340 /* 0x00B0 */ 0xA1, 0XB1, '?', '?', 0xAB, 0xB5, 0xA6, 0xe1, 0xFC, '?', 0xBC, 0xC8, '?', '?', '?', 0xC0,
341 /* 0x00C0 */ '?', '?', '?', '?', '?', '?', 0xAE, '?', '?', '?', '?', '?', '?', '?', '?', '?',
342 /* 0x00D0 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xAF, '?', '?', '?', '?', '?', '?', 0xA7,
343 /* 0x00E0 */ '?', '?', '?', '?', '?', '?', 0xBE, '?', '?', '?', '?', '?', '?', '?', '?', '?',
344 /* 0x00F0 */ '?', '?', '?', '?', '?', '?', '?', 0xD6, 0xBF, '?', '?', '?', '?', '?', '?', '?'
345};
346
347/* 0x02C0 - 0x02DF = Spacing Modifiers (8 total) */
348static UInt8 gSpaceModsTable[] = {
349 /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
350 /* 0x02C0 */ '?', '?', '?', '?', '?', '?', 0xF6, 0xFF, '?', '?', '?', '?', '?', '?', '?', '?',
351 /* 0x02D0 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xF9, 0xFA, 0xFB, 0xFE, 0xF7, 0xFD, '?', '?'
352};
353
354/* 0x2010 - 0x20AF = General Punctuation (17 total) */
355static UInt8 gPunctTable[] = {
356 /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
357 /* 0x2010 */ '?', '?', '?', 0xd0, 0xd1, '?', '?', '?', 0xd4, 0xd5, 0xe2, '?', 0xd2, 0xd3, 0xe3, '?',
358 /* 0x2020 */ 0xa0, 0xe0, 0xa5, '?', '?', '?', 0xc9, '?', '?', '?', '?', '?', '?', '?', '?', '?',
359 /* 0x2030 */ 0xe4, '?', '?', '?', '?', '?', '?', '?', '?', 0xdc, 0xdd, '?', '?', '?', '?', '?',
360 /* 0x2040 */ '?', '?', '?', '?', 0xda, '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
361 /* 0x2050 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
362 /* 0x2060 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
363 /* 0x2070 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
364 /* 0x2080 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
365 /* 0x2090 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
366 /* 0x20A0 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', 0xdb, '?', '?', '?'
367};
368
369/* 0x22xx = Mathematical Operators (11 total) */
370static UInt8 gMathTable[] = {
371 /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
372 /* 0x2200 */ '?', '?', 0xb6, '?', '?', '?', 0xc6, '?', '?', '?', '?', '?', '?', '?', '?', 0xb8,
373 /* 0x2210 */ '?', 0xb7, '?', '?', '?', '?', '?', '?', '?', '?', 0xc3, '?', '?', '?', 0xb0, '?',
374 /* 0x2220 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', 0xba, '?', '?', '?', '?',
375 /* 0x2230 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
376 /* 0x2240 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xc5, '?', '?', '?', '?', '?', '?', '?',
377 /* 0x2250 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
378 /* 0x2260 */ 0xad, '?', '?', '?', 0xb2, 0xb3, '?', '?'
379};
380
381/* */
382static UInt8 gReverseCombTable[] = {
383 /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
384 /* 0x40 */ 0xDA, 0x40, 0xDA, 0xDA, 0xDA, 0x56, 0xDA, 0xDA, 0xDA, 0x6C, 0xDA, 0xDA, 0xDA, 0xDA, 0x82, 0x98,
385 /* 0x50 */ 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, 0xAE, 0xDA, 0xDA, 0xDA, 0xC4, 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, 0xDA,
386 /* 0x60 */ 0xDA, 0x4B, 0xDA, 0xDA, 0xDA, 0x61, 0xDA, 0xDA, 0xDA, 0x77, 0xDA, 0xDA, 0xDA, 0xDA, 0x8D, 0xA3,
387 /* 0x70 */ 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, 0xB9, 0xDA, 0xDA, 0xDA, 0xCF, 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, 0xDA,
388
389 /* Combining Diacritical Marks (0x0300 - 0x030A) */
390 /* 0 1 2 3 4 5 6 7 8 9 A */
391 /* 'A' */
392 /* 0x0300 */ 0xCB, 0xE7, 0xE5, 0xCC, '?', '?', '?', '?', 0x80, '?', 0x81,
393
394 /* 'a' */
395 /* 0x0300 */ 0x88, 0x87, 0x89, 0x8B, '?', '?', '?', '?', 0x8A, '?', 0x8C,
396
397 /* 'E' */
398 /* 0x0300 */ 0xE9, 0x83, 0xE6, '?', '?', '?', '?', '?', 0xE8, '?', '?',
399
400 /* 'e' */
401 /* 0x0300 */ 0x8F, 0x8E, 0x90, '?', '?', '?', '?', '?', 0x91, '?', '?',
402
403 /* 'I' */
404 /* 0x0300 */ 0xED, 0xEA, 0xEB, '?', '?', '?', '?', '?', 0xEC, '?', '?',
405
406 /* 'i' */
407 /* 0x0300 */ 0x93, 0x92, 0x94, '?', '?', '?', '?', '?', 0x95, '?', '?',
408
409 /* 'N' */
410 /* 0x0300 */ '?', '?', '?', 0x84, '?', '?', '?', '?', '?', '?', '?',
411
412 /* 'n' */
413 /* 0x0300 */ '?', '?', '?', 0x96, '?', '?', '?', '?', '?', '?', '?',
414
415 /* 'O' */
416 /* 0x0300 */ 0xF1, 0xEE, 0xEF, 0xCD, '?', '?', '?', '?', 0x85, '?', '?',
417
418 /* 'o' */
419 /* 0x0300 */ 0x98, 0x97, 0x99, 0x9B, '?', '?', '?', '?', 0x9A, '?', '?',
420
421 /* 'U' */
422 /* 0x0300 */ 0xF4, 0xF2, 0xF3, '?', '?', '?', '?', '?', 0x86, '?', '?',
423
424 /* 'u' */
425 /* 0x0300 */ 0x9D, 0x9C, 0x9E, '?', '?', '?', '?', '?', 0x9F, '?', '?',
426
427 /* 'Y' */
428 /* 0x0300 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xD9, '?', '?',
429
430 /* 'y' */
431 /* 0x0300 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xD8, '?', '?',
432
433 /* else */
434 /* 0x0300 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?'
435};
436
437
438/*
439 * Convert Unicode string into HFS MacRoman encoding
440 *
441 * Assumes Unicode input is fully decomposed
442 */
443static int unicode_to_mac_roman(UniChar *uni_str, UInt32 unicodeChars, Str31 hfs_str)
444{
445 UInt8 *p;
446 const UniChar *u;
447 UniChar c;
448 UniChar mask;
449 UInt16 inputChars;
450 UInt16 pascalChars;
451 OSErr result = noErr;
452 UInt8 lsb;
453 UInt8 prevChar;
454 UInt8 mc;
455
456 mask = (UniChar) 0xFF80;
457 p = &hfs_str[1];
458 u = uni_str;
459 inputChars = unicodeChars;
460 pascalChars = prevChar = 0;
461
462 while (inputChars) {
463 c = *(u++);
464 lsb = (UInt8) c;
465
466 /*
467 * If its not 7-bit ascii, then we need to map it
468 */
469 if ( c & mask ) {
470 mc = '?';
471 switch (c & 0xFF00) {
472 case 0x0000:
473 if (lsb >= 0xA0)
474 mc = gLatin1Table[lsb - 0xA0];
475 break;
476
477 case 0x0200:
478 if (lsb >= 0xC0 && lsb <= 0xDF)
479 mc = gSpaceModsTable[lsb - 0xC0];
480 break;
481
482 case 0x2000:
483 if (lsb >= 0x10 && lsb <= 0xAF)
484 mc = gPunctTable[lsb- 0x10];
485 break;
486
487 case 0x2200:
488 if (lsb <= 0x68)
489 mc = gMathTable[lsb];
490 break;
491
492 case 0x0300:
493 if (c <= 0x030A) {
494 if (prevChar >= 'A' && prevChar < 'z') {
495 mc = gReverseCombTable[gReverseCombTable[prevChar - 0x40] + lsb];
496 --p; /* backup over base char */
497 --pascalChars;
498 }
499 } else {
500 switch (c) {
501 case 0x0327: /* combining cedilla */
502 if (prevChar == 'C')
503 mc = 0x82;
504 else if (prevChar == 'c')
505 mc = 0x8D;
506 else
507 break;
508 --p; /* backup over base char */
509 --pascalChars;
510 break;
511
512 case 0x03A9: mc = 0xBD; break; /* omega */
513
514 case 0x03C0: mc = 0xB9; break; /* pi */
515 }
516 }
517 break;
518
519 default:
520 switch (c) {
521 case 0x0131: mc = 0xf5; break; /* dotless i */
522
523 case 0x0152: mc = 0xce; break; /* OE */
524
525 case 0x0153: mc = 0xcf; break; /* oe */
526
527