]> git.saurik.com Git - apple/hfs.git/blob - core/UnicodeWrappers.c
8e5b6e67f92b66bf2e19a14dd19fb8441c090a89
[apple/hfs.git] / core / UnicodeWrappers.c
1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 File: UnicodeWrappers.c
30
31 Contains: Wrapper routines for Unicode conversion and comparison.
32
33 */
34
35 #include <sys/param.h>
36 #include <sys/utfconv.h>
37
38 #include "hfs_macos_defs.h"
39 #include "UCStringCompareData.h"
40
41 #include "FileMgrInternal.h"
42 #include "HFSUnicodeWrappers.h"
43
/*
 * Limits on the number of characters in a filename extension.
 * Neither value counts the leading dot.
 */
enum {
	kMinFileExtensionChars = 1,
	kMaxFileExtensionChars = 5
};


/*
 * Non-zero when c is an ASCII letter (either case) or an ASCII decimal
 * digit.  The argument is evaluated more than once, so it must not have
 * side effects.
 */
#define EXTENSIONCHAR(c)	\
	(((c) >= 0x61 && (c) <= 0x7A) ||	/* 'a' .. 'z' */ \
	 ((c) >= 0x41 && (c) <= 0x5A) ||	/* 'A' .. 'Z' */ \
	 ((c) >= 0x30 && (c) <= 0x39))		/* '0' .. '9' */


/*
 * Non-zero when c is a decimal digit or an UPPER-CASE hex letter (A-F).
 * Lower-case a-f is rejected.  The argument is evaluated more than once.
 */
#define IsHexDigit(c)	\
	(((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \
	 ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F'))
57
58
59 static void GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );
60
61
62 static u_int32_t HexStringToInteger( u_int32_t length, const u_int8_t *hexStr );
63
64
65 /*
66 * Get filename extension (if any) as a C string
67 */
68 static void
69 GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
70 {
71 u_int32_t i;
72 UniChar c;
73 u_int16_t extChars; /* number of extension chars (excluding dot) */
74 u_int16_t maxExtChars;
75 Boolean foundExtension;
76
77 extStr[0] = '\0'; /* assume there's no extension */
78
79 if ( length < 3 )
80 return; /* "x.y" is smallest possible extension */
81
82 if ( length < (kMaxFileExtensionChars + 2) )
83 maxExtChars = length - 2; /* save room for prefix + dot */
84 else
85 maxExtChars = kMaxFileExtensionChars;
86
87 i = length;
88 extChars = 0;
89 foundExtension = false;
90
91 while ( extChars <= maxExtChars ) {
92 c = unicodeStr[--i];
93
94 /* look for leading dot */
95 if ( c == (UniChar) '.' ) {
96 if ( extChars > 0 ) /* cannot end with a dot */
97 foundExtension = true;
98 break;
99 }
100
101 if ( EXTENSIONCHAR(c) )
102 ++extChars;
103 else
104 break;
105 }
106
107 /* if we found one then copy it */
108 if ( foundExtension ) {
109 u_int8_t *extStrPtr = (u_int8_t *)extStr;
110 const UniChar *unicodeStrPtr = &unicodeStr[i];
111
112 for ( i = 0; i <= extChars; ++i )
113 *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++);
114 extStr[extChars + 1] = '\0'; /* terminate extension + dot */
115 }
116 }
117
118
119
120 /*
121 * Count filename extension characters (if any)
122 */
123 u_int32_t
124 CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length )
125 {
126 u_int32_t i;
127 UniChar c;
128 u_int32_t extChars; /* number of extension chars (excluding dot) */
129 u_int16_t maxExtChars;
130 Boolean foundExtension;
131
132 if ( length < 3 )
133 return 0; /* "x.y" is smallest possible extension */
134
135 if ( length < (kMaxFileExtensionChars + 2) )
136 maxExtChars = length - 2; /* save room for prefix + dot */
137 else
138 maxExtChars = kMaxFileExtensionChars;
139
140 extChars = 0; /* assume there's no extension */
141 i = length - 1; /* index to last ascii character */
142 foundExtension = false;
143
144 while ( extChars <= maxExtChars ) {
145 c = filename[i--];
146
147 /* look for leading dot */
148 if ( c == (u_int8_t) '.' ) {
149 if ( extChars > 0 ) /* cannot end with a dot */
150 return (extChars);
151
152 break;
153 }
154
155 if ( EXTENSIONCHAR(c) )
156 ++extChars;
157 else
158 break;
159 }
160
161 return 0;
162 }
163
164
165 /*
166 * extract the file id from a mangled name
167 */
168 HFSCatalogNodeID
169 GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength)
170 {
171 short extChars;
172 short i;
173 u_int8_t c;
174
175 *prefixLength = 0;
176
177 if ( filename == NULL )
178 return 0;
179
180 if ( length < 28 )
181 return 0; /* too small to have been mangled */
182
183 /* big enough for a file ID (#10) and an extension (.x) ? */
184 if ( length > 5 )
185 extChars = CountFilenameExtensionChars(filename, length);
186 else
187 extChars = 0;
188
189 /* skip over dot plus extension characters */
190 if ( extChars > 0 )
191 length -= (extChars + 1);
192
193 /* scan for file id digits */
194 for ( i = length - 1; i >= 0; --i) {
195 c = filename[i];
196
197 /* look for file ID marker */
198 if ( c == '#' ) {
199 if ( (length - i) < 3 )
200 break; /* too small to be a file ID */
201
202 *prefixLength = i;
203 return HexStringToInteger(length - i - 1, &filename[i+1]);
204 }
205
206 if ( !IsHexDigit(c) )
207 break; /* file ID string must have hex digits */
208 }
209
210 return 0;
211 }
212
213
214
/*
 * HexStringToInteger
 *
 * Convert 'length' bytes of upper-case hexadecimal text to an integer.
 *
 *	length	number of bytes of hexStr to convert
 *	hexStr	the digits; only '0'-'9' and 'A'-'F' are accepted
 *
 * Returns the converted value, or 0 if any byte is not an accepted digit
 * (lower-case 'a'-'f' included) or if length is 0.  Digits beyond the
 * eighth shift the oldest ones out of the 32-bit result.
 */
static u_int32_t
HexStringToInteger(u_int32_t length, const u_int8_t *hexStr)
{
	u_int32_t	result = 0;
	u_int32_t	pos;

	for ( pos = 0; pos < length; ++pos ) {
		u_int8_t	digit = hexStr[pos];
		u_int32_t	nibble;

		if ( digit >= '0' && digit <= '9' )
			nibble = (u_int32_t) digit - (u_int32_t) '0';
		else if ( digit >= 'A' && digit <= 'F' )
			nibble = 10 + ((u_int32_t) digit - (u_int32_t) 'A');
		else
			return 0;	/* bad character */

		result = (result << 4) + nibble;
	}

	return result;
}
242
243
244 /*
245 * Routine: FastRelString
246 *
247 * Output: returns -1 if str1 < str2
248 * returns 1 if str1 > str2
249 * return 0 if equal
250 *
251 */
252 int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 )
253 {
254 u_int16_t* compareTable;
255 int32_t bestGuess;
256 u_int8_t length, length2;
257 u_int8_t delta;
258
259 delta = 0;
260 length = *(str1++);
261 length2 = *(str2++);
262
263 if (length == length2)
264 bestGuess = 0;
265 else if (length < length2)
266 {
267 bestGuess = -1;
268 delta = length2 - length;
269 }
270 else
271 {
272 bestGuess = 1;
273 length = length2;
274 }
275
276 compareTable = (u_int16_t*) gCompareTable;
277
278 while (length--)
279 {
280 u_int8_t aChar, bChar;
281
282 aChar = *(str1++);
283 bChar = *(str2++);
284
285 if (aChar != bChar) // If they don't match exacly, do case conversion
286 {
287 u_int16_t aSortWord, bSortWord;
288
289 aSortWord = compareTable[aChar];
290 bSortWord = compareTable[bChar];
291
292 if (aSortWord > bSortWord)
293 return 1;
294
295 if (aSortWord < bSortWord)
296 return -1;
297 }
298
299 // If characters match exactly, then go on to next character immediately without
300 // doing any extra work.
301 }
302
303 // if you got to here, then return bestGuess
304 return bestGuess;
305 }
306
307
308
309 //
310 // FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
311 //
312 // IF RESULT
313 // --------------------------
314 // str1 < str2 => -1
315 // str1 = str2 => 0
316 // str1 > str2 => +1
317 //
318 // The lower case table starts with 256 entries (one for each of the upper bytes
319 // of the original Unicode char). If that entry is zero, then all characters with
320 // that upper byte are already case folded. If the entry is non-zero, then it is
321 // the _index_ (not byte offset) of the start of the sub-table for the characters
322 // with that upper byte. All ignorable characters are folded to the value zero.
323 //
324 // In pseudocode:
325 //
326 // Let c = source Unicode character
327 // Let table[] = lower case table
328 //
329 // lower = table[highbyte(c)]
330 // if (lower == 0)
331 // lower = c
332 // else
333 // lower = table[lower+lowbyte(c)]
334 //
335 // if (lower == 0)
336 // ignore this character
337 //
338 // To handle ignorable characters, we now need a loop to find the next valid character.
339 // Also, we can't pre-compute the number of characters to compare; the string length might
340 // be larger than the number of non-ignorable characters. Further, we must be able to handle
341 // ignorable characters at any point in the string, including as the first or last characters.
342 // We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
343 // Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
344 // the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
345 // an invalid Unicode character).
346 //
347 // Pseudocode:
348 //
349 // while (1) {
350 // c1 = GetNextValidChar(str1) // returns zero if at end of string
351 // c2 = GetNextValidChar(str2)
352 //
353 // if (c1 != c2) break // found a difference
354 //
355 // if (c1 == 0) // reached end of string on both strings at once?
356 // return 0; // yes, so strings are equal
357 // }
358 //
359 // // When we get here, c1 != c2. So, we just need to determine which one is less.
360 // if (c1 < c2)
361 // return -1;
362 // else
363 // return 1;
364 //
365
366 int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
367 register ConstUniCharArrayPtr str2, register ItemCount length2)
368 {
369 register u_int16_t c1,c2;
370 register u_int16_t temp;
371 register u_int16_t* lowerCaseTable;
372
373 lowerCaseTable = (u_int16_t*) gLowerCaseTable;
374
375 while (1) {
376 /* Set default values for c1, c2 in case there are no more valid chars */
377 c1 = 0;
378 c2 = 0;
379
380 /* Find next non-ignorable char from str1, or zero if no more */
381 while (length1 && c1 == 0) {
382 c1 = *(str1++);
383 --length1;
384 /* check for basic latin first */
385 if (c1 < 0x0100) {
386 c1 = gLatinCaseFold[c1];
387 break;
388 }
389 /* case fold if neccessary */
390 if ((temp = lowerCaseTable[c1>>8]) != 0)
391 c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
392 }
393
394
395 /* Find next non-ignorable char from str2, or zero if no more */
396 while (length2 && c2 == 0) {
397 c2 = *(str2++);
398 --length2;
399 /* check for basic latin first */
400 if (c2 < 0x0100) {
401 c2 = gLatinCaseFold[c2];
402 break;
403 }
404 /* case fold if neccessary */
405 if ((temp = lowerCaseTable[c2>>8]) != 0)
406 c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
407 }
408
409 if (c1 != c2) // found a difference, so stop looping
410 break;
411
412 if (c1 == 0) // did we reach the end of both strings at the same time?
413 return 0; // yes, so strings are equal
414 }
415
416 if (c1 < c2)
417 return -1;
418 else
419 return 1;
420 }
421
422 /*
423 * UnicodeBinaryCompare
424 * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them.
425 *
426 * Results are emitted like FastUnicodeCompare:
427 *
428 *
429 * IF RESULT
430 * --------------------------
431 * str1 < str2 => -1
432 * str1 = str2 => 0
433 * str1 > str2 => +1
434 *
435 * The case matching source code is greatly simplified due to the lack of case-folding
436 * in this comparison routine. We compare, in order: the lengths, then do character-by-
437 * character comparisons.
438 *
439 */
440 int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1,
441 register ConstUniCharArrayPtr str2, register ItemCount len2) {
442 uint16_t c1;
443 uint16_t c2;
444 int string_length;
445 int32_t result = 0;
446
447 /* Set default values for the two character pointers */
448 c1 = 0;
449 c2 = 0;
450
451 /* First generate the string length (for comparison purposes) */
452 if (len1 < len2) {
453 string_length = len1;
454 --result;
455 }
456 else if (len1 > len2) {
457 string_length = len2;
458 ++result;
459 }
460 else {
461 string_length = len1;
462 }
463
464 /* now compare the two string pointers */
465 while (string_length--) {
466 c1 = *(str1++);
467 c2 = *(str2++);
468
469 if (c1 > c2) {
470 result = 1;
471 break;
472 }
473
474 if (c1 < c2) {
475 result = -1;
476 break;
477 }
478 /* If equal, iterate to the next two respective chars */
479 }
480
481 return result;
482 }
483
484
485 OSErr
486 ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
487 ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid)
488 {
489 ByteCount subMaxLen;
490 size_t utf8len;
491 char fileIDStr[15];
492 char extStr[15];
493
494 snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid);
495 GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr);
496
497 /* remove extension chars from source */
498 srcLen -= strlen(extStr) * sizeof(UniChar);
499 subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr));
500
501 (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0);
502
503 strlcat((char *)dstStr, fileIDStr, maxDstLen);
504 strlcat((char *)dstStr, extStr, maxDstLen);
505 *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr));
506
507 return noErr;
508 }