]>
Commit | Line | Data |
---|---|---|
de8ee011 A |
1 | // |
2 | // lf_hfs_unicode_wrappers.c | |
3 | // livefiles_hfs | |
4 | // | |
5 | // Created by Yakov Ben Zaken on 22/03/2018. | |
6 | // | |
7 | ||
#include "lf_hfs_unicode_wrappers.h"
#include "lf_hfs_ucs_string_cmp_data.h"
#include "lf_hfs_sbunicode.h"

#include <errno.h>
11 | ||
12 | ||
13 | ||
/*
 * Limits on the number of characters in a filename extension.
 * Neither value counts the leading dot.
 */
enum {
    kMinFileExtensionChars = 1,
    kMaxFileExtensionChars = 5
};
18 | ||
19 | ||
/*
 * Non-zero when c is an ASCII letter (a-z, A-Z) or digit (0-9), i.e. a
 * character that may appear in a filename extension.
 * Note: the argument is evaluated more than once.
 */
#define EXTENSIONCHAR(c)    (((c) >= 'a' && (c) <= 'z') || \
                             ((c) >= 'A' && (c) <= 'Z') || \
                             ((c) >= '0' && (c) <= '9'))
23 | ||
24 | ||
/*
 * Non-zero when c is an upper-case hexadecimal digit (0-9 or A-F).
 * Lower-case a-f is deliberately not accepted.
 * Note: the argument is evaluated more than once.
 */
#define IsHexDigit(c)   (((c) >= '0' && (c) <= '9') || ((c) >= 'A' && (c) <= 'F'))
27 | ||
28 | ||
29 | static void | |
30 | GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr ); | |
31 | ||
32 | ||
33 | static u_int32_t | |
34 | HexStringToInteger( u_int32_t length, const u_int8_t *hexStr ); | |
35 | ||
36 | ||
37 | /* | |
38 | * Get filename extension (if any) as a C string | |
39 | */ | |
40 | static void | |
41 | GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr) | |
42 | { | |
43 | u_int64_t i; | |
44 | UniChar c; | |
45 | u_int16_t extChars; /* number of extension chars (excluding dot) */ | |
46 | u_int16_t maxExtChars; | |
47 | Boolean foundExtension; | |
48 | ||
49 | extStr[0] = '\0'; /* assume there's no extension */ | |
50 | ||
51 | if ( length < 3 ) | |
52 | return; /* "x.y" is smallest possible extension */ | |
53 | ||
54 | if ( length < (kMaxFileExtensionChars + 2) ) | |
55 | maxExtChars = length - 2; /* save room for prefix + dot */ | |
56 | else | |
57 | maxExtChars = kMaxFileExtensionChars; | |
58 | ||
59 | i = length; | |
60 | extChars = 0; | |
61 | foundExtension = false; | |
62 | ||
63 | while ( extChars <= maxExtChars ) { | |
64 | c = unicodeStr[--i]; | |
65 | ||
66 | /* look for leading dot */ | |
67 | if ( c == (UniChar) '.' ) { | |
68 | if ( extChars > 0 ) /* cannot end with a dot */ | |
69 | foundExtension = true; | |
70 | break; | |
71 | } | |
72 | ||
73 | if ( EXTENSIONCHAR(c) ) | |
74 | ++extChars; | |
75 | else | |
76 | break; | |
77 | } | |
78 | ||
79 | /* if we found one then copy it */ | |
80 | if ( foundExtension ) { | |
81 | u_int8_t *extStrPtr = (u_int8_t *)extStr; | |
82 | const UniChar *unicodeStrPtr = &unicodeStr[i]; | |
83 | ||
84 | for ( i = 0; i <= extChars; ++i ) | |
85 | *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++); | |
86 | extStr[extChars + 1] = '\0'; /* terminate extension + dot */ | |
87 | } | |
88 | } | |
89 | ||
90 | // | |
91 | // FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering | |
92 | // | |
93 | // IF RESULT | |
94 | // -------------------------- | |
95 | // str1 < str2 => -1 | |
96 | // str1 = str2 => 0 | |
97 | // str1 > str2 => +1 | |
98 | // | |
99 | // The lower case table starts with 256 entries (one for each of the upper bytes | |
100 | // of the original Unicode char). If that entry is zero, then all characters with | |
101 | // that upper byte are already case folded. If the entry is non-zero, then it is | |
102 | // the _index_ (not byte offset) of the start of the sub-table for the characters | |
103 | // with that upper byte. All ignorable characters are folded to the value zero. | |
104 | // | |
105 | // In pseudocode: | |
106 | // | |
107 | // Let c = source Unicode character | |
108 | // Let table[] = lower case table | |
109 | // | |
110 | // lower = table[highbyte(c)] | |
111 | // if (lower == 0) | |
112 | // lower = c | |
113 | // else | |
114 | // lower = table[lower+lowbyte(c)] | |
115 | // | |
116 | // if (lower == 0) | |
117 | // ignore this character | |
118 | // | |
119 | // To handle ignorable characters, we now need a loop to find the next valid character. | |
120 | // Also, we can't pre-compute the number of characters to compare; the string length might | |
121 | // be larger than the number of non-ignorable characters. Further, we must be able to handle | |
122 | // ignorable characters at any point in the string, including as the first or last characters. | |
123 | // We use a zero value as a sentinel to detect both end-of-string and ignorable characters. | |
124 | // Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename, | |
125 | // the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is | |
126 | // an invalid Unicode character). | |
127 | // | |
128 | // Pseudocode: | |
129 | // | |
130 | // while (1) { | |
131 | // c1 = GetNextValidChar(str1) // returns zero if at end of string | |
132 | // c2 = GetNextValidChar(str2) | |
133 | // | |
134 | // if (c1 != c2) break // found a difference | |
135 | // | |
136 | // if (c1 == 0) // reached end of string on both strings at once? | |
137 | // return 0; // yes, so strings are equal | |
138 | // } | |
139 | // | |
140 | // // When we get here, c1 != c2. So, we just need to determine which one is less. | |
141 | // if (c1 < c2) | |
142 | // return -1; | |
143 | // else | |
144 | // return 1; | |
145 | // | |
146 | ||
147 | int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1, | |
148 | register ConstUniCharArrayPtr str2, register ItemCount length2) | |
149 | { | |
150 | register u_int16_t c1,c2; | |
151 | register u_int16_t temp; | |
152 | register u_int16_t* lowerCaseTable; | |
153 | ||
154 | lowerCaseTable = (u_int16_t*) gLowerCaseTable; | |
155 | ||
156 | while (1) { | |
157 | /* Set default values for c1, c2 in case there are no more valid chars */ | |
158 | c1 = 0; | |
159 | c2 = 0; | |
160 | ||
161 | /* Find next non-ignorable char from str1, or zero if no more */ | |
162 | while (length1 && c1 == 0) { | |
163 | c1 = *(str1++); | |
164 | --length1; | |
165 | /* check for basic latin first */ | |
166 | if (c1 < 0x0100) { | |
167 | c1 = gLatinCaseFold[c1]; | |
168 | break; | |
169 | } | |
170 | /* case fold if neccessary */ | |
171 | if ((temp = lowerCaseTable[c1>>8]) != 0) | |
172 | c1 = lowerCaseTable[temp + (c1 & 0x00FF)]; | |
173 | } | |
174 | ||
175 | ||
176 | /* Find next non-ignorable char from str2, or zero if no more */ | |
177 | while (length2 && c2 == 0) { | |
178 | c2 = *(str2++); | |
179 | --length2; | |
180 | /* check for basic latin first */ | |
181 | if (c2 < 0x0100) { | |
182 | c2 = gLatinCaseFold[c2]; | |
183 | break; | |
184 | } | |
185 | /* case fold if neccessary */ | |
186 | if ((temp = lowerCaseTable[c2>>8]) != 0) | |
187 | c2 = lowerCaseTable[temp + (c2 & 0x00FF)]; | |
188 | } | |
189 | ||
190 | if (c1 != c2) // found a difference, so stop looping | |
191 | break; | |
192 | ||
193 | if (c1 == 0) // did we reach the end of both strings at the same time? | |
194 | return 0; // yes, so strings are equal | |
195 | } | |
196 | ||
197 | if (c1 < c2) | |
198 | return -1; | |
199 | else | |
200 | return 1; | |
201 | } | |
202 | ||
203 | ||
204 | /* | |
205 | * UnicodeBinaryCompare | |
206 | * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them. | |
207 | * | |
208 | * Results are emitted like FastUnicodeCompare: | |
209 | * | |
210 | * | |
211 | * IF RESULT | |
212 | * -------------------------- | |
213 | * str1 < str2 => -1 | |
214 | * str1 = str2 => 0 | |
215 | * str1 > str2 => +1 | |
216 | * | |
217 | * The case matching source code is greatly simplified due to the lack of case-folding | |
218 | * in this comparison routine. We compare, in order: the lengths, then do character-by- | |
219 | * character comparisons. | |
220 | * | |
221 | */ | |
222 | int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1, | |
223 | register ConstUniCharArrayPtr str2, register ItemCount len2) { | |
224 | uint16_t c1 =0; | |
225 | uint16_t c2 =0; | |
226 | ItemCount string_length; | |
227 | int32_t result = 0; | |
228 | ||
229 | /* First generate the string length (for comparison purposes) */ | |
230 | if (len1 < len2) { | |
231 | string_length = len1; | |
232 | --result; | |
233 | } | |
234 | else if (len1 > len2) { | |
235 | string_length = len2; | |
236 | ++result; | |
237 | } | |
238 | else { | |
239 | string_length = len1; | |
240 | } | |
241 | ||
242 | /* now compare the two string pointers */ | |
243 | while (string_length--) { | |
244 | c1 = *(str1++); | |
245 | c2 = *(str2++); | |
246 | ||
247 | if (c1 > c2) { | |
248 | result = 1; | |
249 | break; | |
250 | } | |
251 | ||
252 | if (c1 < c2) { | |
253 | result = -1; | |
254 | break; | |
255 | } | |
256 | /* If equal, iterate to the next two respective chars */ | |
257 | } | |
258 | ||
259 | return result; | |
260 | } | |
261 | ||
262 | /* | |
263 | * extract the file id from a mangled name | |
264 | */ | |
265 | HFSCatalogNodeID | |
266 | GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength) | |
267 | { | |
268 | short extChars; | |
269 | short i; | |
270 | u_int8_t c; | |
271 | ||
272 | *prefixLength = 0; | |
273 | ||
274 | if ( filename == NULL ) | |
275 | return 0; | |
276 | ||
277 | if ( length < 28 ) | |
278 | return 0; /* too small to have been mangled */ | |
279 | ||
280 | /* big enough for a file ID (#10) and an extension (.x) ? */ | |
281 | if ( length > 5 ) | |
282 | extChars = CountFilenameExtensionChars(filename, length); | |
283 | else | |
284 | extChars = 0; | |
285 | ||
286 | /* skip over dot plus extension characters */ | |
287 | if ( extChars > 0 ) | |
288 | length -= (extChars + 1); | |
289 | ||
290 | /* scan for file id digits */ | |
291 | for ( i = length - 1; i >= 0; --i) { | |
292 | c = filename[i]; | |
293 | ||
294 | /* look for file ID marker */ | |
295 | if ( c == '#' ) { | |
296 | if ( (length - i) < 3 ) | |
297 | break; /* too small to be a file ID */ | |
298 | ||
299 | *prefixLength = i; | |
300 | return HexStringToInteger(length - i - 1, &filename[i+1]); | |
301 | } | |
302 | ||
303 | if ( !IsHexDigit(c) ) | |
304 | break; /* file ID string must have hex digits */ | |
305 | } | |
306 | ||
307 | return 0; | |
308 | } | |
309 | ||
310 | /* | |
311 | * Count filename extension characters (if any) | |
312 | */ | |
313 | u_int32_t | |
314 | CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length ) | |
315 | { | |
316 | UniChar c; | |
317 | u_int16_t maxExtChars; | |
318 | ||
319 | if ( length < 3 ) | |
320 | return 0; /* "x.y" is smallest possible extension */ | |
321 | ||
322 | if ( length < (kMaxFileExtensionChars + 2) ) | |
323 | maxExtChars = length - 2; /* save room for prefix + dot */ | |
324 | else | |
325 | maxExtChars = kMaxFileExtensionChars; | |
326 | ||
327 | u_int32_t extChars = 0; /* number of extension chars (excluding dot) - assume there's no extension */ | |
328 | u_int32_t i = length - 1; /* index to last ascii character */ | |
329 | ||
330 | while ( extChars <= maxExtChars ) { | |
331 | c = filename[i--]; | |
332 | ||
333 | /* look for leading dot */ | |
334 | if ( c == (u_int8_t) '.' ) { | |
335 | if ( extChars > 0 ) /* cannot end with a dot */ | |
336 | return (extChars); | |
337 | ||
338 | break; | |
339 | } | |
340 | ||
341 | if ( EXTENSIONCHAR(c) ) | |
342 | ++extChars; | |
343 | else | |
344 | break; | |
345 | } | |
346 | ||
347 | return 0; | |
348 | } | |
349 | ||
/*
 * HexStringToInteger
 *
 * Convert the first `length` bytes of hexStr, interpreted as upper-case
 * hexadecimal digits (0-9, A-F), to an unsigned 32-bit value.
 *
 * Returns 0 as soon as any other byte (including lower-case a-f) is seen.
 * Only the low 32 bits are kept when more than eight digits are supplied.
 */
static u_int32_t
HexStringToInteger(u_int32_t length, const u_int8_t *hexStr)
{
    u_int32_t   result = 0;
    u_int32_t   n;

    for ( n = 0; n < length; ++n ) {
        const u_int8_t  ch = hexStr[n];
        u_int32_t       digit;

        if ( ch >= '0' && ch <= '9' )
            digit = (u_int32_t) ch - (u_int32_t) '0';
        else if ( ch >= 'A' && ch <= 'F' )
            digit = 10 + ((u_int32_t) ch - (u_int32_t) 'A');
        else
            return 0;   /* bad character */

        result = (result << 4) + digit;
    }

    return result;
}
377 | ||
378 | OSErr | |
379 | ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen, | |
380 | ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid) | |
381 | { | |
382 | ByteCount subMaxLen; | |
383 | size_t utf8len; | |
384 | char fileIDStr[15]; | |
385 | char extStr[15]; | |
386 | ||
387 | snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid); | |
388 | GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr); | |
389 | ||
390 | /* remove extension chars from source */ | |
391 | srcLen -= strlen(extStr) * sizeof(UniChar); | |
392 | subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr)); | |
393 | ||
394 | (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', UTF_ADD_NULL_TERM); | |
395 | ||
396 | strlcat((char *)dstStr, fileIDStr, maxDstLen); | |
397 | strlcat((char *)dstStr, extStr, maxDstLen); | |
398 | *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr)); | |
399 | ||
400 | return noErr; | |
401 | } |