]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /** |
2 | ******************************************************************************* | |
51004dcb | 3 | * Copyright (C) 2007,2012 International Business Machines Corporation, Apple Inc.,* |
73c04bcf A |
4 | * and others. All Rights Reserved. * |
5 | ******************************************************************************* | |
6 | */ | |
7 | ||
8 | #define __STDC_LIMIT_MACROS 1 | |
9 | #include "unicode/utypes.h" | |
10 | ||
4388f060 | 11 | #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED |
73c04bcf A |
12 | |
13 | #include "brkeng.h" | |
14 | #include "dictbe.h" | |
73c04bcf A |
15 | #include "aaplbfct.h" |
16 | #include "unicode/uscript.h" | |
17 | #include "unicode/uniset.h" | |
18 | #include "unicode/ucnv.h" | |
19 | #include "unicode/uchar.h" | |
20 | #include <limits.h> | |
21 | #include <unistd.h> | |
22 | #include <glob.h> | |
23 | #include <strings.h> | |
24 | #include <NSSystemDirectories.h> | |
25 | #include <sys/types.h> | |
26 | #include <sys/stat.h> | |
27 | #include <sys/mman.h> | |
28 | #include <fcntl.h> | |
29 | #include <time.h> | |
30 | #include <stdio.h> | |
31 | #include <stdint.h> | |
4388f060 A |
32 | // The following is now already included by platform.h (included indirectly by |
33 | // utypes.h) if U_PLATFORM_IS_DARWIN_BASED but it doesn't hurt to re-include here | |
729e4ab9 | 34 | #include <TargetConditionals.h> |
73c04bcf A |
35 | |
36 | U_NAMESPACE_BEGIN | |
37 | ||
38 | /* | |
39 | ****************************************************************** | |
40 | */ | |
41 | ||
42 | AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status) | |
43 | : ICULanguageBreakFactory(status) | |
44 | { | |
45 | } | |
46 | ||
47 | AppleLanguageBreakFactory::~AppleLanguageBreakFactory() { | |
48 | } | |
49 | ||
729e4ab9 | 50 | #if !TARGET_OS_EMBEDDED |
51004dcb A |
51 | #if 0 |
52 | // need to update loadDictionaryMatcherFor implementation below | |
729e4ab9 | 53 | |
73c04bcf A |
54 | // Helper function that makes a length-delimited buffer look NUL-terminated |
55 | static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) { | |
56 | if (l > 0) { | |
57 | l -= 1; | |
58 | return *p++; | |
59 | } | |
60 | else { | |
61 | return 0; | |
62 | } | |
63 | } | |
64 | ||
65 | // Add a file's worth of words to the supplied mutable dictionary | |
66 | static void addDictFile(MutableTrieDictionary *to, const char *path) { | |
67 | UErrorCode status = U_ZERO_ERROR; | |
68 | off_t fileLength; | |
69 | const char *dictRawData = (const char *) -1; | |
70 | const UChar *dictData = NULL; | |
71 | ptrdiff_t dictDataLength = 0; | |
72 | UChar *dictBuffer = NULL; | |
73 | const char *encoding = NULL; | |
74 | int32_t signatureLength = 0; | |
75 | ||
76 | // Open the dictionary file | |
77 | int dictFile = open(path, O_RDONLY, 0); | |
78 | if (dictFile == -1) { | |
79 | status = U_FILE_ACCESS_ERROR; | |
80 | } | |
81 | ||
82 | // Determine its length | |
83 | if (U_SUCCESS(status)) { | |
84 | fileLength = lseek(dictFile, 0, SEEK_END); | |
85 | (void) lseek(dictFile, 0, SEEK_SET); | |
86 | if (fileLength < 0 || fileLength > PTRDIFF_MAX) { | |
87 | status = U_FILE_ACCESS_ERROR; | |
88 | } | |
89 | } | |
90 | ||
91 | // Map it | |
92 | if (U_SUCCESS(status)) { | |
93 | dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0); | |
94 | if ((intptr_t)dictRawData == -1) { | |
95 | status = U_FILE_ACCESS_ERROR; | |
96 | } | |
97 | } | |
98 | ||
99 | // No longer need the file descriptor open | |
100 | if (dictFile != -1) { | |
101 | (void) close(dictFile); | |
102 | } | |
103 | ||
104 | // Look for a Unicode signature | |
105 | if (U_SUCCESS(status)) { | |
106 | encoding = ucnv_detectUnicodeSignature(dictRawData, fileLength, &signatureLength, &status); | |
107 | } | |
108 | ||
109 | // If necessary, convert the data to UChars | |
110 | if (U_SUCCESS(status) && encoding != NULL) { | |
111 | UConverter *conv = ucnv_open(encoding, &status); | |
112 | // Preflight to get buffer size | |
113 | uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, fileLength, &status); | |
114 | if (status == U_BUFFER_OVERFLOW_ERROR) { | |
115 | status = U_ZERO_ERROR; | |
116 | } | |
117 | if (U_SUCCESS(status)) { | |
118 | dictBuffer = new UChar[destCap+1]; | |
119 | } | |
120 | (void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, fileLength, &status); | |
121 | dictData = dictBuffer; | |
122 | dictDataLength = destCap; | |
123 | if (U_SUCCESS(status) && dictData[0] == 0xFEFF) { // BOM? Skip it | |
124 | dictData += 1; | |
125 | dictDataLength -= 1; | |
126 | } | |
127 | ||
128 | ucnv_close(conv); | |
129 | } | |
130 | ||
131 | // If it didn't need converting, just assume it's native-endian UTF-16, no BOM | |
132 | if (U_SUCCESS(status) && dictData == NULL) { | |
133 | dictData = (const UChar *) dictRawData; | |
134 | dictDataLength = fileLength/sizeof(UChar); | |
135 | } | |
136 | ||
137 | // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line, | |
138 | // stopping at the first space. | |
139 | if (U_SUCCESS(status)) { | |
140 | UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status); | |
141 | const UChar *candidate = dictData; | |
142 | int32_t length = 0; | |
143 | UChar uc = nextUChar(dictData, dictDataLength); | |
144 | while (U_SUCCESS(status) && uc) { | |
145 | while (uc && !u_isspace(uc)) { | |
146 | length += 1; | |
147 | uc = nextUChar(dictData, dictDataLength); | |
148 | } | |
149 | ||
150 | if (length > 0) { | |
151 | to->addWord(candidate, length, status); | |
152 | } | |
153 | ||
154 | // Find beginning of next line | |
155 | // 1. Skip non-line-break characters | |
156 | while (uc && !breaks.contains(uc)) { | |
157 | uc = nextUChar(dictData, dictDataLength); | |
158 | } | |
159 | // 2. Skip line break characters | |
160 | while (uc && breaks.contains(uc)) { | |
161 | uc = nextUChar(dictData, dictDataLength); | |
162 | } | |
163 | ||
164 | // Prepare for next line | |
165 | candidate = dictData-1; | |
166 | length = 0; | |
167 | } | |
168 | } | |
169 | ||
170 | // Unmap the file if we mapped it | |
171 | if ((intptr_t) dictRawData != -1) { | |
172 | (void) munmap((void *)dictRawData, (size_t) fileLength); | |
173 | } | |
174 | ||
175 | // Delete any temporary buffer | |
176 | delete [] dictBuffer; | |
177 | } | |
178 | ||
// Byte-order tag that is inserted into the user-dictionary cache file name:
// ".le" on little-endian builds, empty on big-endian builds.
#if !U_IS_BIG_ENDIAN
static const char sArchType[] = ".le";   // little endian
#else
static const char sArchType[] = "";
#endif
184 | ||
729e4ab9 | 185 | #endif |
51004dcb A |
186 | #endif |
187 | ||
/*
  As of ICU 50, ICULanguageBreakFactory replaced
    virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
  with
    virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
  and CompactTrieDictionary no longer exists. The implementation below still
  needs to be reworked for the new API.
*/
729e4ab9 | 196 | |
51004dcb A |
197 | DictionaryMatcher * |
198 | AppleLanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t breakType) { | |
0f5d89e8 | 199 | DictionaryMatcher *icuDictMatcher = ICULanguageBreakFactory::loadDictionaryMatcherFor(script); |
729e4ab9 | 200 | #if !TARGET_OS_EMBEDDED |
51004dcb A |
201 | #if 0 |
202 | // need to update loadDictionaryMatcherFor implementation below | |
73c04bcf | 203 | // We only look for a user dictionary if there is actually an ICU dictionary |
51004dcb | 204 | if (icuDictMatcher != NULL) { |
73c04bcf A |
205 | UErrorCode status = U_ZERO_ERROR; |
206 | const char *scriptName = uscript_getName(script); | |
207 | char path[256]; // PATH_MAX is overkill in this case | |
208 | char cachePath[128]; | |
209 | char cacheTargetPath[256]; | |
210 | glob_t dirGlob; | |
211 | glob_t fileGlob; | |
212 | struct stat cacheStat; | |
213 | struct stat dictStat; | |
214 | bool cacheGood = true; | |
215 | int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE); | |
216 | const CompactTrieDictionary *cacheDict = NULL; | |
217 | ||
218 | // Iterate the dictionary directories and accumulate in dirGlob | |
219 | NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask)); | |
4388f060 | 220 | while ((state = NSGetNextSearchPathEnumeration(state, path)) != 0) { |
73c04bcf A |
221 | // First get the directory itself. We should never overflow, but use strlcat anyway |
222 | // to avoid a crash if we do. | |
223 | strlcat(path, "/Dictionaries", sizeof(path)); | |
224 | if (!glob(path, globFlags, NULL, &dirGlob)) { | |
225 | globFlags |= GLOB_APPEND; | |
226 | } | |
227 | } | |
228 | ||
229 | // If there are no Dictionaries directories, ignore any cache file and return the ICU | |
230 | // standard dictionary | |
231 | // TODO: Delete the cache? | |
232 | if (dirGlob.gl_pathc == 0) { | |
233 | globfree(&dirGlob); | |
51004dcb | 234 | return icuDictMatcher; |
73c04bcf A |
235 | } |
236 | ||
237 | // See if there is a cache file already; get its mod time | |
238 | // TODO: should we be using geteuid() here instead of getuid()? | |
239 | state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask); | |
240 | state = NSGetNextSearchPathEnumeration(state, cachePath); // Just use first one | |
241 | // Create the cache file name. We should never overflow, but use snprintf to avoid a crash | |
242 | // if we do. | |
243 | snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid()); | |
244 | if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) { | |
245 | cacheGood = false; // No file or bad permissions or type | |
246 | } | |
247 | ||
248 | // Stat the dictionary folders, and glob the dictionary files | |
249 | globFlags &= ~GLOB_APPEND; | |
250 | char **pathsp = dirGlob.gl_pathv; | |
251 | const char *dictpath; | |
4388f060 | 252 | while ((dictpath = *pathsp++) != NULL) { |
73c04bcf A |
253 | // Stat the directory -- ignore if stat failure |
254 | if (!stat(dictpath, &dictStat)) { | |
255 | // Glob the dictionaries in the directory | |
256 | snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName); | |
257 | if (!glob(path, globFlags, NULL, &fileGlob)) { | |
258 | globFlags |= GLOB_APPEND; | |
259 | } | |
260 | // If the directory has been modified after the cache file, we need to rebuild; | |
261 | // a dictionary might have been deleted. | |
262 | if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) { | |
263 | cacheGood = false; | |
264 | } | |
265 | } | |
266 | } | |
267 | ||
268 | // No longer need the directory glob | |
269 | globfree(&dirGlob); | |
270 | ||
271 | // If there are no dictionaries, ignore the cache file and return the ICU dictionary | |
272 | // TODO: Delete the cache? | |
273 | if (fileGlob.gl_pathc == 0) { | |
274 | globfree(&fileGlob); | |
51004dcb | 275 | return icuDictMatcher; |
73c04bcf A |
276 | } |
277 | ||
278 | // Now compare the last modified stamp for the cache against all the dictionaries | |
279 | pathsp = fileGlob.gl_pathv; | |
280 | while (cacheGood && (dictpath = *pathsp++)) { | |
281 | // Stat the dictionary -- ignore if stat failure | |
282 | if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) { | |
283 | cacheGood = false; | |
284 | } | |
285 | } | |
286 | ||
287 | // Do we need to build the dictionary cache? | |
288 | if (!cacheGood) { | |
289 | // Create a mutable dictionary from the ICU dictionary | |
51004dcb | 290 | MutableTrieDictionary *sum = icuDictMatcher->cloneMutable(status); |
73c04bcf A |
291 | pathsp = fileGlob.gl_pathv; |
292 | while (U_SUCCESS(status) && (dictpath = *pathsp++)) { | |
293 | // Add the contents of a file to the sum | |
294 | addDictFile(sum, dictpath); | |
295 | } | |
296 | ||
297 | // Create a compact (read-only) dictionary | |
298 | CompactTrieDictionary compact(*sum, status); | |
299 | delete sum; | |
300 | ||
301 | if (U_SUCCESS(status)) { | |
302 | // Open a temp file to write out the cache | |
303 | strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath)); | |
304 | int temp = mkstemp(cachePath); | |
305 | if (temp == -1) { | |
306 | status = U_FILE_ACCESS_ERROR; | |
307 | } | |
308 | size_t dictSize = compact.dataSize(); | |
309 | if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != dictSize) { | |
310 | status = U_FILE_ACCESS_ERROR; | |
311 | } | |
312 | // Rename the temp file to the cache. Note that race conditions here are | |
313 | // fine, as the file system operations are atomic. If an outdated version wins | |
314 | // over a newer version, it will get rebuilt at the next app launch due to the | |
315 | // modification time checks above. We don't care that any given app launch gets | |
316 | // the most up-to-date cache (impossible since we can't lock all the Dictionaries | |
317 | // directories), only that the cache (eventually) reflects the current state of | |
318 | // any user dictionaries. That will happen on the next app launch after changes | |
319 | // to the user dictionaries quiesce. | |
320 | if (U_SUCCESS(status)) { | |
321 | if (rename(cachePath, cacheTargetPath)) { | |
322 | status = U_FILE_ACCESS_ERROR; | |
323 | (void) unlink(cachePath); // Clean up the temp file | |
324 | } | |
325 | } | |
326 | if (temp != -1) { | |
327 | close(temp); | |
328 | } | |
329 | } | |
330 | } | |
331 | ||
332 | // Done with dictionary paths; release memory allocated by glob() | |
333 | globfree(&fileGlob); | |
334 | ||
335 | // Map the cache and build the dictionary | |
336 | if (U_SUCCESS(status)) { | |
337 | int cache = open(cacheTargetPath, O_RDONLY, 0); | |
338 | off_t length; | |
339 | const void *cacheData = (const void *) -1; | |
340 | if (cache == -1) { | |
341 | status = U_FILE_ACCESS_ERROR; | |
342 | } | |
343 | if (U_SUCCESS(status)) { | |
344 | length = lseek(cache, 0, SEEK_END); | |
345 | (void) lseek(cache, 0, SEEK_SET); | |
346 | if (length < 0 || length > PTRDIFF_MAX) { | |
347 | status = U_FILE_ACCESS_ERROR; | |
348 | } | |
349 | } | |
350 | ||
351 | // Map the cache. Note: it is left mapped until process exit. This is the normal | |
352 | // behavior anyway, so it shouldn't be an issue. | |
353 | if (U_SUCCESS(status)) { | |
354 | cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0); | |
355 | if ((intptr_t)cacheData == -1) { | |
356 | status = U_FILE_ACCESS_ERROR; | |
357 | } | |
358 | } | |
359 | // We can close the cache file now that it's mapped (or not) | |
360 | if (cache != -1) { | |
361 | (void) close(cache); | |
362 | } | |
363 | // If all was successful, try to create the dictionary. The constructor will | |
364 | // check the magic number for us. | |
365 | if (U_SUCCESS(status)) { | |
366 | cacheDict = new CompactTrieDictionary(cacheData, status); | |
367 | } | |
368 | if (U_FAILURE(status) && (intptr_t)cacheData != -1) { | |
369 | // Clean up the mmap | |
370 | (void) munmap((void *)cacheData, (size_t) length); | |
371 | } | |
372 | } | |
373 | ||
374 | // If we were successful, free the ICU dictionary and return ours | |
375 | if (U_SUCCESS(status)) { | |
51004dcb | 376 | delete icuDictMatcher; |
73c04bcf A |
377 | return cacheDict; |
378 | } | |
379 | else { | |
380 | delete cacheDict; | |
381 | } | |
382 | } | |
729e4ab9 | 383 | #endif |
51004dcb A |
384 | #endif |
385 | return icuDictMatcher; | |
73c04bcf A |
386 | } |
387 | ||
388 | U_NAMESPACE_END | |
389 | ||
4388f060 | 390 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED */ |