]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /** |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2007, International Business Machines Corporation, Apple Inc.,* | |
4 | * and others. All Rights Reserved. * | |
5 | ******************************************************************************* | |
6 | */ | |
7 | ||
8 | #define __STDC_LIMIT_MACROS 1 | |
9 | #include "unicode/utypes.h" | |
10 | ||
11 | #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN) | |
12 | ||
13 | #include "brkeng.h" | |
14 | #include "dictbe.h" | |
15 | #include "triedict.h" | |
16 | #include "aaplbfct.h" | |
17 | #include "unicode/uscript.h" | |
18 | #include "unicode/uniset.h" | |
19 | #include "unicode/ucnv.h" | |
20 | #include "unicode/uchar.h" | |
21 | #include <limits.h> | |
22 | #include <unistd.h> | |
23 | #include <glob.h> | |
24 | #include <strings.h> | |
25 | #include <NSSystemDirectories.h> | |
26 | #include <sys/types.h> | |
27 | #include <sys/stat.h> | |
28 | #include <sys/mman.h> | |
29 | #include <fcntl.h> | |
30 | #include <time.h> | |
31 | #include <stdio.h> | |
32 | #include <stdint.h> | |
33 | ||
34 | U_NAMESPACE_BEGIN | |
35 | ||
36 | /* | |
37 | ****************************************************************** | |
38 | */ | |
39 | ||
40 | AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status) | |
41 | : ICULanguageBreakFactory(status) | |
42 | { | |
43 | } | |
44 | ||
45 | AppleLanguageBreakFactory::~AppleLanguageBreakFactory() { | |
46 | } | |
47 | ||
48 | // Helper function that makes a length-delimited buffer look NUL-terminated | |
49 | static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) { | |
50 | if (l > 0) { | |
51 | l -= 1; | |
52 | return *p++; | |
53 | } | |
54 | else { | |
55 | return 0; | |
56 | } | |
57 | } | |
58 | ||
59 | // Add a file's worth of words to the supplied mutable dictionary | |
60 | static void addDictFile(MutableTrieDictionary *to, const char *path) { | |
61 | UErrorCode status = U_ZERO_ERROR; | |
62 | off_t fileLength; | |
63 | const char *dictRawData = (const char *) -1; | |
64 | const UChar *dictData = NULL; | |
65 | ptrdiff_t dictDataLength = 0; | |
66 | UChar *dictBuffer = NULL; | |
67 | const char *encoding = NULL; | |
68 | int32_t signatureLength = 0; | |
69 | ||
70 | // Open the dictionary file | |
71 | int dictFile = open(path, O_RDONLY, 0); | |
72 | if (dictFile == -1) { | |
73 | status = U_FILE_ACCESS_ERROR; | |
74 | } | |
75 | ||
76 | // Determine its length | |
77 | if (U_SUCCESS(status)) { | |
78 | fileLength = lseek(dictFile, 0, SEEK_END); | |
79 | (void) lseek(dictFile, 0, SEEK_SET); | |
80 | if (fileLength < 0 || fileLength > PTRDIFF_MAX) { | |
81 | status = U_FILE_ACCESS_ERROR; | |
82 | } | |
83 | } | |
84 | ||
85 | // Map it | |
86 | if (U_SUCCESS(status)) { | |
87 | dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0); | |
88 | if ((intptr_t)dictRawData == -1) { | |
89 | status = U_FILE_ACCESS_ERROR; | |
90 | } | |
91 | } | |
92 | ||
93 | // No longer need the file descriptor open | |
94 | if (dictFile != -1) { | |
95 | (void) close(dictFile); | |
96 | } | |
97 | ||
98 | // Look for a Unicode signature | |
99 | if (U_SUCCESS(status)) { | |
100 | encoding = ucnv_detectUnicodeSignature(dictRawData, fileLength, &signatureLength, &status); | |
101 | } | |
102 | ||
103 | // If necessary, convert the data to UChars | |
104 | if (U_SUCCESS(status) && encoding != NULL) { | |
105 | UConverter *conv = ucnv_open(encoding, &status); | |
106 | // Preflight to get buffer size | |
107 | uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, fileLength, &status); | |
108 | if (status == U_BUFFER_OVERFLOW_ERROR) { | |
109 | status = U_ZERO_ERROR; | |
110 | } | |
111 | if (U_SUCCESS(status)) { | |
112 | dictBuffer = new UChar[destCap+1]; | |
113 | } | |
114 | (void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, fileLength, &status); | |
115 | dictData = dictBuffer; | |
116 | dictDataLength = destCap; | |
117 | if (U_SUCCESS(status) && dictData[0] == 0xFEFF) { // BOM? Skip it | |
118 | dictData += 1; | |
119 | dictDataLength -= 1; | |
120 | } | |
121 | ||
122 | ucnv_close(conv); | |
123 | } | |
124 | ||
125 | // If it didn't need converting, just assume it's native-endian UTF-16, no BOM | |
126 | if (U_SUCCESS(status) && dictData == NULL) { | |
127 | dictData = (const UChar *) dictRawData; | |
128 | dictDataLength = fileLength/sizeof(UChar); | |
129 | } | |
130 | ||
131 | // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line, | |
132 | // stopping at the first space. | |
133 | if (U_SUCCESS(status)) { | |
134 | UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status); | |
135 | const UChar *candidate = dictData; | |
136 | int32_t length = 0; | |
137 | UChar uc = nextUChar(dictData, dictDataLength); | |
138 | while (U_SUCCESS(status) && uc) { | |
139 | while (uc && !u_isspace(uc)) { | |
140 | length += 1; | |
141 | uc = nextUChar(dictData, dictDataLength); | |
142 | } | |
143 | ||
144 | if (length > 0) { | |
145 | to->addWord(candidate, length, status); | |
146 | } | |
147 | ||
148 | // Find beginning of next line | |
149 | // 1. Skip non-line-break characters | |
150 | while (uc && !breaks.contains(uc)) { | |
151 | uc = nextUChar(dictData, dictDataLength); | |
152 | } | |
153 | // 2. Skip line break characters | |
154 | while (uc && breaks.contains(uc)) { | |
155 | uc = nextUChar(dictData, dictDataLength); | |
156 | } | |
157 | ||
158 | // Prepare for next line | |
159 | candidate = dictData-1; | |
160 | length = 0; | |
161 | } | |
162 | } | |
163 | ||
164 | // Unmap the file if we mapped it | |
165 | if ((intptr_t) dictRawData != -1) { | |
166 | (void) munmap((void *)dictRawData, (size_t) fileLength); | |
167 | } | |
168 | ||
169 | // Delete any temporary buffer | |
170 | delete [] dictBuffer; | |
171 | } | |
172 | ||
// Byte-order tag for this build: ".le" on little-endian targets, empty on
// big-endian ones.
#if !U_IS_BIG_ENDIAN
static const char sArchType[] = ".le";    // little endian
#else
static const char sArchType[] = "";
#endif
178 | ||
179 | const CompactTrieDictionary * | |
180 | AppleLanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t breakType) { | |
181 | const CompactTrieDictionary *icuDict = ICULanguageBreakFactory::loadDictionaryFor(script, breakType); | |
182 | // We only look for a user dictionary if there is actually an ICU dictionary | |
183 | if (icuDict != NULL) { | |
184 | UErrorCode status = U_ZERO_ERROR; | |
185 | const char *scriptName = uscript_getName(script); | |
186 | char path[256]; // PATH_MAX is overkill in this case | |
187 | char cachePath[128]; | |
188 | char cacheTargetPath[256]; | |
189 | glob_t dirGlob; | |
190 | glob_t fileGlob; | |
191 | struct stat cacheStat; | |
192 | struct stat dictStat; | |
193 | bool cacheGood = true; | |
194 | int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE); | |
195 | const CompactTrieDictionary *cacheDict = NULL; | |
196 | ||
197 | // Iterate the dictionary directories and accumulate in dirGlob | |
198 | NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask)); | |
199 | while (state = NSGetNextSearchPathEnumeration(state, path)) { | |
200 | // First get the directory itself. We should never overflow, but use strlcat anyway | |
201 | // to avoid a crash if we do. | |
202 | strlcat(path, "/Dictionaries", sizeof(path)); | |
203 | if (!glob(path, globFlags, NULL, &dirGlob)) { | |
204 | globFlags |= GLOB_APPEND; | |
205 | } | |
206 | } | |
207 | ||
208 | // If there are no Dictionaries directories, ignore any cache file and return the ICU | |
209 | // standard dictionary | |
210 | // TODO: Delete the cache? | |
211 | if (dirGlob.gl_pathc == 0) { | |
212 | globfree(&dirGlob); | |
213 | return icuDict; | |
214 | } | |
215 | ||
216 | // See if there is a cache file already; get its mod time | |
217 | // TODO: should we be using geteuid() here instead of getuid()? | |
218 | state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask); | |
219 | state = NSGetNextSearchPathEnumeration(state, cachePath); // Just use first one | |
220 | // Create the cache file name. We should never overflow, but use snprintf to avoid a crash | |
221 | // if we do. | |
222 | snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid()); | |
223 | if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) { | |
224 | cacheGood = false; // No file or bad permissions or type | |
225 | } | |
226 | ||
227 | // Stat the dictionary folders, and glob the dictionary files | |
228 | globFlags &= ~GLOB_APPEND; | |
229 | char **pathsp = dirGlob.gl_pathv; | |
230 | const char *dictpath; | |
231 | while (dictpath = *pathsp++) { | |
232 | // Stat the directory -- ignore if stat failure | |
233 | if (!stat(dictpath, &dictStat)) { | |
234 | // Glob the dictionaries in the directory | |
235 | snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName); | |
236 | if (!glob(path, globFlags, NULL, &fileGlob)) { | |
237 | globFlags |= GLOB_APPEND; | |
238 | } | |
239 | // If the directory has been modified after the cache file, we need to rebuild; | |
240 | // a dictionary might have been deleted. | |
241 | if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) { | |
242 | cacheGood = false; | |
243 | } | |
244 | } | |
245 | } | |
246 | ||
247 | // No longer need the directory glob | |
248 | globfree(&dirGlob); | |
249 | ||
250 | // If there are no dictionaries, ignore the cache file and return the ICU dictionary | |
251 | // TODO: Delete the cache? | |
252 | if (fileGlob.gl_pathc == 0) { | |
253 | globfree(&fileGlob); | |
254 | return icuDict; | |
255 | } | |
256 | ||
257 | // Now compare the last modified stamp for the cache against all the dictionaries | |
258 | pathsp = fileGlob.gl_pathv; | |
259 | while (cacheGood && (dictpath = *pathsp++)) { | |
260 | // Stat the dictionary -- ignore if stat failure | |
261 | if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) { | |
262 | cacheGood = false; | |
263 | } | |
264 | } | |
265 | ||
266 | // Do we need to build the dictionary cache? | |
267 | if (!cacheGood) { | |
268 | // Create a mutable dictionary from the ICU dictionary | |
269 | MutableTrieDictionary *sum = icuDict->cloneMutable(status); | |
270 | pathsp = fileGlob.gl_pathv; | |
271 | while (U_SUCCESS(status) && (dictpath = *pathsp++)) { | |
272 | // Add the contents of a file to the sum | |
273 | addDictFile(sum, dictpath); | |
274 | } | |
275 | ||
276 | // Create a compact (read-only) dictionary | |
277 | CompactTrieDictionary compact(*sum, status); | |
278 | delete sum; | |
279 | ||
280 | if (U_SUCCESS(status)) { | |
281 | // Open a temp file to write out the cache | |
282 | strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath)); | |
283 | int temp = mkstemp(cachePath); | |
284 | if (temp == -1) { | |
285 | status = U_FILE_ACCESS_ERROR; | |
286 | } | |
287 | size_t dictSize = compact.dataSize(); | |
288 | if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != dictSize) { | |
289 | status = U_FILE_ACCESS_ERROR; | |
290 | } | |
291 | // Rename the temp file to the cache. Note that race conditions here are | |
292 | // fine, as the file system operations are atomic. If an outdated version wins | |
293 | // over a newer version, it will get rebuilt at the next app launch due to the | |
294 | // modification time checks above. We don't care that any given app launch gets | |
295 | // the most up-to-date cache (impossible since we can't lock all the Dictionaries | |
296 | // directories), only that the cache (eventually) reflects the current state of | |
297 | // any user dictionaries. That will happen on the next app launch after changes | |
298 | // to the user dictionaries quiesce. | |
299 | if (U_SUCCESS(status)) { | |
300 | if (rename(cachePath, cacheTargetPath)) { | |
301 | status = U_FILE_ACCESS_ERROR; | |
302 | (void) unlink(cachePath); // Clean up the temp file | |
303 | } | |
304 | } | |
305 | if (temp != -1) { | |
306 | close(temp); | |
307 | } | |
308 | } | |
309 | } | |
310 | ||
311 | // Done with dictionary paths; release memory allocated by glob() | |
312 | globfree(&fileGlob); | |
313 | ||
314 | // Map the cache and build the dictionary | |
315 | if (U_SUCCESS(status)) { | |
316 | int cache = open(cacheTargetPath, O_RDONLY, 0); | |
317 | off_t length; | |
318 | const void *cacheData = (const void *) -1; | |
319 | if (cache == -1) { | |
320 | status = U_FILE_ACCESS_ERROR; | |
321 | } | |
322 | if (U_SUCCESS(status)) { | |
323 | length = lseek(cache, 0, SEEK_END); | |
324 | (void) lseek(cache, 0, SEEK_SET); | |
325 | if (length < 0 || length > PTRDIFF_MAX) { | |
326 | status = U_FILE_ACCESS_ERROR; | |
327 | } | |
328 | } | |
329 | ||
330 | // Map the cache. Note: it is left mapped until process exit. This is the normal | |
331 | // behavior anyway, so it shouldn't be an issue. | |
332 | if (U_SUCCESS(status)) { | |
333 | cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0); | |
334 | if ((intptr_t)cacheData == -1) { | |
335 | status = U_FILE_ACCESS_ERROR; | |
336 | } | |
337 | } | |
338 | // We can close the cache file now that it's mapped (or not) | |
339 | if (cache != -1) { | |
340 | (void) close(cache); | |
341 | } | |
342 | // If all was successful, try to create the dictionary. The constructor will | |
343 | // check the magic number for us. | |
344 | if (U_SUCCESS(status)) { | |
345 | cacheDict = new CompactTrieDictionary(cacheData, status); | |
346 | } | |
347 | if (U_FAILURE(status) && (intptr_t)cacheData != -1) { | |
348 | // Clean up the mmap | |
349 | (void) munmap((void *)cacheData, (size_t) length); | |
350 | } | |
351 | } | |
352 | ||
353 | // If we were successful, free the ICU dictionary and return ours | |
354 | if (U_SUCCESS(status)) { | |
355 | delete icuDict; | |
356 | return cacheDict; | |
357 | } | |
358 | else { | |
359 | delete cacheDict; | |
360 | } | |
361 | } | |
362 | return icuDict; | |
363 | } | |
364 | ||
365 | U_NAMESPACE_END | |
366 | ||
367 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN) */ |