]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/aaplbfct.cpp
ICU-491.11.2.tar.gz
[apple/icu.git] / icuSources / common / aaplbfct.cpp
1 /**
2 *******************************************************************************
3 * Copyright (C) 2007, International Business Machines Corporation, Apple Inc.,*
4 * and others. All Rights Reserved. *
5 *******************************************************************************
6 */
7
8 #define __STDC_LIMIT_MACROS 1
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED
12
13 #include "brkeng.h"
14 #include "dictbe.h"
15 #include "triedict.h"
16 #include "aaplbfct.h"
17 #include "unicode/uscript.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uchar.h"
21 #include <limits.h>
22 #include <unistd.h>
23 #include <glob.h>
24 #include <strings.h>
25 #include <NSSystemDirectories.h>
26 #include <sys/types.h>
27 #include <sys/stat.h>
28 #include <sys/mman.h>
29 #include <fcntl.h>
30 #include <time.h>
31 #include <stdio.h>
32 #include <stdint.h>
33 // The following is now already included by platform.h (included indirectly by
34 // utypes.h) if U_PLATFORM_IS_DARWIN_BASED but it doesn't hurt to re-include here
35 #include <TargetConditionals.h>
36
37 U_NAMESPACE_BEGIN
38
39 /*
40 ******************************************************************
41 */
42
43 AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status)
44 : ICULanguageBreakFactory(status)
45 {
46 }
47
48 AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
49 }
50
51 #if !TARGET_OS_EMBEDDED
52
53 // Helper function that makes a length-delimited buffer look NUL-terminated
54 static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) {
55 if (l > 0) {
56 l -= 1;
57 return *p++;
58 }
59 else {
60 return 0;
61 }
62 }
63
64 // Add a file's worth of words to the supplied mutable dictionary
65 static void addDictFile(MutableTrieDictionary *to, const char *path) {
66 UErrorCode status = U_ZERO_ERROR;
67 off_t fileLength;
68 const char *dictRawData = (const char *) -1;
69 const UChar *dictData = NULL;
70 ptrdiff_t dictDataLength = 0;
71 UChar *dictBuffer = NULL;
72 const char *encoding = NULL;
73 int32_t signatureLength = 0;
74
75 // Open the dictionary file
76 int dictFile = open(path, O_RDONLY, 0);
77 if (dictFile == -1) {
78 status = U_FILE_ACCESS_ERROR;
79 }
80
81 // Determine its length
82 if (U_SUCCESS(status)) {
83 fileLength = lseek(dictFile, 0, SEEK_END);
84 (void) lseek(dictFile, 0, SEEK_SET);
85 if (fileLength < 0 || fileLength > PTRDIFF_MAX) {
86 status = U_FILE_ACCESS_ERROR;
87 }
88 }
89
90 // Map it
91 if (U_SUCCESS(status)) {
92 dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0);
93 if ((intptr_t)dictRawData == -1) {
94 status = U_FILE_ACCESS_ERROR;
95 }
96 }
97
98 // No longer need the file descriptor open
99 if (dictFile != -1) {
100 (void) close(dictFile);
101 }
102
103 // Look for a Unicode signature
104 if (U_SUCCESS(status)) {
105 encoding = ucnv_detectUnicodeSignature(dictRawData, fileLength, &signatureLength, &status);
106 }
107
108 // If necessary, convert the data to UChars
109 if (U_SUCCESS(status) && encoding != NULL) {
110 UConverter *conv = ucnv_open(encoding, &status);
111 // Preflight to get buffer size
112 uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, fileLength, &status);
113 if (status == U_BUFFER_OVERFLOW_ERROR) {
114 status = U_ZERO_ERROR;
115 }
116 if (U_SUCCESS(status)) {
117 dictBuffer = new UChar[destCap+1];
118 }
119 (void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, fileLength, &status);
120 dictData = dictBuffer;
121 dictDataLength = destCap;
122 if (U_SUCCESS(status) && dictData[0] == 0xFEFF) { // BOM? Skip it
123 dictData += 1;
124 dictDataLength -= 1;
125 }
126
127 ucnv_close(conv);
128 }
129
130 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM
131 if (U_SUCCESS(status) && dictData == NULL) {
132 dictData = (const UChar *) dictRawData;
133 dictDataLength = fileLength/sizeof(UChar);
134 }
135
136 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
137 // stopping at the first space.
138 if (U_SUCCESS(status)) {
139 UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status);
140 const UChar *candidate = dictData;
141 int32_t length = 0;
142 UChar uc = nextUChar(dictData, dictDataLength);
143 while (U_SUCCESS(status) && uc) {
144 while (uc && !u_isspace(uc)) {
145 length += 1;
146 uc = nextUChar(dictData, dictDataLength);
147 }
148
149 if (length > 0) {
150 to->addWord(candidate, length, status);
151 }
152
153 // Find beginning of next line
154 // 1. Skip non-line-break characters
155 while (uc && !breaks.contains(uc)) {
156 uc = nextUChar(dictData, dictDataLength);
157 }
158 // 2. Skip line break characters
159 while (uc && breaks.contains(uc)) {
160 uc = nextUChar(dictData, dictDataLength);
161 }
162
163 // Prepare for next line
164 candidate = dictData-1;
165 length = 0;
166 }
167 }
168
169 // Unmap the file if we mapped it
170 if ((intptr_t) dictRawData != -1) {
171 (void) munmap((void *)dictRawData, (size_t) fileLength);
172 }
173
174 // Delete any temporary buffer
175 delete [] dictBuffer;
176 }
177
// Architecture tag embedded in the cache file name: empty when
// U_IS_BIG_ENDIAN is set, ".le" (little endian) otherwise.
static const char sArchType[] =
#if U_IS_BIG_ENDIAN
    "";
#else
    ".le";
#endif
183
184 #endif
185
186 const CompactTrieDictionary *
187 AppleLanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t breakType) {
188 const CompactTrieDictionary *icuDict = ICULanguageBreakFactory::loadDictionaryFor(script, breakType);
189 #if !TARGET_OS_EMBEDDED
190 // We only look for a user dictionary if there is actually an ICU dictionary
191 if (icuDict != NULL) {
192 UErrorCode status = U_ZERO_ERROR;
193 const char *scriptName = uscript_getName(script);
194 char path[256]; // PATH_MAX is overkill in this case
195 char cachePath[128];
196 char cacheTargetPath[256];
197 glob_t dirGlob;
198 glob_t fileGlob;
199 struct stat cacheStat;
200 struct stat dictStat;
201 bool cacheGood = true;
202 int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE);
203 const CompactTrieDictionary *cacheDict = NULL;
204
205 // Iterate the dictionary directories and accumulate in dirGlob
206 NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask));
207 while ((state = NSGetNextSearchPathEnumeration(state, path)) != 0) {
208 // First get the directory itself. We should never overflow, but use strlcat anyway
209 // to avoid a crash if we do.
210 strlcat(path, "/Dictionaries", sizeof(path));
211 if (!glob(path, globFlags, NULL, &dirGlob)) {
212 globFlags |= GLOB_APPEND;
213 }
214 }
215
216 // If there are no Dictionaries directories, ignore any cache file and return the ICU
217 // standard dictionary
218 // TODO: Delete the cache?
219 if (dirGlob.gl_pathc == 0) {
220 globfree(&dirGlob);
221 return icuDict;
222 }
223
224 // See if there is a cache file already; get its mod time
225 // TODO: should we be using geteuid() here instead of getuid()?
226 state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask);
227 state = NSGetNextSearchPathEnumeration(state, cachePath); // Just use first one
228 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash
229 // if we do.
230 snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid());
231 if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) {
232 cacheGood = false; // No file or bad permissions or type
233 }
234
235 // Stat the dictionary folders, and glob the dictionary files
236 globFlags &= ~GLOB_APPEND;
237 char **pathsp = dirGlob.gl_pathv;
238 const char *dictpath;
239 while ((dictpath = *pathsp++) != NULL) {
240 // Stat the directory -- ignore if stat failure
241 if (!stat(dictpath, &dictStat)) {
242 // Glob the dictionaries in the directory
243 snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName);
244 if (!glob(path, globFlags, NULL, &fileGlob)) {
245 globFlags |= GLOB_APPEND;
246 }
247 // If the directory has been modified after the cache file, we need to rebuild;
248 // a dictionary might have been deleted.
249 if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
250 cacheGood = false;
251 }
252 }
253 }
254
255 // No longer need the directory glob
256 globfree(&dirGlob);
257
258 // If there are no dictionaries, ignore the cache file and return the ICU dictionary
259 // TODO: Delete the cache?
260 if (fileGlob.gl_pathc == 0) {
261 globfree(&fileGlob);
262 return icuDict;
263 }
264
265 // Now compare the last modified stamp for the cache against all the dictionaries
266 pathsp = fileGlob.gl_pathv;
267 while (cacheGood && (dictpath = *pathsp++)) {
268 // Stat the dictionary -- ignore if stat failure
269 if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
270 cacheGood = false;
271 }
272 }
273
274 // Do we need to build the dictionary cache?
275 if (!cacheGood) {
276 // Create a mutable dictionary from the ICU dictionary
277 MutableTrieDictionary *sum = icuDict->cloneMutable(status);
278 pathsp = fileGlob.gl_pathv;
279 while (U_SUCCESS(status) && (dictpath = *pathsp++)) {
280 // Add the contents of a file to the sum
281 addDictFile(sum, dictpath);
282 }
283
284 // Create a compact (read-only) dictionary
285 CompactTrieDictionary compact(*sum, status);
286 delete sum;
287
288 if (U_SUCCESS(status)) {
289 // Open a temp file to write out the cache
290 strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath));
291 int temp = mkstemp(cachePath);
292 if (temp == -1) {
293 status = U_FILE_ACCESS_ERROR;
294 }
295 size_t dictSize = compact.dataSize();
296 if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != dictSize) {
297 status = U_FILE_ACCESS_ERROR;
298 }
299 // Rename the temp file to the cache. Note that race conditions here are
300 // fine, as the file system operations are atomic. If an outdated version wins
301 // over a newer version, it will get rebuilt at the next app launch due to the
302 // modification time checks above. We don't care that any given app launch gets
303 // the most up-to-date cache (impossible since we can't lock all the Dictionaries
304 // directories), only that the cache (eventually) reflects the current state of
305 // any user dictionaries. That will happen on the next app launch after changes
306 // to the user dictionaries quiesce.
307 if (U_SUCCESS(status)) {
308 if (rename(cachePath, cacheTargetPath)) {
309 status = U_FILE_ACCESS_ERROR;
310 (void) unlink(cachePath); // Clean up the temp file
311 }
312 }
313 if (temp != -1) {
314 close(temp);
315 }
316 }
317 }
318
319 // Done with dictionary paths; release memory allocated by glob()
320 globfree(&fileGlob);
321
322 // Map the cache and build the dictionary
323 if (U_SUCCESS(status)) {
324 int cache = open(cacheTargetPath, O_RDONLY, 0);
325 off_t length;
326 const void *cacheData = (const void *) -1;
327 if (cache == -1) {
328 status = U_FILE_ACCESS_ERROR;
329 }
330 if (U_SUCCESS(status)) {
331 length = lseek(cache, 0, SEEK_END);
332 (void) lseek(cache, 0, SEEK_SET);
333 if (length < 0 || length > PTRDIFF_MAX) {
334 status = U_FILE_ACCESS_ERROR;
335 }
336 }
337
338 // Map the cache. Note: it is left mapped until process exit. This is the normal
339 // behavior anyway, so it shouldn't be an issue.
340 if (U_SUCCESS(status)) {
341 cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0);
342 if ((intptr_t)cacheData == -1) {
343 status = U_FILE_ACCESS_ERROR;
344 }
345 }
346 // We can close the cache file now that it's mapped (or not)
347 if (cache != -1) {
348 (void) close(cache);
349 }
350 // If all was successful, try to create the dictionary. The constructor will
351 // check the magic number for us.
352 if (U_SUCCESS(status)) {
353 cacheDict = new CompactTrieDictionary(cacheData, status);
354 }
355 if (U_FAILURE(status) && (intptr_t)cacheData != -1) {
356 // Clean up the mmap
357 (void) munmap((void *)cacheData, (size_t) length);
358 }
359 }
360
361 // If we were successful, free the ICU dictionary and return ours
362 if (U_SUCCESS(status)) {
363 delete icuDict;
364 return cacheDict;
365 }
366 else {
367 delete cacheDict;
368 }
369 }
370 #endif
371 return icuDict;
372 }
373
374 U_NAMESPACE_END
375
376 #endif /* #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED */