]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/aaplbfct.cpp
ICU-461.13.tar.gz
[apple/icu.git] / icuSources / common / aaplbfct.cpp
1 /**
2 *******************************************************************************
3 * Copyright (C) 2007, International Business Machines Corporation, Apple Inc.,*
4 * and others. All Rights Reserved. *
5 *******************************************************************************
6 */
7
8 #define __STDC_LIMIT_MACROS 1
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN)
12
13 #include "brkeng.h"
14 #include "dictbe.h"
15 #include "triedict.h"
16 #include "aaplbfct.h"
17 #include "unicode/uscript.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uchar.h"
21 #include <limits.h>
22 #include <unistd.h>
23 #include <glob.h>
24 #include <strings.h>
25 #include <NSSystemDirectories.h>
26 #include <sys/types.h>
27 #include <sys/stat.h>
28 #include <sys/mman.h>
29 #include <fcntl.h>
30 #include <time.h>
31 #include <stdio.h>
32 #include <stdint.h>
33 #include <TargetConditionals.h>
34
35 U_NAMESPACE_BEGIN
36
37 /*
38 ******************************************************************
39 */
40
41 AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status)
42 : ICULanguageBreakFactory(status)
43 {
44 }
45
46 AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
47 }
48
49 #if !TARGET_OS_EMBEDDED
50
51 // Helper function that makes a length-delimited buffer look NUL-terminated
52 static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) {
53 if (l > 0) {
54 l -= 1;
55 return *p++;
56 }
57 else {
58 return 0;
59 }
60 }
61
62 // Add a file's worth of words to the supplied mutable dictionary
63 static void addDictFile(MutableTrieDictionary *to, const char *path) {
64 UErrorCode status = U_ZERO_ERROR;
65 off_t fileLength;
66 const char *dictRawData = (const char *) -1;
67 const UChar *dictData = NULL;
68 ptrdiff_t dictDataLength = 0;
69 UChar *dictBuffer = NULL;
70 const char *encoding = NULL;
71 int32_t signatureLength = 0;
72
73 // Open the dictionary file
74 int dictFile = open(path, O_RDONLY, 0);
75 if (dictFile == -1) {
76 status = U_FILE_ACCESS_ERROR;
77 }
78
79 // Determine its length
80 if (U_SUCCESS(status)) {
81 fileLength = lseek(dictFile, 0, SEEK_END);
82 (void) lseek(dictFile, 0, SEEK_SET);
83 if (fileLength < 0 || fileLength > PTRDIFF_MAX) {
84 status = U_FILE_ACCESS_ERROR;
85 }
86 }
87
88 // Map it
89 if (U_SUCCESS(status)) {
90 dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0);
91 if ((intptr_t)dictRawData == -1) {
92 status = U_FILE_ACCESS_ERROR;
93 }
94 }
95
96 // No longer need the file descriptor open
97 if (dictFile != -1) {
98 (void) close(dictFile);
99 }
100
101 // Look for a Unicode signature
102 if (U_SUCCESS(status)) {
103 encoding = ucnv_detectUnicodeSignature(dictRawData, fileLength, &signatureLength, &status);
104 }
105
106 // If necessary, convert the data to UChars
107 if (U_SUCCESS(status) && encoding != NULL) {
108 UConverter *conv = ucnv_open(encoding, &status);
109 // Preflight to get buffer size
110 uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, fileLength, &status);
111 if (status == U_BUFFER_OVERFLOW_ERROR) {
112 status = U_ZERO_ERROR;
113 }
114 if (U_SUCCESS(status)) {
115 dictBuffer = new UChar[destCap+1];
116 }
117 (void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, fileLength, &status);
118 dictData = dictBuffer;
119 dictDataLength = destCap;
120 if (U_SUCCESS(status) && dictData[0] == 0xFEFF) { // BOM? Skip it
121 dictData += 1;
122 dictDataLength -= 1;
123 }
124
125 ucnv_close(conv);
126 }
127
128 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM
129 if (U_SUCCESS(status) && dictData == NULL) {
130 dictData = (const UChar *) dictRawData;
131 dictDataLength = fileLength/sizeof(UChar);
132 }
133
134 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
135 // stopping at the first space.
136 if (U_SUCCESS(status)) {
137 UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status);
138 const UChar *candidate = dictData;
139 int32_t length = 0;
140 UChar uc = nextUChar(dictData, dictDataLength);
141 while (U_SUCCESS(status) && uc) {
142 while (uc && !u_isspace(uc)) {
143 length += 1;
144 uc = nextUChar(dictData, dictDataLength);
145 }
146
147 if (length > 0) {
148 to->addWord(candidate, length, status);
149 }
150
151 // Find beginning of next line
152 // 1. Skip non-line-break characters
153 while (uc && !breaks.contains(uc)) {
154 uc = nextUChar(dictData, dictDataLength);
155 }
156 // 2. Skip line break characters
157 while (uc && breaks.contains(uc)) {
158 uc = nextUChar(dictData, dictDataLength);
159 }
160
161 // Prepare for next line
162 candidate = dictData-1;
163 length = 0;
164 }
165 }
166
167 // Unmap the file if we mapped it
168 if ((intptr_t) dictRawData != -1) {
169 (void) munmap((void *)dictRawData, (size_t) fileLength);
170 }
171
172 // Delete any temporary buffer
173 delete [] dictBuffer;
174 }
175
// Byte-order tag inserted into the user dictionary cache file name:
// empty on big-endian builds, ".le" on little-endian builds.
static const char sArchType[] =
#if U_IS_BIG_ENDIAN
    "";
#else
    ".le";      // little endian
#endif
181
182 #endif
183
184 const CompactTrieDictionary *
185 AppleLanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t breakType) {
186 const CompactTrieDictionary *icuDict = ICULanguageBreakFactory::loadDictionaryFor(script, breakType);
187 #if !TARGET_OS_EMBEDDED
188 // We only look for a user dictionary if there is actually an ICU dictionary
189 if (icuDict != NULL) {
190 UErrorCode status = U_ZERO_ERROR;
191 const char *scriptName = uscript_getName(script);
192 char path[256]; // PATH_MAX is overkill in this case
193 char cachePath[128];
194 char cacheTargetPath[256];
195 glob_t dirGlob;
196 glob_t fileGlob;
197 struct stat cacheStat;
198 struct stat dictStat;
199 bool cacheGood = true;
200 int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE);
201 const CompactTrieDictionary *cacheDict = NULL;
202
203 // Iterate the dictionary directories and accumulate in dirGlob
204 NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask));
205 while (state = NSGetNextSearchPathEnumeration(state, path)) {
206 // First get the directory itself. We should never overflow, but use strlcat anyway
207 // to avoid a crash if we do.
208 strlcat(path, "/Dictionaries", sizeof(path));
209 if (!glob(path, globFlags, NULL, &dirGlob)) {
210 globFlags |= GLOB_APPEND;
211 }
212 }
213
214 // If there are no Dictionaries directories, ignore any cache file and return the ICU
215 // standard dictionary
216 // TODO: Delete the cache?
217 if (dirGlob.gl_pathc == 0) {
218 globfree(&dirGlob);
219 return icuDict;
220 }
221
222 // See if there is a cache file already; get its mod time
223 // TODO: should we be using geteuid() here instead of getuid()?
224 state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask);
225 state = NSGetNextSearchPathEnumeration(state, cachePath); // Just use first one
226 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash
227 // if we do.
228 snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid());
229 if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) {
230 cacheGood = false; // No file or bad permissions or type
231 }
232
233 // Stat the dictionary folders, and glob the dictionary files
234 globFlags &= ~GLOB_APPEND;
235 char **pathsp = dirGlob.gl_pathv;
236 const char *dictpath;
237 while (dictpath = *pathsp++) {
238 // Stat the directory -- ignore if stat failure
239 if (!stat(dictpath, &dictStat)) {
240 // Glob the dictionaries in the directory
241 snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName);
242 if (!glob(path, globFlags, NULL, &fileGlob)) {
243 globFlags |= GLOB_APPEND;
244 }
245 // If the directory has been modified after the cache file, we need to rebuild;
246 // a dictionary might have been deleted.
247 if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
248 cacheGood = false;
249 }
250 }
251 }
252
253 // No longer need the directory glob
254 globfree(&dirGlob);
255
256 // If there are no dictionaries, ignore the cache file and return the ICU dictionary
257 // TODO: Delete the cache?
258 if (fileGlob.gl_pathc == 0) {
259 globfree(&fileGlob);
260 return icuDict;
261 }
262
263 // Now compare the last modified stamp for the cache against all the dictionaries
264 pathsp = fileGlob.gl_pathv;
265 while (cacheGood && (dictpath = *pathsp++)) {
266 // Stat the dictionary -- ignore if stat failure
267 if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
268 cacheGood = false;
269 }
270 }
271
272 // Do we need to build the dictionary cache?
273 if (!cacheGood) {
274 // Create a mutable dictionary from the ICU dictionary
275 MutableTrieDictionary *sum = icuDict->cloneMutable(status);
276 pathsp = fileGlob.gl_pathv;
277 while (U_SUCCESS(status) && (dictpath = *pathsp++)) {
278 // Add the contents of a file to the sum
279 addDictFile(sum, dictpath);
280 }
281
282 // Create a compact (read-only) dictionary
283 CompactTrieDictionary compact(*sum, status);
284 delete sum;
285
286 if (U_SUCCESS(status)) {
287 // Open a temp file to write out the cache
288 strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath));
289 int temp = mkstemp(cachePath);
290 if (temp == -1) {
291 status = U_FILE_ACCESS_ERROR;
292 }
293 size_t dictSize = compact.dataSize();
294 if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != dictSize) {
295 status = U_FILE_ACCESS_ERROR;
296 }
297 // Rename the temp file to the cache. Note that race conditions here are
298 // fine, as the file system operations are atomic. If an outdated version wins
299 // over a newer version, it will get rebuilt at the next app launch due to the
300 // modification time checks above. We don't care that any given app launch gets
301 // the most up-to-date cache (impossible since we can't lock all the Dictionaries
302 // directories), only that the cache (eventually) reflects the current state of
303 // any user dictionaries. That will happen on the next app launch after changes
304 // to the user dictionaries quiesce.
305 if (U_SUCCESS(status)) {
306 if (rename(cachePath, cacheTargetPath)) {
307 status = U_FILE_ACCESS_ERROR;
308 (void) unlink(cachePath); // Clean up the temp file
309 }
310 }
311 if (temp != -1) {
312 close(temp);
313 }
314 }
315 }
316
317 // Done with dictionary paths; release memory allocated by glob()
318 globfree(&fileGlob);
319
320 // Map the cache and build the dictionary
321 if (U_SUCCESS(status)) {
322 int cache = open(cacheTargetPath, O_RDONLY, 0);
323 off_t length;
324 const void *cacheData = (const void *) -1;
325 if (cache == -1) {
326 status = U_FILE_ACCESS_ERROR;
327 }
328 if (U_SUCCESS(status)) {
329 length = lseek(cache, 0, SEEK_END);
330 (void) lseek(cache, 0, SEEK_SET);
331 if (length < 0 || length > PTRDIFF_MAX) {
332 status = U_FILE_ACCESS_ERROR;
333 }
334 }
335
336 // Map the cache. Note: it is left mapped until process exit. This is the normal
337 // behavior anyway, so it shouldn't be an issue.
338 if (U_SUCCESS(status)) {
339 cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0);
340 if ((intptr_t)cacheData == -1) {
341 status = U_FILE_ACCESS_ERROR;
342 }
343 }
344 // We can close the cache file now that it's mapped (or not)
345 if (cache != -1) {
346 (void) close(cache);
347 }
348 // If all was successful, try to create the dictionary. The constructor will
349 // check the magic number for us.
350 if (U_SUCCESS(status)) {
351 cacheDict = new CompactTrieDictionary(cacheData, status);
352 }
353 if (U_FAILURE(status) && (intptr_t)cacheData != -1) {
354 // Clean up the mmap
355 (void) munmap((void *)cacheData, (size_t) length);
356 }
357 }
358
359 // If we were successful, free the ICU dictionary and return ours
360 if (U_SUCCESS(status)) {
361 delete icuDict;
362 return cacheDict;
363 }
364 else {
365 delete cacheDict;
366 }
367 }
368 #endif
369 return icuDict;
370 }
371
372 U_NAMESPACE_END
373
374 #endif /* #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN) */