]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/aaplbfct.cpp
ICU-8.11.2.tar.gz
[apple/icu.git] / icuSources / common / aaplbfct.cpp
1 /**
2 *******************************************************************************
3 * Copyright (C) 2007, International Business Machines Corporation, Apple Inc.,*
4 * and others. All Rights Reserved. *
5 *******************************************************************************
6 */
7
8 #define __STDC_LIMIT_MACROS 1
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN)
12
13 #include "brkeng.h"
14 #include "dictbe.h"
15 #include "triedict.h"
16 #include "aaplbfct.h"
17 #include "unicode/uscript.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uchar.h"
21 #include <limits.h>
22 #include <unistd.h>
23 #include <glob.h>
24 #include <strings.h>
25 #include <NSSystemDirectories.h>
26 #include <sys/types.h>
27 #include <sys/stat.h>
28 #include <sys/mman.h>
29 #include <fcntl.h>
30 #include <time.h>
31 #include <stdio.h>
32 #include <stdint.h>
33
34 U_NAMESPACE_BEGIN
35
36 /*
37 ******************************************************************
38 */
39
40 AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status)
41 : ICULanguageBreakFactory(status)
42 {
43 }
44
45 AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
46 }
47
48 // Helper function that makes a length-delimited buffer look NUL-terminated
49 static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) {
50 if (l > 0) {
51 l -= 1;
52 return *p++;
53 }
54 else {
55 return 0;
56 }
57 }
58
59 // Add a file's worth of words to the supplied mutable dictionary
60 static void addDictFile(MutableTrieDictionary *to, const char *path) {
61 UErrorCode status = U_ZERO_ERROR;
62 off_t fileLength;
63 const char *dictRawData = (const char *) -1;
64 const UChar *dictData = NULL;
65 ptrdiff_t dictDataLength = 0;
66 UChar *dictBuffer = NULL;
67 const char *encoding = NULL;
68 int32_t signatureLength = 0;
69
70 // Open the dictionary file
71 int dictFile = open(path, O_RDONLY, 0);
72 if (dictFile == -1) {
73 status = U_FILE_ACCESS_ERROR;
74 }
75
76 // Determine its length
77 if (U_SUCCESS(status)) {
78 fileLength = lseek(dictFile, 0, SEEK_END);
79 (void) lseek(dictFile, 0, SEEK_SET);
80 if (fileLength < 0 || fileLength > PTRDIFF_MAX) {
81 status = U_FILE_ACCESS_ERROR;
82 }
83 }
84
85 // Map it
86 if (U_SUCCESS(status)) {
87 dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0);
88 if ((intptr_t)dictRawData == -1) {
89 status = U_FILE_ACCESS_ERROR;
90 }
91 }
92
93 // No longer need the file descriptor open
94 if (dictFile != -1) {
95 (void) close(dictFile);
96 }
97
98 // Look for a Unicode signature
99 if (U_SUCCESS(status)) {
100 encoding = ucnv_detectUnicodeSignature(dictRawData, fileLength, &signatureLength, &status);
101 }
102
103 // If necessary, convert the data to UChars
104 if (U_SUCCESS(status) && encoding != NULL) {
105 UConverter *conv = ucnv_open(encoding, &status);
106 // Preflight to get buffer size
107 uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, fileLength, &status);
108 if (status == U_BUFFER_OVERFLOW_ERROR) {
109 status = U_ZERO_ERROR;
110 }
111 if (U_SUCCESS(status)) {
112 dictBuffer = new UChar[destCap+1];
113 }
114 (void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, fileLength, &status);
115 dictData = dictBuffer;
116 dictDataLength = destCap;
117 if (U_SUCCESS(status) && dictData[0] == 0xFEFF) { // BOM? Skip it
118 dictData += 1;
119 dictDataLength -= 1;
120 }
121
122 ucnv_close(conv);
123 }
124
125 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM
126 if (U_SUCCESS(status) && dictData == NULL) {
127 dictData = (const UChar *) dictRawData;
128 dictDataLength = fileLength/sizeof(UChar);
129 }
130
131 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
132 // stopping at the first space.
133 if (U_SUCCESS(status)) {
134 UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status);
135 const UChar *candidate = dictData;
136 int32_t length = 0;
137 UChar uc = nextUChar(dictData, dictDataLength);
138 while (U_SUCCESS(status) && uc) {
139 while (uc && !u_isspace(uc)) {
140 length += 1;
141 uc = nextUChar(dictData, dictDataLength);
142 }
143
144 if (length > 0) {
145 to->addWord(candidate, length, status);
146 }
147
148 // Find beginning of next line
149 // 1. Skip non-line-break characters
150 while (uc && !breaks.contains(uc)) {
151 uc = nextUChar(dictData, dictDataLength);
152 }
153 // 2. Skip line break characters
154 while (uc && breaks.contains(uc)) {
155 uc = nextUChar(dictData, dictDataLength);
156 }
157
158 // Prepare for next line
159 candidate = dictData-1;
160 length = 0;
161 }
162 }
163
164 // Unmap the file if we mapped it
165 if ((intptr_t) dictRawData != -1) {
166 (void) munmap((void *)dictRawData, (size_t) fileLength);
167 }
168
169 // Delete any temporary buffer
170 delete [] dictBuffer;
171 }
172
// Suffix inserted into the cache file name so that caches built for different
// byte orders get different names: empty for big endian, ".le" for little endian.
static const char sArchType[] =
#if U_IS_BIG_ENDIAN
    "";
#else
    ".le"; // little endian
#endif
178
179 const CompactTrieDictionary *
180 AppleLanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t breakType) {
181 const CompactTrieDictionary *icuDict = ICULanguageBreakFactory::loadDictionaryFor(script, breakType);
182 // We only look for a user dictionary if there is actually an ICU dictionary
183 if (icuDict != NULL) {
184 UErrorCode status = U_ZERO_ERROR;
185 const char *scriptName = uscript_getName(script);
186 char path[256]; // PATH_MAX is overkill in this case
187 char cachePath[128];
188 char cacheTargetPath[256];
189 glob_t dirGlob;
190 glob_t fileGlob;
191 struct stat cacheStat;
192 struct stat dictStat;
193 bool cacheGood = true;
194 int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE);
195 const CompactTrieDictionary *cacheDict = NULL;
196
197 // Iterate the dictionary directories and accumulate in dirGlob
198 NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask));
199 while (state = NSGetNextSearchPathEnumeration(state, path)) {
200 // First get the directory itself. We should never overflow, but use strlcat anyway
201 // to avoid a crash if we do.
202 strlcat(path, "/Dictionaries", sizeof(path));
203 if (!glob(path, globFlags, NULL, &dirGlob)) {
204 globFlags |= GLOB_APPEND;
205 }
206 }
207
208 // If there are no Dictionaries directories, ignore any cache file and return the ICU
209 // standard dictionary
210 // TODO: Delete the cache?
211 if (dirGlob.gl_pathc == 0) {
212 globfree(&dirGlob);
213 return icuDict;
214 }
215
216 // See if there is a cache file already; get its mod time
217 // TODO: should we be using geteuid() here instead of getuid()?
218 state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask);
219 state = NSGetNextSearchPathEnumeration(state, cachePath); // Just use first one
220 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash
221 // if we do.
222 snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid());
223 if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) {
224 cacheGood = false; // No file or bad permissions or type
225 }
226
227 // Stat the dictionary folders, and glob the dictionary files
228 globFlags &= ~GLOB_APPEND;
229 char **pathsp = dirGlob.gl_pathv;
230 const char *dictpath;
231 while (dictpath = *pathsp++) {
232 // Stat the directory -- ignore if stat failure
233 if (!stat(dictpath, &dictStat)) {
234 // Glob the dictionaries in the directory
235 snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName);
236 if (!glob(path, globFlags, NULL, &fileGlob)) {
237 globFlags |= GLOB_APPEND;
238 }
239 // If the directory has been modified after the cache file, we need to rebuild;
240 // a dictionary might have been deleted.
241 if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
242 cacheGood = false;
243 }
244 }
245 }
246
247 // No longer need the directory glob
248 globfree(&dirGlob);
249
250 // If there are no dictionaries, ignore the cache file and return the ICU dictionary
251 // TODO: Delete the cache?
252 if (fileGlob.gl_pathc == 0) {
253 globfree(&fileGlob);
254 return icuDict;
255 }
256
257 // Now compare the last modified stamp for the cache against all the dictionaries
258 pathsp = fileGlob.gl_pathv;
259 while (cacheGood && (dictpath = *pathsp++)) {
260 // Stat the dictionary -- ignore if stat failure
261 if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
262 cacheGood = false;
263 }
264 }
265
266 // Do we need to build the dictionary cache?
267 if (!cacheGood) {
268 // Create a mutable dictionary from the ICU dictionary
269 MutableTrieDictionary *sum = icuDict->cloneMutable(status);
270 pathsp = fileGlob.gl_pathv;
271 while (U_SUCCESS(status) && (dictpath = *pathsp++)) {
272 // Add the contents of a file to the sum
273 addDictFile(sum, dictpath);
274 }
275
276 // Create a compact (read-only) dictionary
277 CompactTrieDictionary compact(*sum, status);
278 delete sum;
279
280 if (U_SUCCESS(status)) {
281 // Open a temp file to write out the cache
282 strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath));
283 int temp = mkstemp(cachePath);
284 if (temp == -1) {
285 status = U_FILE_ACCESS_ERROR;
286 }
287 size_t dictSize = compact.dataSize();
288 if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != dictSize) {
289 status = U_FILE_ACCESS_ERROR;
290 }
291 // Rename the temp file to the cache. Note that race conditions here are
292 // fine, as the file system operations are atomic. If an outdated version wins
293 // over a newer version, it will get rebuilt at the next app launch due to the
294 // modification time checks above. We don't care that any given app launch gets
295 // the most up-to-date cache (impossible since we can't lock all the Dictionaries
296 // directories), only that the cache (eventually) reflects the current state of
297 // any user dictionaries. That will happen on the next app launch after changes
298 // to the user dictionaries quiesce.
299 if (U_SUCCESS(status)) {
300 if (rename(cachePath, cacheTargetPath)) {
301 status = U_FILE_ACCESS_ERROR;
302 (void) unlink(cachePath); // Clean up the temp file
303 }
304 }
305 if (temp != -1) {
306 close(temp);
307 }
308 }
309 }
310
311 // Done with dictionary paths; release memory allocated by glob()
312 globfree(&fileGlob);
313
314 // Map the cache and build the dictionary
315 if (U_SUCCESS(status)) {
316 int cache = open(cacheTargetPath, O_RDONLY, 0);
317 off_t length;
318 const void *cacheData = (const void *) -1;
319 if (cache == -1) {
320 status = U_FILE_ACCESS_ERROR;
321 }
322 if (U_SUCCESS(status)) {
323 length = lseek(cache, 0, SEEK_END);
324 (void) lseek(cache, 0, SEEK_SET);
325 if (length < 0 || length > PTRDIFF_MAX) {
326 status = U_FILE_ACCESS_ERROR;
327 }
328 }
329
330 // Map the cache. Note: it is left mapped until process exit. This is the normal
331 // behavior anyway, so it shouldn't be an issue.
332 if (U_SUCCESS(status)) {
333 cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0);
334 if ((intptr_t)cacheData == -1) {
335 status = U_FILE_ACCESS_ERROR;
336 }
337 }
338 // We can close the cache file now that it's mapped (or not)
339 if (cache != -1) {
340 (void) close(cache);
341 }
342 // If all was successful, try to create the dictionary. The constructor will
343 // check the magic number for us.
344 if (U_SUCCESS(status)) {
345 cacheDict = new CompactTrieDictionary(cacheData, status);
346 }
347 if (U_FAILURE(status) && (intptr_t)cacheData != -1) {
348 // Clean up the mmap
349 (void) munmap((void *)cacheData, (size_t) length);
350 }
351 }
352
353 // If we were successful, free the ICU dictionary and return ours
354 if (U_SUCCESS(status)) {
355 delete icuDict;
356 return cacheDict;
357 }
358 else {
359 delete cacheDict;
360 }
361 }
362 return icuDict;
363 }
364
365 U_NAMESPACE_END
366
367 #endif /* #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN) */