]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/aaplbfct.cpp
ICU-57165.0.1.tar.gz
[apple/icu.git] / icuSources / common / aaplbfct.cpp
CommitLineData
73c04bcf
A
/**
 *******************************************************************************
 * Copyright (C) 2007,2012 International Business Machines Corporation, Apple Inc.,*
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
7
8#define __STDC_LIMIT_MACROS 1
9#include "unicode/utypes.h"
10
4388f060 11#if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED
73c04bcf
A
12
13#include "brkeng.h"
14#include "dictbe.h"
73c04bcf
A
15#include "aaplbfct.h"
16#include "unicode/uscript.h"
17#include "unicode/uniset.h"
18#include "unicode/ucnv.h"
19#include "unicode/uchar.h"
20#include <limits.h>
21#include <unistd.h>
22#include <glob.h>
23#include <strings.h>
24#include <NSSystemDirectories.h>
25#include <sys/types.h>
26#include <sys/stat.h>
27#include <sys/mman.h>
28#include <fcntl.h>
29#include <time.h>
30#include <stdio.h>
31#include <stdint.h>
4388f060
A
32// The following is now already included by platform.h (included indirectly by
33// utypes.h) if U_PLATFORM_IS_DARWIN_BASED but it doesn't hurt to re-include here
729e4ab9 34#include <TargetConditionals.h>
73c04bcf
A
35
36U_NAMESPACE_BEGIN
37
38/*
39 ******************************************************************
40 */
41
42AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status)
43: ICULanguageBreakFactory(status)
44{
45}
46
47AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
48}
49
729e4ab9 50#if !TARGET_OS_EMBEDDED
51004dcb
A
51#if 0
52// need to update loadDictionaryMatcherFor implementation below
729e4ab9 53
73c04bcf
A
54// Helper function that makes a length-delimited buffer look NUL-terminated
55static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) {
56 if (l > 0) {
57 l -= 1;
58 return *p++;
59 }
60 else {
61 return 0;
62 }
63}
64
65// Add a file's worth of words to the supplied mutable dictionary
66static void addDictFile(MutableTrieDictionary *to, const char *path) {
67 UErrorCode status = U_ZERO_ERROR;
68 off_t fileLength;
69 const char *dictRawData = (const char *) -1;
70 const UChar *dictData = NULL;
71 ptrdiff_t dictDataLength = 0;
72 UChar *dictBuffer = NULL;
73 const char *encoding = NULL;
74 int32_t signatureLength = 0;
75
76 // Open the dictionary file
77 int dictFile = open(path, O_RDONLY, 0);
78 if (dictFile == -1) {
79 status = U_FILE_ACCESS_ERROR;
80 }
81
82 // Determine its length
83 if (U_SUCCESS(status)) {
84 fileLength = lseek(dictFile, 0, SEEK_END);
85 (void) lseek(dictFile, 0, SEEK_SET);
86 if (fileLength < 0 || fileLength > PTRDIFF_MAX) {
87 status = U_FILE_ACCESS_ERROR;
88 }
89 }
90
91 // Map it
92 if (U_SUCCESS(status)) {
93 dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0);
94 if ((intptr_t)dictRawData == -1) {
95 status = U_FILE_ACCESS_ERROR;
96 }
97 }
98
99 // No longer need the file descriptor open
100 if (dictFile != -1) {
101 (void) close(dictFile);
102 }
103
104 // Look for a Unicode signature
105 if (U_SUCCESS(status)) {
106 encoding = ucnv_detectUnicodeSignature(dictRawData, fileLength, &signatureLength, &status);
107 }
108
109 // If necessary, convert the data to UChars
110 if (U_SUCCESS(status) && encoding != NULL) {
111 UConverter *conv = ucnv_open(encoding, &status);
112 // Preflight to get buffer size
113 uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, fileLength, &status);
114 if (status == U_BUFFER_OVERFLOW_ERROR) {
115 status = U_ZERO_ERROR;
116 }
117 if (U_SUCCESS(status)) {
118 dictBuffer = new UChar[destCap+1];
119 }
120 (void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, fileLength, &status);
121 dictData = dictBuffer;
122 dictDataLength = destCap;
123 if (U_SUCCESS(status) && dictData[0] == 0xFEFF) { // BOM? Skip it
124 dictData += 1;
125 dictDataLength -= 1;
126 }
127
128 ucnv_close(conv);
129 }
130
131 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM
132 if (U_SUCCESS(status) && dictData == NULL) {
133 dictData = (const UChar *) dictRawData;
134 dictDataLength = fileLength/sizeof(UChar);
135 }
136
137 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
138 // stopping at the first space.
139 if (U_SUCCESS(status)) {
140 UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status);
141 const UChar *candidate = dictData;
142 int32_t length = 0;
143 UChar uc = nextUChar(dictData, dictDataLength);
144 while (U_SUCCESS(status) && uc) {
145 while (uc && !u_isspace(uc)) {
146 length += 1;
147 uc = nextUChar(dictData, dictDataLength);
148 }
149
150 if (length > 0) {
151 to->addWord(candidate, length, status);
152 }
153
154 // Find beginning of next line
155 // 1. Skip non-line-break characters
156 while (uc && !breaks.contains(uc)) {
157 uc = nextUChar(dictData, dictDataLength);
158 }
159 // 2. Skip line break characters
160 while (uc && breaks.contains(uc)) {
161 uc = nextUChar(dictData, dictDataLength);
162 }
163
164 // Prepare for next line
165 candidate = dictData-1;
166 length = 0;
167 }
168 }
169
170 // Unmap the file if we mapped it
171 if ((intptr_t) dictRawData != -1) {
172 (void) munmap((void *)dictRawData, (size_t) fileLength);
173 }
174
175 // Delete any temporary buffer
176 delete [] dictBuffer;
177}
178
// Byte-order tag embedded in the user-dictionary cache file name: empty on
// big-endian builds, ".le" on little-endian builds (presumably so that caches
// written with different byte orders do not share a file name).
#if !U_IS_BIG_ENDIAN
    static const char sArchType[] = ".le";    // little endian
#else
    static const char sArchType[] = "";
#endif
184
729e4ab9 185#endif
51004dcb
A
186#endif
187
/*
In ICU 50, ICULanguageBreakFactory changed from
    virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
to
    virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
and CompactTrieDictionary no longer exists. The implementation below still
needs to be reworked for the new interface.
*/
729e4ab9 196
51004dcb
A
197DictionaryMatcher *
198AppleLanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t breakType) {
199 DictionaryMatcher *icuDictMatcher = ICULanguageBreakFactory::loadDictionaryMatcherFor(script, breakType);
729e4ab9 200#if !TARGET_OS_EMBEDDED
51004dcb
A
201#if 0
202// need to update loadDictionaryMatcherFor implementation below
73c04bcf 203 // We only look for a user dictionary if there is actually an ICU dictionary
51004dcb 204 if (icuDictMatcher != NULL) {
73c04bcf
A
205 UErrorCode status = U_ZERO_ERROR;
206 const char *scriptName = uscript_getName(script);
207 char path[256]; // PATH_MAX is overkill in this case
208 char cachePath[128];
209 char cacheTargetPath[256];
210 glob_t dirGlob;
211 glob_t fileGlob;
212 struct stat cacheStat;
213 struct stat dictStat;
214 bool cacheGood = true;
215 int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE);
216 const CompactTrieDictionary *cacheDict = NULL;
217
218 // Iterate the dictionary directories and accumulate in dirGlob
219 NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask));
4388f060 220 while ((state = NSGetNextSearchPathEnumeration(state, path)) != 0) {
73c04bcf
A
221 // First get the directory itself. We should never overflow, but use strlcat anyway
222 // to avoid a crash if we do.
223 strlcat(path, "/Dictionaries", sizeof(path));
224 if (!glob(path, globFlags, NULL, &dirGlob)) {
225 globFlags |= GLOB_APPEND;
226 }
227 }
228
229 // If there are no Dictionaries directories, ignore any cache file and return the ICU
230 // standard dictionary
231 // TODO: Delete the cache?
232 if (dirGlob.gl_pathc == 0) {
233 globfree(&dirGlob);
51004dcb 234 return icuDictMatcher;
73c04bcf
A
235 }
236
237 // See if there is a cache file already; get its mod time
238 // TODO: should we be using geteuid() here instead of getuid()?
239 state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask);
240 state = NSGetNextSearchPathEnumeration(state, cachePath); // Just use first one
241 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash
242 // if we do.
243 snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid());
244 if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) {
245 cacheGood = false; // No file or bad permissions or type
246 }
247
248 // Stat the dictionary folders, and glob the dictionary files
249 globFlags &= ~GLOB_APPEND;
250 char **pathsp = dirGlob.gl_pathv;
251 const char *dictpath;
4388f060 252 while ((dictpath = *pathsp++) != NULL) {
73c04bcf
A
253 // Stat the directory -- ignore if stat failure
254 if (!stat(dictpath, &dictStat)) {
255 // Glob the dictionaries in the directory
256 snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName);
257 if (!glob(path, globFlags, NULL, &fileGlob)) {
258 globFlags |= GLOB_APPEND;
259 }
260 // If the directory has been modified after the cache file, we need to rebuild;
261 // a dictionary might have been deleted.
262 if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
263 cacheGood = false;
264 }
265 }
266 }
267
268 // No longer need the directory glob
269 globfree(&dirGlob);
270
271 // If there are no dictionaries, ignore the cache file and return the ICU dictionary
272 // TODO: Delete the cache?
273 if (fileGlob.gl_pathc == 0) {
274 globfree(&fileGlob);
51004dcb 275 return icuDictMatcher;
73c04bcf
A
276 }
277
278 // Now compare the last modified stamp for the cache against all the dictionaries
279 pathsp = fileGlob.gl_pathv;
280 while (cacheGood && (dictpath = *pathsp++)) {
281 // Stat the dictionary -- ignore if stat failure
282 if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
283 cacheGood = false;
284 }
285 }
286
287 // Do we need to build the dictionary cache?
288 if (!cacheGood) {
289 // Create a mutable dictionary from the ICU dictionary
51004dcb 290 MutableTrieDictionary *sum = icuDictMatcher->cloneMutable(status);
73c04bcf
A
291 pathsp = fileGlob.gl_pathv;
292 while (U_SUCCESS(status) && (dictpath = *pathsp++)) {
293 // Add the contents of a file to the sum
294 addDictFile(sum, dictpath);
295 }
296
297 // Create a compact (read-only) dictionary
298 CompactTrieDictionary compact(*sum, status);
299 delete sum;
300
301 if (U_SUCCESS(status)) {
302 // Open a temp file to write out the cache
303 strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath));
304 int temp = mkstemp(cachePath);
305 if (temp == -1) {
306 status = U_FILE_ACCESS_ERROR;
307 }
308 size_t dictSize = compact.dataSize();
309 if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != dictSize) {
310 status = U_FILE_ACCESS_ERROR;
311 }
312 // Rename the temp file to the cache. Note that race conditions here are
313 // fine, as the file system operations are atomic. If an outdated version wins
314 // over a newer version, it will get rebuilt at the next app launch due to the
315 // modification time checks above. We don't care that any given app launch gets
316 // the most up-to-date cache (impossible since we can't lock all the Dictionaries
317 // directories), only that the cache (eventually) reflects the current state of
318 // any user dictionaries. That will happen on the next app launch after changes
319 // to the user dictionaries quiesce.
320 if (U_SUCCESS(status)) {
321 if (rename(cachePath, cacheTargetPath)) {
322 status = U_FILE_ACCESS_ERROR;
323 (void) unlink(cachePath); // Clean up the temp file
324 }
325 }
326 if (temp != -1) {
327 close(temp);
328 }
329 }
330 }
331
332 // Done with dictionary paths; release memory allocated by glob()
333 globfree(&fileGlob);
334
335 // Map the cache and build the dictionary
336 if (U_SUCCESS(status)) {
337 int cache = open(cacheTargetPath, O_RDONLY, 0);
338 off_t length;
339 const void *cacheData = (const void *) -1;
340 if (cache == -1) {
341 status = U_FILE_ACCESS_ERROR;
342 }
343 if (U_SUCCESS(status)) {
344 length = lseek(cache, 0, SEEK_END);
345 (void) lseek(cache, 0, SEEK_SET);
346 if (length < 0 || length > PTRDIFF_MAX) {
347 status = U_FILE_ACCESS_ERROR;
348 }
349 }
350
351 // Map the cache. Note: it is left mapped until process exit. This is the normal
352 // behavior anyway, so it shouldn't be an issue.
353 if (U_SUCCESS(status)) {
354 cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0);
355 if ((intptr_t)cacheData == -1) {
356 status = U_FILE_ACCESS_ERROR;
357 }
358 }
359 // We can close the cache file now that it's mapped (or not)
360 if (cache != -1) {
361 (void) close(cache);
362 }
363 // If all was successful, try to create the dictionary. The constructor will
364 // check the magic number for us.
365 if (U_SUCCESS(status)) {
366 cacheDict = new CompactTrieDictionary(cacheData, status);
367 }
368 if (U_FAILURE(status) && (intptr_t)cacheData != -1) {
369 // Clean up the mmap
370 (void) munmap((void *)cacheData, (size_t) length);
371 }
372 }
373
374 // If we were successful, free the ICU dictionary and return ours
375 if (U_SUCCESS(status)) {
51004dcb 376 delete icuDictMatcher;
73c04bcf
A
377 return cacheDict;
378 }
379 else {
380 delete cacheDict;
381 }
382 }
729e4ab9 383#endif
51004dcb
A
384#endif
385 return icuDictMatcher;
73c04bcf
A
386}
387
388U_NAMESPACE_END
389
4388f060 390#endif /* #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED */