2 *******************************************************************************
3 * Copyright (C) 2007, International Business Machines Corporation, Apple Inc.,*
4 * and others. All Rights Reserved. *
5 *******************************************************************************
8 #define __STDC_LIMIT_MACROS 1
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN)
17 #include "unicode/uscript.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uchar.h"
25 #include <NSSystemDirectories.h>
26 #include <sys/types.h>
33 #include <TargetConditionals.h>
38 ******************************************************************
// Constructs the factory by forwarding `status` to the ICULanguageBreakFactory
// base-class constructor.
// NOTE(review): the constructor body is not present in this extract.
41 AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode
&status
)
42 : ICULanguageBreakFactory(status
)
// Destructor. NOTE(review): its body is not present in this extract.
46 AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
49 #if !TARGET_OS_EMBEDDED
51 // Helper function that makes a length-delimited buffer look NUL-terminated
// Both arguments are taken by reference: `p` is the read position and `l` the
// remaining length. The loops in addDictFile() treat a zero return value as
// "no more data".
// NOTE(review): the function body is not present in this extract; presumably it
// returns *p, advances p and decrements l while l > 0, and returns 0 otherwise
// -- confirm against the full source.
52 static __attribute__((always_inline
)) inline UChar
nextUChar(const UChar
*&p
, ptrdiff_t &l
) {
62 // Add a file's worth of words to the supplied mutable dictionary
//
// Parameters:
//   to   - dictionary that receives the words through addWord().
//   path - file-system path of the word list to read.
//
// Visible steps: open the file read-only and mmap it, look for a Unicode
// signature, convert the bytes to UChars when a signature names an encoding
// (otherwise treat the mapped bytes as native-endian UTF-16), then walk the
// text one line at a time and add the leading run of non-space characters of
// each line. The function returns void and its UErrorCode is a local, so
// failures are not reported to the caller.
// NOTE(review): this extract is missing some original lines (for example the
// declarations of `fileLength` and `length`, several guards and closing
// braces); the comments below describe the visible code only.
63 static void addDictFile(MutableTrieDictionary
*to
, const char *path
) {
64 UErrorCode status
= U_ZERO_ERROR
;
// (const char *) -1 is the value the mmap() result is compared against below,
// so it doubles as the "nothing mapped" marker checked before munmap().
66 const char *dictRawData
= (const char *) -1;
67 const UChar
*dictData
= NULL
;
68 ptrdiff_t dictDataLength
= 0;
// Owns the converted text when a conversion is performed; released with
// delete [] at the end.
69 UChar
*dictBuffer
= NULL
;
70 const char *encoding
= NULL
;
71 int32_t signatureLength
= 0;
73 // Open the dictionary file
74 int dictFile
= open(path
, O_RDONLY
, 0);
// NOTE(review): presumably guarded by a failed-open check on a line that is
// not present in this extract.
76 status
= U_FILE_ACCESS_ERROR
;
79 // Determine its length
80 if (U_SUCCESS(status
)) {
81 fileLength
= lseek(dictFile
, 0, SEEK_END
);
// Seek back to the start of the file after measuring it.
82 (void) lseek(dictFile
, 0, SEEK_SET
);
// Reject a failed seek (negative result) or a size that does not fit in
// ptrdiff_t.
83 if (fileLength
< 0 || fileLength
> PTRDIFF_MAX
) {
84 status
= U_FILE_ACCESS_ERROR
;
89 if (U_SUCCESS(status
)) {
// Map the whole file read-only, starting at offset 0.
90 dictRawData
= (const char *) mmap(0, (size_t) fileLength
, PROT_READ
, MAP_SHARED
, dictFile
, 0);
91 if ((intptr_t)dictRawData
== -1) {
92 status
= U_FILE_ACCESS_ERROR
;
96 // No longer need the file descriptor open
98 (void) close(dictFile
);
101 // Look for a Unicode signature
102 if (U_SUCCESS(status
)) {
// `encoding` remaining NULL selects the no-conversion path further down.
103 encoding
= ucnv_detectUnicodeSignature(dictRawData
, fileLength
, &signatureLength
, &status
);
106 // If necessary, convert the data to UChars
107 if (U_SUCCESS(status
) && encoding
!= NULL
) {
108 UConverter
*conv
= ucnv_open(encoding
, &status
);
109 // Preflight to get buffer size
// NOTE(review): ucnv_toUChars() is declared to return int32_t; the result is
// stored in a uint32_t here -- confirm this is intended.
110 uint32_t destCap
= ucnv_toUChars(conv
, NULL
, 0, dictRawData
, fileLength
, &status
);
// The preflight call passes a zero-capacity destination, so an overflow
// status is the expected outcome; reset it so the real conversion can run.
111 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
112 status
= U_ZERO_ERROR
;
114 if (U_SUCCESS(status
)) {
// One UChar more than the preflighted length (presumably room for a
// terminating NUL -- confirm).
115 dictBuffer
= new UChar
[destCap
+1];
117 (void) ucnv_toUChars(conv
, dictBuffer
, destCap
+1, dictRawData
, fileLength
, &status
);
118 dictData
= dictBuffer
;
119 dictDataLength
= destCap
;
120 if (U_SUCCESS(status
) && dictData
[0] == 0xFEFF) { // BOM? Skip it
128 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM
129 if (U_SUCCESS(status
) && dictData
== NULL
) {
130 dictData
= (const UChar
*) dictRawData
;
// Length is counted in UChars, not bytes.
131 dictDataLength
= fileLength
/sizeof(UChar
);
134 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
135 // stopping at the first space.
136 if (U_SUCCESS(status
)) {
// Characters treated as line terminators: line-break classes BK, CR, LF, NL.
137 UnicodeSet
breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status
);
// `candidate` marks the start of the word currently being scanned.
138 const UChar
*candidate
= dictData
;
140 UChar uc
= nextUChar(dictData
, dictDataLength
);
// Outer loop: one iteration per line; uc == 0 means the data is exhausted.
141 while (U_SUCCESS(status
) && uc
) {
// Scan the word: advance until whitespace or end of data.
142 while (uc
&& !u_isspace(uc
)) {
144 uc
= nextUChar(dictData
, dictDataLength
);
// NOTE(review): `length` is declared and updated on lines that are not present
// in this extract.
148 to
->addWord(candidate
, length
, status
);
151 // Find beginning of next line
152 // 1. Skip non-line-break characters
153 while (uc
&& !breaks
.contains(uc
)) {
154 uc
= nextUChar(dictData
, dictDataLength
);
156 // 2. Skip line break characters
157 while (uc
&& breaks
.contains(uc
)) {
158 uc
= nextUChar(dictData
, dictDataLength
);
161 // Prepare for next line
// NOTE(review): relies on nextUChar() having advanced dictData one position
// past `uc`, so dictData-1 is the address of `uc` itself -- confirm against
// nextUChar(), whose body is not visible here.
162 candidate
= dictData
-1;
167 // Unmap the file if we mapped it
168 if ((intptr_t) dictRawData
!= -1) {
169 (void) munmap((void *)dictRawData
, (size_t) fileLength
);
172 // Delete any temporary buffer
// delete [] on a NULL pointer is a no-op, so no guard is needed when no
// conversion took place.
173 delete [] dictBuffer
;
// Architecture suffix inserted into the cache file name (it fills the first %s
// after "com.apple.ICUUserDictionaryCache" in loadDictionaryFor()).
// NOTE(review): these are two alternative definitions; the preprocessor
// conditionals that select between them are not present in this extract. The
// empty suffix presumably applies to big-endian builds -- confirm.
177 static const char sArchType
[] = "";
179 static const char sArchType
[] = ".le"; // little endian
// Loads the dictionary used for `script` / `breakType`.
//
// The ICU dictionary is obtained first from
// ICULanguageBreakFactory::loadDictionaryFor(). When it is non-NULL (and
// TARGET_OS_EMBEDDED is not set) the visible code:
//   1. globs the "Dictionaries" subdirectory of each Library directory in the
//      user, local and network domains;
//   2. globs word-list files matching "*-<scriptName>.txt" in those directories;
//   3. checks a cache file named
//      "com.apple.ICUUserDictionaryCache<arch>.<scriptName>.<uid>" in the first
//      local-domain caches directory against the modification times of the
//      directories and word lists;
//   4. builds a new cache from the ICU dictionary plus every word list, writes
//      it to a temp file and renames it over the cache path;
//   5. maps the cache file and constructs a CompactTrieDictionary from it.
// NOTE(review): many original lines (declarations of cachePath, dirGlob,
// fileGlob and length, guard conditions, closing braces, return statements) are
// not present in this extract; the comments below describe visible code only.
184 const CompactTrieDictionary
*
185 AppleLanguageBreakFactory::loadDictionaryFor(UScriptCode script
, int32_t breakType
) {
186 const CompactTrieDictionary
*icuDict
= ICULanguageBreakFactory::loadDictionaryFor(script
, breakType
);
187 #if !TARGET_OS_EMBEDDED
188 // We only look for a user dictionary if there is actually an ICU dictionary
189 if (icuDict
!= NULL
) {
190 UErrorCode status
= U_ZERO_ERROR
;
191 const char *scriptName
= uscript_getName(script
);
// NOTE(review): `path` is later handed to NSGetNextSearchPathEnumeration(),
// whose header documents a PATH_MAX-sized buffer; 256 bytes may be too small
// -- confirm.
192 char path
[256]; // PATH_MAX is overkill in this case
194 char cacheTargetPath
[256];
197 struct stat cacheStat
;
198 struct stat dictStat
;
// Assumed valid until one of the checks below clears it.
199 bool cacheGood
= true;
200 int globFlags
= (GLOB_NOESCAPE
|GLOB_NOSORT
|GLOB_TILDE
);
201 const CompactTrieDictionary
*cacheDict
= NULL
;
203 // Iterate the dictionary directories and accumulate in dirGlob
204 NSSearchPathEnumerationState state
= NSStartSearchPathEnumeration(NSLibraryDirectory
, (NSSearchPathDomainMask
) (NSUserDomainMask
|NSLocalDomainMask
|NSNetworkDomainMask
));
// The assignment inside the condition is deliberate: the loop ends when the
// enumeration hands back a zero state.
205 while (state
= NSGetNextSearchPathEnumeration(state
, path
)) {
206 // First get the directory itself. We should never overflow, but use strlcat anyway
207 // to avoid a crash if we do.
208 strlcat(path
, "/Dictionaries", sizeof(path
));
// glob() returns 0 on success; after the first success, later results are
// appended to the same dirGlob.
209 if (!glob(path
, globFlags
, NULL
, &dirGlob
)) {
210 globFlags
|= GLOB_APPEND
;
214 // If there are no Dictionaries directories, ignore any cache file and return the ICU
215 // standard dictionary
216 // TODO: Delete the cache?
217 if (dirGlob
.gl_pathc
== 0) {
222 // See if there is a cache file already; get its mod time
223 // TODO: should we be using geteuid() here instead of getuid()?
224 state
= NSStartSearchPathEnumeration(NSCachesDirectory
, NSLocalDomainMask
);
225 state
= NSGetNextSearchPathEnumeration(state
, cachePath
); // Just use first one
226 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash
// if the name ever turns out longer than the buffer.
228 snprintf(cacheTargetPath
, sizeof(cacheTargetPath
), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath
, sArchType
, scriptName
, getuid());
// The cache is accepted only when stat() succeeds and st_mode is exactly
// "regular file, owner read + write" with no other mode bits set.
229 if (stat(cacheTargetPath
, &cacheStat
) || cacheStat
.st_mode
!= (S_IFREG
|S_IRUSR
|S_IWUSR
)) {
230 cacheGood
= false; // No file or bad permissions or type
233 // Stat the dictionary folders, and glob the dictionary files
// Clear GLOB_APPEND so the first glob into fileGlob starts a fresh list.
234 globFlags
&= ~GLOB_APPEND
;
235 char **pathsp
= dirGlob
.gl_pathv
;
236 const char *dictpath
;
// Walk the path list until its terminating NULL entry.
237 while (dictpath
= *pathsp
++) {
238 // Stat the directory -- ignore if stat failure
239 if (!stat(dictpath
, &dictStat
)) {
240 // Glob the dictionaries in the directory
241 snprintf(path
, sizeof(path
), "%s/*-%s.txt", dictpath
, scriptName
);
242 if (!glob(path
, globFlags
, NULL
, &fileGlob
)) {
243 globFlags
|= GLOB_APPEND
;
245 // If the directory has been modified after the cache file, we need to rebuild;
246 // a dictionary might have been deleted.
// Modification times are compared on seconds first, then on nanoseconds when
// the seconds are equal.
247 if (cacheGood
&& (dictStat
.st_mtimespec
.tv_sec
> cacheStat
.st_mtimespec
.tv_sec
|| (dictStat
.st_mtimespec
.tv_sec
== cacheStat
.st_mtimespec
.tv_sec
&& dictStat
.st_mtimespec
.tv_nsec
> cacheStat
.st_mtimespec
.tv_nsec
))) {
253 // No longer need the directory glob
256 // If there are no dictionaries, ignore the cache file and return the ICU dictionary
257 // TODO: Delete the cache?
258 if (fileGlob
.gl_pathc
== 0) {
263 // Now compare the last modified stamp for the cache against all the dictionaries
264 pathsp
= fileGlob
.gl_pathv
;
// Stops early as soon as the cache has been found stale.
265 while (cacheGood
&& (dictpath
= *pathsp
++)) {
266 // Stat the dictionary -- ignore if stat failure
267 if (!stat(dictpath
, &dictStat
) && (dictStat
.st_mtimespec
.tv_sec
> cacheStat
.st_mtimespec
.tv_sec
|| (dictStat
.st_mtimespec
.tv_sec
== cacheStat
.st_mtimespec
.tv_sec
&& dictStat
.st_mtimespec
.tv_nsec
> cacheStat
.st_mtimespec
.tv_nsec
))) {
272 // Do we need to build the dictionary cache?
274 // Create a mutable dictionary from the ICU dictionary
275 MutableTrieDictionary
*sum
= icuDict
->cloneMutable(status
);
276 pathsp
= fileGlob
.gl_pathv
;
277 while (U_SUCCESS(status
) && (dictpath
= *pathsp
++)) {
278 // Add the contents of a file to the sum
// NOTE(review): addDictFile() returns void and keeps its own UErrorCode, so a
// failure while reading one file neither changes `status` here nor stops the
// loop.
279 addDictFile(sum
, dictpath
);
282 // Create a compact (read-only) dictionary
283 CompactTrieDictionary
compact(*sum
, status
);
286 if (U_SUCCESS(status
)) {
287 // Open a temp file to write out the cache
// cachePath becomes the mkstemp() template, which mkstemp() rewrites in place
// with the generated file name.
288 strlcat(cachePath
, "/temp.XXXXXXXXXX", sizeof(cachePath
));
289 int temp
= mkstemp(cachePath
);
// NOTE(review): presumably guarded by a check that mkstemp() failed, on a line
// not present in this extract.
291 status
= U_FILE_ACCESS_ERROR
;
293 size_t dictSize
= compact
.dataSize();
// A short write is treated as a failure.
// NOTE(review): write() returns ssize_t and dictSize is size_t, so this is a
// signed/unsigned comparison; a -1 result still compares unequal after
// conversion -- confirm this is the intended check.
294 if (U_SUCCESS(status
) && write(temp
, compact
.data(), dictSize
) != dictSize
) {
295 status
= U_FILE_ACCESS_ERROR
;
297 // Rename the temp file to the cache. Note that race conditions here are
298 // fine, as the file system operations are atomic. If an outdated version wins
299 // over a newer version, it will get rebuilt at the next app launch due to the
300 // modification time checks above. We don't care that any given app launch gets
301 // the most up-to-date cache (impossible since we can't lock all the Dictionaries
302 // directories), only that the cache (eventually) reflects the current state of
303 // any user dictionaries. That will happen on the next app launch after changes
304 // to the user dictionaries quiesce.
305 if (U_SUCCESS(status
)) {
// rename() returns non-zero on failure.
306 if (rename(cachePath
, cacheTargetPath
)) {
307 status
= U_FILE_ACCESS_ERROR
;
308 (void) unlink(cachePath
); // Clean up the temp file
317 // Done with dictionary paths; release memory allocated by glob()
320 // Map the cache and build the dictionary
321 if (U_SUCCESS(status
)) {
322 int cache
= open(cacheTargetPath
, O_RDONLY
, 0);
// (const void *) -1 is the value the mmap() result is tested against below; it
// marks "not mapped" for the failure cleanup.
324 const void *cacheData
= (const void *) -1;
// NOTE(review): presumably guarded by a failed-open check on a line not
// present in this extract.
326 status
= U_FILE_ACCESS_ERROR
;
328 if (U_SUCCESS(status
)) {
// Measure the file by seeking to its end, then seek back to the start.
329 length
= lseek(cache
, 0, SEEK_END
);
330 (void) lseek(cache
, 0, SEEK_SET
);
331 if (length
< 0 || length
> PTRDIFF_MAX
) {
332 status
= U_FILE_ACCESS_ERROR
;
336 // Map the cache. Note: it is left mapped until process exit. This is the normal
337 // behavior anyway, so it shouldn't be an issue.
338 if (U_SUCCESS(status
)) {
339 cacheData
= mmap(0, (size_t) length
, PROT_READ
, MAP_SHARED
, cache
, 0);
340 if ((intptr_t)cacheData
== -1) {
341 status
= U_FILE_ACCESS_ERROR
;
344 // We can close the cache file now that it's mapped (or not)
348 // If all was successful, try to create the dictionary. The constructor will
349 // check the magic number for us.
350 if (U_SUCCESS(status
)) {
// The dictionary is constructed from the pointer to the mapped cache data.
351 cacheDict
= new CompactTrieDictionary(cacheData
, status
);
// On failure, release the mapping, but only if mmap() actually succeeded.
353 if (U_FAILURE(status
) && (intptr_t)cacheData
!= -1) {
355 (void) munmap((void *)cacheData
, (size_t) length
);
359 // If we were successful, free the ICU dictionary and return ours
360 if (U_SUCCESS(status
)) {
374 #endif /* #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN) */