2 *******************************************************************************
3 * Copyright (C) 2007, International Business Machines Corporation, Apple Inc.,*
4 * and others. All Rights Reserved. *
5 *******************************************************************************
8 #define __STDC_LIMIT_MACROS 1
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN)
17 #include "unicode/uscript.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uchar.h"
25 #include <NSSystemDirectories.h>
26 #include <sys/types.h>
37 ******************************************************************
// Constructor: forwards the caller's UErrorCode to the ICULanguageBreakFactory base-class
// constructor through the member-initializer list.
40 AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode
&status
)
41 : ICULanguageBreakFactory(status
)
// Destructor.
// NOTE(review): no member cleanup is visible at this point -- confirm the body is empty.
45 AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
48 // Helper function that makes a length-delimited buffer look NUL-terminated
// Both arguments are taken by reference: p is the read position and l the remaining count.
// Callers in this file keep reading until the returned UChar is 0.
// NOTE(review): presumably this advances p, decrements l, and returns 0 once l is
// exhausted -- confirm against the function body.
49 static __attribute__((always_inline
)) inline UChar
nextUChar(const UChar
*&p
, ptrdiff_t &l
) {
59 // Add a file's worth of words to the supplied mutable dictionary
//
// Parameters:
//   to   - dictionary that receives each word through addWord().
//   path - path of the word-list file; it is opened read-only and memory-mapped.
//
// If a Unicode signature is detected the data is converted to UChars in a temporary
// buffer; otherwise the mapped bytes are read directly as native-endian UTF-16.
// Errors are tracked in a local UErrorCode only: the function returns void, so a file
// that cannot be read or converted is not reported to the caller.
60 static void addDictFile(MutableTrieDictionary
*to
, const char *path
) {
61 UErrorCode status
= U_ZERO_ERROR
;
// -1 is the "not mapped" sentinel: it is compared against after mmap() and again before
// munmap() at the end of the function.
63 const char *dictRawData
= (const char *) -1;
// dictData stays NULL unless a conversion happens; the "no conversion needed" branch
// below keys off that.
64 const UChar
*dictData
= NULL
;
65 ptrdiff_t dictDataLength
= 0;
// Owns the converted text, if any; released with delete [] at the end.
66 UChar
*dictBuffer
= NULL
;
67 const char *encoding
= NULL
;
68 int32_t signatureLength
= 0;
70 // Open the dictionary file
71 int dictFile
= open(path
, O_RDONLY
, 0);
// NOTE(review): the guard for this assignment is not visible here; presumably it only
// runs when open() failed -- confirm.
73 status
= U_FILE_ACCESS_ERROR
;
76 // Determine its length
77 if (U_SUCCESS(status
)) {
// Seek to the end to learn the size, then rewind.
78 fileLength
= lseek(dictFile
, 0, SEEK_END
);
79 (void) lseek(dictFile
, 0, SEEK_SET
);
// A negative result means lseek() failed; anything above PTRDIFF_MAX cannot be stored
// in dictDataLength.
80 if (fileLength
< 0 || fileLength
> PTRDIFF_MAX
) {
81 status
= U_FILE_ACCESS_ERROR
;
86 if (U_SUCCESS(status
)) {
// Map the whole file read-only.
87 dictRawData
= (const char *) mmap(0, (size_t) fileLength
, PROT_READ
, MAP_SHARED
, dictFile
, 0);
88 if ((intptr_t)dictRawData
== -1) {
89 status
= U_FILE_ACCESS_ERROR
;
93 // No longer need the file descriptor open
95 (void) close(dictFile
);
98 // Look for a Unicode signature
99 if (U_SUCCESS(status
)) {
// A NULL result is treated below as "no signature found".
// NOTE(review): fileLength was only bounded by PTRDIFF_MAX above, while the ICU converter
// calls take a 32-bit length -- confirm very large files cannot reach this point.
100 encoding
= ucnv_detectUnicodeSignature(dictRawData
, fileLength
, &signatureLength
, &status
);
103 // If necessary, convert the data to UChars
104 if (U_SUCCESS(status
) && encoding
!= NULL
) {
105 UConverter
*conv
= ucnv_open(encoding
, &status
);
106 // Preflight to get buffer size
// Called with a NULL destination and zero capacity; only the returned length is used.
107 uint32_t destCap
= ucnv_toUChars(conv
, NULL
, 0, dictRawData
, fileLength
, &status
);
// The preflight call is expected to report buffer overflow; clear it so the real
// conversion can proceed.
108 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
109 status
= U_ZERO_ERROR
;
111 if (U_SUCCESS(status
)) {
// One extra UChar beyond the preflighted length (presumably room for a terminator).
112 dictBuffer
= new UChar
[destCap
+1];
114 (void) ucnv_toUChars(conv
, dictBuffer
, destCap
+1, dictRawData
, fileLength
, &status
);
115 dictData
= dictBuffer
;
116 dictDataLength
= destCap
;
// The converted text may still begin with U+FEFF.
117 if (U_SUCCESS(status
) && dictData
[0] == 0xFEFF) { // BOM? Skip it
125 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM
126 if (U_SUCCESS(status
) && dictData
== NULL
) {
// Read the mapped bytes in place; the length is converted from bytes to UChar units.
127 dictData
= (const UChar
*) dictRawData
;
128 dictDataLength
= fileLength
/sizeof(UChar
);
131 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
132 // stopping at the first space.
133 if (U_SUCCESS(status
)) {
// Set of characters whose Line_Break property is BK, CR, LF or NL; used to find line ends.
134 UnicodeSet
breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status
);
// candidate marks the start of the word on the current line.
135 const UChar
*candidate
= dictData
;
137 UChar uc
= nextUChar(dictData
, dictDataLength
);
// A zero uc means the data is exhausted.
138 while (U_SUCCESS(status
) && uc
) {
// Consume the word: everything up to the first whitespace character or end of data.
139 while (uc
&& !u_isspace(uc
)) {
141 uc
= nextUChar(dictData
, dictDataLength
);
// NOTE(review): the computation of `length` is not visible here; presumably it is the
// number of UChars consumed by the loop above -- confirm.
145 to
->addWord(candidate
, length
, status
);
148 // Find beginning of next line
149 // 1. Skip non-line-break characters
150 while (uc
&& !breaks
.contains(uc
)) {
151 uc
= nextUChar(dictData
, dictDataLength
);
153 // 2. Skip line break characters
154 while (uc
&& breaks
.contains(uc
)) {
155 uc
= nextUChar(dictData
, dictDataLength
);
158 // Prepare for next line
// uc already holds the first character of the next line, so the word starts one UChar
// before the read position (assuming nextUChar advanced dictData past it).
159 candidate
= dictData
-1;
164 // Unmap the file if we mapped it
165 if ((intptr_t) dictRawData
!= -1) {
166 (void) munmap((void *)dictRawData
, (size_t) fileLength
);
169 // Delete any temporary buffer
// dictBuffer is still NULL when no conversion took place; delete [] on NULL is a no-op.
170 delete [] dictBuffer
;
// Architecture tag that loadDictionaryFor() splices into the user-dictionary cache file
// name (the first "%s" after "ICUUserDictionaryCache" in its snprintf format).
// Two alternative definitions follow: an empty tag, or ".le" for little endian.
// NOTE(review): the preprocessor condition selecting between them is not visible here;
// presumably it is an endianness test -- confirm.
174 static const char sArchType
[] = "";
176 static const char sArchType
[] = ".le"; // little endian
// Loads the dictionary for a script, merging in any user word lists.
//
// Parameters:
//   script    - script whose dictionary is wanted; its name (uscript_getName) selects the
//               "*-<scriptName>.txt" user files and is part of the cache file name.
//   breakType - passed through unchanged to ICULanguageBreakFactory::loadDictionaryFor().
//
// Outline: get the ICU dictionary from the base class; collect the "Dictionaries" folders
// under the user/local/network Library directories; glob the per-script word lists in
// them; decide from modification times whether the per-user cache file is still good;
// if not, rebuild it (ICU dictionary + every word list, compacted, written to a temp file
// and renamed into place); finally map the cache file and wrap it in a
// CompactTrieDictionary.
// NOTE(review): the return statements are not visible here. Per the comments in the body,
// the ICU dictionary is returned when there are no user dictionaries and the cache-backed
// dictionary is returned on success -- confirm the failure path.
179 const CompactTrieDictionary
*
180 AppleLanguageBreakFactory::loadDictionaryFor(UScriptCode script
, int32_t breakType
) {
181 const CompactTrieDictionary
*icuDict
= ICULanguageBreakFactory::loadDictionaryFor(script
, breakType
);
182 // We only look for a user dictionary if there is actually an ICU dictionary
183 if (icuDict
!= NULL
) {
184 UErrorCode status
= U_ZERO_ERROR
;
185 const char *scriptName
= uscript_getName(script
);
// Scratch buffer: first for each Library directory, later for the per-directory glob pattern.
186 char path
[256]; // PATH_MAX is overkill in this case
// Final name of the cache file.
188 char cacheTargetPath
[256];
191 struct stat cacheStat
;
192 struct stat dictStat
;
// Starts out true and is cleared as soon as the cache is found missing or stale.
193 bool cacheGood
= true;
// GLOB_APPEND is OR-ed in after the first successful glob() so later results accumulate
// in the same glob_t.
194 int globFlags
= (GLOB_NOESCAPE
|GLOB_NOSORT
|GLOB_TILDE
);
195 const CompactTrieDictionary
*cacheDict
= NULL
;
197 // Iterate the dictionary directories and accumulate in dirGlob
198 NSSearchPathEnumerationState state
= NSStartSearchPathEnumeration(NSLibraryDirectory
, (NSSearchPathDomainMask
) (NSUserDomainMask
|NSLocalDomainMask
|NSNetworkDomainMask
));
// The assignment inside the condition is intentional: the loop ends when the enumeration
// state comes back as zero.
199 while (state
= NSGetNextSearchPathEnumeration(state
, path
)) {
200 // First get the directory itself. We should never overflow, but use strlcat anyway
201 // to avoid a crash if we do.
202 strlcat(path
, "/Dictionaries", sizeof(path
));
// glob() returns 0 on a match; GLOB_TILDE in globFlags allows a leading "~" in the path.
203 if (!glob(path
, globFlags
, NULL
, &dirGlob
)) {
204 globFlags
|= GLOB_APPEND
;
208 // If there are no Dictionaries directories, ignore any cache file and return the ICU
209 // standard dictionary
210 // TODO: Delete the cache?
211 if (dirGlob
.gl_pathc
== 0) {
216 // See if there is a cache file already; get its mod time
217 // TODO: should we be using geteuid() here instead of getuid()?
218 state
= NSStartSearchPathEnumeration(NSCachesDirectory
, NSLocalDomainMask
);
219 state
= NSGetNextSearchPathEnumeration(state
, cachePath
); // Just use first one
220 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash
// The name is <caches dir>/com.apple.ICUUserDictionaryCache<arch tag>.<script name>.<uid>,
// so each user, script and architecture tag gets its own cache file.
// NOTE(review): getuid() returns a uid_t but is formatted with %d -- confirm that is
// acceptable on the target platforms.
222 snprintf(cacheTargetPath
, sizeof(cacheTargetPath
), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath
, sArchType
, scriptName
, getuid());
// The mode must be exactly "regular file, owner read+write"; any additional mode bit
// makes the cache count as bad.
223 if (stat(cacheTargetPath
, &cacheStat
) || cacheStat
.st_mode
!= (S_IFREG
|S_IRUSR
|S_IWUSR
)) {
224 cacheGood
= false; // No file or bad permissions or type
227 // Stat the dictionary folders, and glob the dictionary files
// fileGlob is a different glob_t from dirGlob, so the first glob() into it must not append.
228 globFlags
&= ~GLOB_APPEND
;
229 char **pathsp
= dirGlob
.gl_pathv
;
230 const char *dictpath
;
// Walk the gl_pathv array until its terminating NULL entry (assignment in the condition
// is intentional).
231 while (dictpath
= *pathsp
++) {
232 // Stat the directory -- ignore if stat failure
233 if (!stat(dictpath
, &dictStat
)) {
234 // Glob the dictionaries in the directory
// User word lists are the files named "<anything>-<scriptName>.txt".
235 snprintf(path
, sizeof(path
), "%s/*-%s.txt", dictpath
, scriptName
);
236 if (!glob(path
, globFlags
, NULL
, &fileGlob
)) {
237 globFlags
|= GLOB_APPEND
;
239 // If the directory has been modified after the cache file, we need to rebuild;
240 // a dictionary might have been deleted.
// The comparison is on seconds first, with nanoseconds as the tie-breaker.
// NOTE(review): the body of this test is not visible here; presumably it clears
// cacheGood -- confirm.
241 if (cacheGood
&& (dictStat
.st_mtimespec
.tv_sec
> cacheStat
.st_mtimespec
.tv_sec
|| (dictStat
.st_mtimespec
.tv_sec
== cacheStat
.st_mtimespec
.tv_sec
&& dictStat
.st_mtimespec
.tv_nsec
> cacheStat
.st_mtimespec
.tv_nsec
))) {
247 // No longer need the directory glob
250 // If there are no dictionaries, ignore the cache file and return the ICU dictionary
251 // TODO: Delete the cache?
252 if (fileGlob
.gl_pathc
== 0) {
257 // Now compare the last modified stamp for the cache against all the dictionaries
258 pathsp
= fileGlob
.gl_pathv
;
// Stops at the first word list found to be newer than the cache (cacheGood is part of
// the loop condition).
259 while (cacheGood
&& (dictpath
= *pathsp
++)) {
260 // Stat the dictionary -- ignore if stat failure
// Same seconds-then-nanoseconds comparison as used for the directories above.
261 if (!stat(dictpath
, &dictStat
) && (dictStat
.st_mtimespec
.tv_sec
> cacheStat
.st_mtimespec
.tv_sec
|| (dictStat
.st_mtimespec
.tv_sec
== cacheStat
.st_mtimespec
.tv_sec
&& dictStat
.st_mtimespec
.tv_nsec
> cacheStat
.st_mtimespec
.tv_nsec
))) {
266 // Do we need to build the dictionary cache?
268 // Create a mutable dictionary from the ICU dictionary
269 MutableTrieDictionary
*sum
= icuDict
->cloneMutable(status
);
270 pathsp
= fileGlob
.gl_pathv
;
271 while (U_SUCCESS(status
) && (dictpath
= *pathsp
++)) {
272 // Add the contents of a file to the sum
// addDictFile() returns void and takes no status argument, so a word list that fails to
// load does not stop the rebuild.
273 addDictFile(sum
, dictpath
);
276 // Create a compact (read-only) dictionary
277 CompactTrieDictionary
compact(*sum
, status
);
280 if (U_SUCCESS(status
)) {
281 // Open a temp file to write out the cache
// The temp file lives in the same caches directory as the final cache file. mkstemp()
// rewrites the X's in place, so cachePath holds the temp file's name from here on.
282 strlcat(cachePath
, "/temp.XXXXXXXXXX", sizeof(cachePath
));
283 int temp
= mkstemp(cachePath
);
// NOTE(review): the guard for this assignment is not visible here; presumably it only
// runs when mkstemp() failed -- confirm.
285 status
= U_FILE_ACCESS_ERROR
;
287 size_t dictSize
= compact
.dataSize();
// A single write() is issued; a short write is treated as a failure.
288 if (U_SUCCESS(status
) && write(temp
, compact
.data(), dictSize
) != dictSize
) {
289 status
= U_FILE_ACCESS_ERROR
;
291 // Rename the temp file to the cache. Note that race conditions here are
292 // fine, as the file system operations are atomic. If an outdated version wins
293 // over a newer version, it will get rebuilt at the next app launch due to the
294 // modification time checks above. We don't care that any given app launch gets
295 // the most up-to-date cache (impossible since we can't lock all the Dictionaries
296 // directories), only that the cache (eventually) reflects the current state of
297 // any user dictionaries. That will happen on the next app launch after changes
298 // to the user dictionaries quiesce.
299 if (U_SUCCESS(status
)) {
// rename() returns non-zero on failure.
300 if (rename(cachePath
, cacheTargetPath
)) {
301 status
= U_FILE_ACCESS_ERROR
;
302 (void) unlink(cachePath
); // Clean up the temp file
311 // Done with dictionary paths; release memory allocated by glob()
314 // Map the cache and build the dictionary
315 if (U_SUCCESS(status
)) {
316 int cache
= open(cacheTargetPath
, O_RDONLY
, 0);
// -1 is the "not mapped" sentinel, checked after mmap() and before munmap() below.
318 const void *cacheData
= (const void *) -1;
// NOTE(review): the guard for this assignment is not visible here; presumably it only
// runs when open() failed -- confirm.
320 status
= U_FILE_ACCESS_ERROR
;
322 if (U_SUCCESS(status
)) {
// Seek to the end to learn the cache size, then rewind.
323 length
= lseek(cache
, 0, SEEK_END
);
324 (void) lseek(cache
, 0, SEEK_SET
);
325 if (length
< 0 || length
> PTRDIFF_MAX
) {
326 status
= U_FILE_ACCESS_ERROR
;
330 // Map the cache. Note: it is left mapped until process exit. This is the normal
331 // behavior anyway, so it shouldn't be an issue.
332 if (U_SUCCESS(status
)) {
333 cacheData
= mmap(0, (size_t) length
, PROT_READ
, MAP_SHARED
, cache
, 0);
334 if ((intptr_t)cacheData
== -1) {
335 status
= U_FILE_ACCESS_ERROR
;
338 // We can close the cache file now that it's mapped (or not)
342 // If all was successful, try to create the dictionary. The constructor will
343 // check the magic number for us.
344 if (U_SUCCESS(status
)) {
345 cacheDict
= new CompactTrieDictionary(cacheData
, status
);
// If anything failed after the mapping was established, release the mapping again.
347 if (U_FAILURE(status
) && (intptr_t)cacheData
!= -1) {
349 (void) munmap((void *)cacheData
, (size_t) length
);
353 // If we were successful, free the ICU dictionary and return ours
354 if (U_SUCCESS(status
)) {
367 #endif /* #if !UCONFIG_NO_BREAK_ITERATION && defined(U_DARWIN) */