2 *******************************************************************************
3 * Copyright (C) 2007, International Business Machines Corporation, Apple Inc.,*
4 * and others. All Rights Reserved. *
5 *******************************************************************************
8 #define __STDC_LIMIT_MACROS 1
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED
17 #include "unicode/uscript.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uchar.h"
25 #include <NSSystemDirectories.h>
26 #include <sys/types.h>
33 // The following is now already included by platform.h (included indirectly by
34 // utypes.h) if U_PLATFORM_IS_DARWIN_BASED but it doesn't hurt to re-include here
35 #include <TargetConditionals.h>
40 ******************************************************************
// Constructor. The caller's UErrorCode is handed unchanged to the
// ICULanguageBreakFactory base-class constructor via the initializer list.
43 AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode
&status
)
44 : ICULanguageBreakFactory(status
)
// Destructor for AppleLanguageBreakFactory.
48 AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
51 #if !TARGET_OS_EMBEDDED
53 // Helper function that makes a length-delimited buffer look NUL-terminated
// Parameters (both taken by reference, so the helper can update the caller's state):
//   p - read cursor into a UChar buffer
//   l - number of UChars still available at p
// Returns a UChar; the callers in this file keep looping until the result is 0.
// NOTE(review): the body is not visible in this view. Presumably it returns *p,
// advancing p and decrementing l, while l > 0, and returns 0 otherwise -- confirm.
54 static __attribute__((always_inline
)) inline UChar
nextUChar(const UChar
*&p
, ptrdiff_t &l
) {
64 // Add a file's worth of words to the supplied mutable dictionary
//
// Parameters:
//   to   - dictionary that receives the words (via to->addWord)
//   path - file system path of the word list to read
//
// The file is opened read-only and memory-mapped. If a Unicode signature is
// detected the contents are converted to UChars in a temporary heap buffer;
// otherwise the mapped bytes are used directly as native-endian UTF-16. The text
// is then scanned one line at a time and one candidate word per line is passed
// to to->addWord().
//
// The function returns void and keeps its UErrorCode in a local variable, so a
// failure at any step is not reported to the caller.
65 static void addDictFile(MutableTrieDictionary
*to
, const char *path
) {
66 UErrorCode status
= U_ZERO_ERROR
;
// (const char *) -1 doubles as the "nothing mapped" marker: it is the value the
// mmap() result is compared against below, and the value tested before munmap().
68 const char *dictRawData
= (const char *) -1;
// dictData stays NULL unless the conversion branch below assigns it; the
// "no conversion needed" branch keys off that NULL.
69 const UChar
*dictData
= NULL
;
// Number of UChars available at dictData.
70 ptrdiff_t dictDataLength
= 0;
// Heap buffer holding converted text; stays NULL when no conversion happens.
71 UChar
*dictBuffer
= NULL
;
72 const char *encoding
= NULL
;
// Receives the signature length from ucnv_detectUnicodeSignature(); it is not
// read again in the code visible here.
73 int32_t signatureLength
= 0;
75 // Open the dictionary file
76 int dictFile
= open(path
, O_RDONLY
, 0);
// NOTE(review): the condition guarding this assignment is not visible in this
// view; presumably it is the open() failure check -- confirm.
78 status
= U_FILE_ACCESS_ERROR
;
81 // Determine its length
82 if (U_SUCCESS(status
)) {
// Seek to the end to learn the size, then rewind to the start.
83 fileLength
= lseek(dictFile
, 0, SEEK_END
);
84 (void) lseek(dictFile
, 0, SEEK_SET
);
// Reject an lseek() failure (negative result) and sizes above PTRDIFF_MAX.
85 if (fileLength
< 0 || fileLength
> PTRDIFF_MAX
) {
86 status
= U_FILE_ACCESS_ERROR
;
// Map the whole file read-only.
91 if (U_SUCCESS(status
)) {
92 dictRawData
= (const char *) mmap(0, (size_t) fileLength
, PROT_READ
, MAP_SHARED
, dictFile
, 0);
93 if ((intptr_t)dictRawData
== -1) {
94 status
= U_FILE_ACCESS_ERROR
;
98 // No longer need the file descriptor open
100 (void) close(dictFile
);
103 // Look for a Unicode signature
104 if (U_SUCCESS(status
)) {
// A non-NULL result names the encoding used to open the converter below.
// NOTE(review): fileLength was only range-checked against PTRDIFF_MAX, while the
// ICU ucnv_* length parameters are 32-bit -- confirm very large files cannot
// reach this point.
105 encoding
= ucnv_detectUnicodeSignature(dictRawData
, fileLength
, &signatureLength
, &status
);
108 // If necessary, convert the data to UChars
109 if (U_SUCCESS(status
) && encoding
!= NULL
) {
110 UConverter
*conv
= ucnv_open(encoding
, &status
);
111 // Preflight to get buffer size
// Called with a NULL destination and zero capacity; the result is used as the
// required UChar count.
112 uint32_t destCap
= ucnv_toUChars(conv
, NULL
, 0, dictRawData
, fileLength
, &status
);
// A buffer-overflow status is the expected outcome of the preflight call, so it
// is cleared rather than treated as a failure.
113 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
114 status
= U_ZERO_ERROR
;
116 if (U_SUCCESS(status
)) {
// One extra UChar beyond the preflighted size -- presumably room for a
// terminating NUL; confirm.
117 dictBuffer
= new UChar
[destCap
+1];
// Real conversion into the freshly allocated buffer.
119 (void) ucnv_toUChars(conv
, dictBuffer
, destCap
+1, dictRawData
, fileLength
, &status
);
120 dictData
= dictBuffer
;
121 dictDataLength
= destCap
;
122 if (U_SUCCESS(status
) && dictData
[0] == 0xFEFF) { // BOM? Skip it
130 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM
131 if (U_SUCCESS(status
) && dictData
== NULL
) {
// Reinterpret the mapped bytes as UChars.
132 dictData
= (const UChar
*) dictRawData
;
// Convert the byte count into a UChar count.
133 dictDataLength
= fileLength
/sizeof(UChar
);
136 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
137 // stopping at the first space.
138 if (U_SUCCESS(status
)) {
// Characters that end a line: line-break classes BK, CR, LF and NL.
139 UnicodeSet
breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status
);
// Start of the word currently being collected.
140 const UChar
*candidate
= dictData
;
142 UChar uc
= nextUChar(dictData
, dictDataLength
);
// Outer loop: one iteration per line, until the data runs out (uc == 0) or an
// error is recorded.
143 while (U_SUCCESS(status
) && uc
) {
// Consume the word: everything up to the first whitespace character or the end
// of the data.
144 while (uc
&& !u_isspace(uc
)) {
146 uc
= nextUChar(dictData
, dictDataLength
);
// NOTE(review): `length` is declared and updated on lines not visible in this
// view; presumably it is the UChar count of the word just scanned -- confirm.
150 to
->addWord(candidate
, length
, status
);
153 // Find beginning of next line
154 // 1. Skip non-line-break characters
155 while (uc
&& !breaks
.contains(uc
)) {
156 uc
= nextUChar(dictData
, dictDataLength
);
158 // 2. Skip line break characters
159 while (uc
&& breaks
.contains(uc
)) {
160 uc
= nextUChar(dictData
, dictDataLength
);
163 // Prepare for next line
// The next word starts at the character currently held in uc. The -1 presumably
// compensates for nextUChar() having already moved dictData past it -- confirm
// against the nextUChar body.
164 candidate
= dictData
-1;
169 // Unmap the file if we mapped it
170 if ((intptr_t) dictRawData
!= -1) {
171 (void) munmap((void *)dictRawData
, (size_t) fileLength
);
174 // Delete any temporary buffer
// dictBuffer is still NULL here when no conversion took place.
175 delete [] dictBuffer
;
// Architecture tag that loadDictionaryFor() inserts into the cache file name
// (the first %s after "com.apple.ICUUserDictionaryCache" in its snprintf format).
// NOTE(review): the preprocessor lines that choose between these two definitions
// are not visible in this view; presumably the empty string is the big-endian
// case -- confirm.
179 static const char sArchType
[] = "";
181 static const char sArchType
[] = ".le"; // little endian
// Loads the break dictionary for a script, merging in user word lists when the
// platform allows it.
//
// Parameters:
//   script    - script whose dictionary is wanted
//   breakType - passed through unchanged to ICULanguageBreakFactory::loadDictionaryFor
//
// The ICU dictionary is always fetched first. When TARGET_OS_EMBEDDED is false and
// an ICU dictionary exists, the code:
//   1. collects the "Dictionaries" folders under the Library directories,
//   2. globs "*-<script name>.txt" word lists inside them,
//   3. checks a per-user cache file in the caches directory against the
//      modification times of those folders and files,
//   4. rebuilds the cache (ICU dictionary plus every word list) when needed,
//   5. maps the cache file and builds a CompactTrieDictionary on top of it.
//
// NOTE(review): the return statements are not visible in this view. Going by the
// existing comments, the ICU dictionary is returned when there are no user
// dictionaries, and the cache-backed dictionary on success -- confirm.
186 const CompactTrieDictionary
*
187 AppleLanguageBreakFactory::loadDictionaryFor(UScriptCode script
, int32_t breakType
) {
188 const CompactTrieDictionary
*icuDict
= ICULanguageBreakFactory::loadDictionaryFor(script
, breakType
);
189 #if !TARGET_OS_EMBEDDED
190 // We only look for a user dictionary if there is actually an ICU dictionary
191 if (icuDict
!= NULL
) {
// Local status: failures in the user-dictionary path are tracked here only.
192 UErrorCode status
= U_ZERO_ERROR
;
// Script name used both in the word-list glob pattern and in the cache file name.
193 const char *scriptName
= uscript_getName(script
);
// NOTE(review): path is handed to NSGetNextSearchPathEnumeration() below. That
// API is documented as expecting a PATH_MAX-sized buffer -- confirm 256 bytes
// cannot be overrun.
194 char path
[256]; // PATH_MAX is overkill in this case
// Final name of the cache file.
196 char cacheTargetPath
[256];
199 struct stat cacheStat
;
200 struct stat dictStat
;
// Assumed valid until one of the checks below clears it.
201 bool cacheGood
= true;
// GLOB_APPEND is OR-ed in after the first successful glob() so that later calls
// add to the same result list.
202 int globFlags
= (GLOB_NOESCAPE
|GLOB_NOSORT
|GLOB_TILDE
);
203 const CompactTrieDictionary
*cacheDict
= NULL
;
205 // Iterate the dictionary directories and accumulate in dirGlob
// Library directories of the user, local and network domains.
206 NSSearchPathEnumerationState state
= NSStartSearchPathEnumeration(NSLibraryDirectory
, (NSSearchPathDomainMask
) (NSUserDomainMask
|NSLocalDomainMask
|NSNetworkDomainMask
));
// A zero state ends the enumeration.
207 while ((state
= NSGetNextSearchPathEnumeration(state
, path
)) != 0) {
208 // First get the directory itself. We should never overflow, but use strlcat anyway
209 // to avoid a crash if we do.
210 strlcat(path
, "/Dictionaries", sizeof(path
));
// glob() returns 0 on success; only then do later calls need GLOB_APPEND.
211 if (!glob(path
, globFlags
, NULL
, &dirGlob
)) {
212 globFlags
|= GLOB_APPEND
;
216 // If there are no Dictionaries directories, ignore any cache file and return the ICU
217 // standard dictionary
218 // TODO: Delete the cache?
219 if (dirGlob
.gl_pathc
== 0) {
224 // See if there is a cache file already; get its mod time
225 // TODO: should we be using geteuid() here instead of getuid()?
226 state
= NSStartSearchPathEnumeration(NSCachesDirectory
, NSLocalDomainMask
);
// NOTE(review): the resulting state is not checked in the visible code, so
// cachePath is used even if the enumeration produced nothing -- confirm.
227 state
= NSGetNextSearchPathEnumeration(state
, cachePath
); // Just use first one
228 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash
// if we do.
// Name layout: <caches dir>/com.apple.ICUUserDictionaryCache<arch tag>.<script name>.<real uid>
230 snprintf(cacheTargetPath
, sizeof(cacheTargetPath
), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath
, sArchType
, scriptName
, getuid());
// The cache is only trusted when stat() succeeds and st_mode is exactly
// "regular file, owner read+write" -- any other type or permission bits reject it.
231 if (stat(cacheTargetPath
, &cacheStat
) || cacheStat
.st_mode
!= (S_IFREG
|S_IRUSR
|S_IWUSR
)) {
232 cacheGood
= false; // No file or bad permissions or type
235 // Stat the dictionary folders, and glob the dictionary files
// Drop GLOB_APPEND again: the first word-list glob goes into a different glob_t
// (fileGlob) and must start a fresh list.
236 globFlags
&= ~GLOB_APPEND
;
237 char **pathsp
= dirGlob
.gl_pathv
;
238 const char *dictpath
;
// Walk the NULL-terminated list of Dictionaries folders.
239 while ((dictpath
= *pathsp
++) != NULL
) {
240 // Stat the directory -- ignore if stat failure
241 if (!stat(dictpath
, &dictStat
)) {
242 // Glob the dictionaries in the directory
// Pattern: <folder>/*-<script name>.txt
243 snprintf(path
, sizeof(path
), "%s/*-%s.txt", dictpath
, scriptName
);
244 if (!glob(path
, globFlags
, NULL
, &fileGlob
)) {
245 globFlags
|= GLOB_APPEND
;
247 // If the directory has been modified after the cache file, we need to rebuild;
248 // a dictionary might have been deleted.
// "Newer" means a larger seconds value or, when the seconds are equal, a larger
// nanoseconds value.
249 if (cacheGood
&& (dictStat
.st_mtimespec
.tv_sec
> cacheStat
.st_mtimespec
.tv_sec
|| (dictStat
.st_mtimespec
.tv_sec
== cacheStat
.st_mtimespec
.tv_sec
&& dictStat
.st_mtimespec
.tv_nsec
> cacheStat
.st_mtimespec
.tv_nsec
))) {
255 // No longer need the directory glob
258 // If there are no dictionaries, ignore the cache file and return the ICU dictionary
259 // TODO: Delete the cache?
260 if (fileGlob
.gl_pathc
== 0) {
265 // Now compare the last modified stamp for the cache against all the dictionaries
266 pathsp
= fileGlob
.gl_pathv
;
// Stops at the end of the list, or as soon as cacheGood has been cleared.
267 while (cacheGood
&& (dictpath
= *pathsp
++)) {
268 // Stat the dictionary -- ignore if stat failure
// Same seconds-then-nanoseconds comparison as for the folders above.
269 if (!stat(dictpath
, &dictStat
) && (dictStat
.st_mtimespec
.tv_sec
> cacheStat
.st_mtimespec
.tv_sec
|| (dictStat
.st_mtimespec
.tv_sec
== cacheStat
.st_mtimespec
.tv_sec
&& dictStat
.st_mtimespec
.tv_nsec
> cacheStat
.st_mtimespec
.tv_nsec
))) {
274 // Do we need to build the dictionary cache?
276 // Create a mutable dictionary from the ICU dictionary
277 MutableTrieDictionary
*sum
= icuDict
->cloneMutable(status
);
278 pathsp
= fileGlob
.gl_pathv
;
279 while (U_SUCCESS(status
) && (dictpath
= *pathsp
++)) {
280 // Add the contents of a file to the sum
// addDictFile() returns void and takes no status argument, so a word list that
// fails to load does not stop this loop.
281 addDictFile(sum
, dictpath
);
284 // Create a compact (read-only) dictionary
285 CompactTrieDictionary
compact(*sum
, status
);
288 if (U_SUCCESS(status
)) {
289 // Open a temp file to write out the cache
// cachePath (the caches directory) is extended in place into the mkstemp()
// template, so from here on it names the temp file.
290 strlcat(cachePath
, "/temp.XXXXXXXXXX", sizeof(cachePath
));
291 int temp
= mkstemp(cachePath
);
// NOTE(review): the condition guarding this assignment is not visible in this
// view; presumably it is the mkstemp() failure check -- confirm.
293 status
= U_FILE_ACCESS_ERROR
;
295 size_t dictSize
= compact
.dataSize();
// The whole image must be written by this single write() call; anything else is
// treated as a failure.
// NOTE(review): write() returns a signed type while dictSize is size_t -- confirm
// the mixed comparison behaves as intended when write() returns -1.
296 if (U_SUCCESS(status
) && write(temp
, compact
.data(), dictSize
) != dictSize
) {
297 status
= U_FILE_ACCESS_ERROR
;
299 // Rename the temp file to the cache. Note that race conditions here are
300 // fine, as the file system operations are atomic. If an outdated version wins
301 // over a newer version, it will get rebuilt at the next app launch due to the
302 // modification time checks above. We don't care that any given app launch gets
303 // the most up-to-date cache (impossible since we can't lock all the Dictionaries
304 // directories), only that the cache (eventually) reflects the current state of
305 // any user dictionaries. That will happen on the next app launch after changes
306 // to the user dictionaries quiesce.
307 if (U_SUCCESS(status
)) {
// rename() returns non-zero on failure.
308 if (rename(cachePath
, cacheTargetPath
)) {
309 status
= U_FILE_ACCESS_ERROR
;
310 (void) unlink(cachePath
); // Clean up the temp file
319 // Done with dictionary paths; release memory allocated by glob()
322 // Map the cache and build the dictionary
323 if (U_SUCCESS(status
)) {
324 int cache
= open(cacheTargetPath
, O_RDONLY
, 0);
// (const void *) -1 marks "not mapped"; it is the value the mmap() result is
// compared against and the value tested before munmap() below.
326 const void *cacheData
= (const void *) -1;
// NOTE(review): the condition guarding this assignment is not visible in this
// view; presumably it is the open() failure check -- confirm.
328 status
= U_FILE_ACCESS_ERROR
;
330 if (U_SUCCESS(status
)) {
// Seek to the end to learn the size, then rewind to the start.
331 length
= lseek(cache
, 0, SEEK_END
);
332 (void) lseek(cache
, 0, SEEK_SET
);
// Reject an lseek() failure (negative result) and sizes above PTRDIFF_MAX.
333 if (length
< 0 || length
> PTRDIFF_MAX
) {
334 status
= U_FILE_ACCESS_ERROR
;
338 // Map the cache. Note: it is left mapped until process exit. This is the normal
339 // behavior anyway, so it shouldn't be an issue.
340 if (U_SUCCESS(status
)) {
341 cacheData
= mmap(0, (size_t) length
, PROT_READ
, MAP_SHARED
, cache
, 0);
342 if ((intptr_t)cacheData
== -1) {
343 status
= U_FILE_ACCESS_ERROR
;
346 // We can close the cache file now that it's mapped (or not)
350 // If all was successful, try to create the dictionary. The constructor will
351 // check the magic number for us.
352 if (U_SUCCESS(status
)) {
// The new dictionary is built directly on the mapped cache data.
353 cacheDict
= new CompactTrieDictionary(cacheData
, status
);
// On any failure, release the mapping if one was made.
355 if (U_FAILURE(status
) && (intptr_t)cacheData
!= -1) {
357 (void) munmap((void *)cacheData
, (size_t) length
);
361 // If we were successful, free the ICU dictionary and return ours
362 if (U_SUCCESS(status
)) {
376 #endif /* #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED */