2 *******************************************************************************
3 * Copyright (C) 2007,2012 International Business Machines Corporation, Apple Inc.,*
4 * and others. All Rights Reserved. *
5 *******************************************************************************
8 #define __STDC_LIMIT_MACROS 1
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED
16 #include "unicode/uscript.h"
17 #include "unicode/uniset.h"
18 #include "unicode/ucnv.h"
19 #include "unicode/uchar.h"
24 #include <NSSystemDirectories.h>
25 #include <sys/types.h>
32 // The following is now already included by platform.h (included indirectly by
33 // utypes.h) if U_PLATFORM_IS_DARWIN_BASED but it doesn't hurt to re-include here
34 #include <TargetConditionals.h>
39 ******************************************************************
// Constructor. The caller's `status` is passed straight through to the
// ICULanguageBreakFactory base-class constructor.
42 AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode
&status
)
43 : ICULanguageBreakFactory(status
)
// Destructor.
47 AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
50 #if !TARGET_OS_EMBEDDED
52 // need to update loadDictionaryMatcherFor implementation below
54 // Helper function that makes a length-delimited buffer look NUL-terminated
//
// p: read cursor into the buffer, taken by reference.
// l: length associated with the buffer, taken by reference.
// Returns a UChar; the scanning loops in addDictFile stop when it is 0.
//
// NOTE(review): how p and l are updated on each call, and what happens once
// l is used up, should be confirmed against the body of this helper.
55 static __attribute__((always_inline
)) inline UChar
nextUChar(const UChar
*&p
, ptrdiff_t &l
) {
65 // Add a file's worth of words to the supplied mutable dictionary
//
// The file at `path` is opened read-only and memory-mapped. If
// ucnv_detectUnicodeSignature() names an encoding, the bytes are converted to
// UChars in a temporary heap buffer; otherwise the mapping itself is treated
// as native-endian UTF-16. The text is then scanned line by line and each
// candidate word is handed to to->addWord().
//
// to:   dictionary that receives the words.
// path: path of the word-list file.
//
// The function returns void and `status` is a local variable, so a failure at
// any stage is not reported back to the caller.
//
// NOTE(review): fileLength is range-checked against PTRDIFF_MAX but is then
// passed as the source length to ucnv_detectUnicodeSignature() and
// ucnv_toUChars(), whose length parameters are int32_t in the ICU API --
// confirm that a file larger than INT32_MAX cannot reach those calls.
66 static void addDictFile(MutableTrieDictionary
*to
, const char *path
) {
67 UErrorCode status
= U_ZERO_ERROR
;
// (const char *) -1 is the "not mapped" marker: it is the value the mmap()
// failure test and the munmap() guard further down compare against.
69 const char *dictRawData
= (const char *) -1;
70 const UChar
*dictData
= NULL
;
71 ptrdiff_t dictDataLength
= 0;
72 UChar
*dictBuffer
= NULL
;
73 const char *encoding
= NULL
;
74 int32_t signatureLength
= 0;
76 // Open the dictionary file
77 int dictFile
= open(path
, O_RDONLY
, 0);
79 status
= U_FILE_ACCESS_ERROR
;
82 // Determine its length
83 if (U_SUCCESS(status
)) {
// Seeking to the end yields the file size; the offset is then rewound to 0.
84 fileLength
= lseek(dictFile
, 0, SEEK_END
);
85 (void) lseek(dictFile
, 0, SEEK_SET
);
86 if (fileLength
< 0 || fileLength
> PTRDIFF_MAX
) {
87 status
= U_FILE_ACCESS_ERROR
;
92 if (U_SUCCESS(status
)) {
// Map the whole file read-only.
93 dictRawData
= (const char *) mmap(0, (size_t) fileLength
, PROT_READ
, MAP_SHARED
, dictFile
, 0);
94 if ((intptr_t)dictRawData
== -1) {
95 status
= U_FILE_ACCESS_ERROR
;
99 // No longer need the file descriptor open
100 if (dictFile
!= -1) {
101 (void) close(dictFile
);
104 // Look for a Unicode signature
105 if (U_SUCCESS(status
)) {
// A NULL result skips the conversion step below and falls through to the
// native-endian UTF-16 path.
106 encoding
= ucnv_detectUnicodeSignature(dictRawData
, fileLength
, &signatureLength
, &status
);
109 // If necessary, convert the data to UChars
110 if (U_SUCCESS(status
) && encoding
!= NULL
) {
// NOTE(review): no matching ucnv_close(conv) appears in the lines visible
// here -- confirm the converter is released.
111 UConverter
*conv
= ucnv_open(encoding
, &status
);
112 // Preflight to get buffer size
// NOTE(review): ucnv_toUChars() returns int32_t; the count is held in a
// uint32_t here and later passed back as destCap+1.
113 uint32_t destCap
= ucnv_toUChars(conv
, NULL
, 0, dictRawData
, fileLength
, &status
);
// The preflight call was given zero capacity, so a buffer-overflow status is
// the expected outcome; clear it so the real conversion can run.
114 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
115 status
= U_ZERO_ERROR
;
117 if (U_SUCCESS(status
)) {
// Room for destCap UChars plus one more (presumably for a terminating NUL --
// TODO confirm).
118 dictBuffer
= new UChar
[destCap
+1];
120 (void) ucnv_toUChars(conv
, dictBuffer
, destCap
+1, dictRawData
, fileLength
, &status
);
121 dictData
= dictBuffer
;
122 dictDataLength
= destCap
;
123 if (U_SUCCESS(status
) && dictData
[0] == 0xFEFF) { // BOM? Skip it
131 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM
132 if (U_SUCCESS(status
) && dictData
== NULL
) {
133 dictData
= (const UChar
*) dictRawData
;
134 dictDataLength
= fileLength
/sizeof(UChar
);
137 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
138 // stopping at the first space.
139 if (U_SUCCESS(status
)) {
// Characters whose Line_Break property is BK, CR, LF or NL; used below to
// locate the end of each line.
140 UnicodeSet
breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status
);
// candidate marks the start of the word that is passed to addWord().
141 const UChar
*candidate
= dictData
;
143 UChar uc
= nextUChar(dictData
, dictDataLength
);
144 while (U_SUCCESS(status
) && uc
) {
145 while (uc
&& !u_isspace(uc
)) {
147 uc
= nextUChar(dictData
, dictDataLength
);
151 to
->addWord(candidate
, length
, status
);
154 // Find beginning of next line
155 // 1. Skip non-line-break characters
156 while (uc
&& !breaks
.contains(uc
)) {
157 uc
= nextUChar(dictData
, dictDataLength
);
159 // 2. Skip line break characters
160 while (uc
&& breaks
.contains(uc
)) {
161 uc
= nextUChar(dictData
, dictDataLength
);
164 // Prepare for next line
// dictData - 1 is presumably the position of the character currently held in
// uc (nextUChar having already stepped past it) -- TODO confirm against
// nextUChar.
165 candidate
= dictData
-1;
170 // Unmap the file if we mapped it
171 if ((intptr_t) dictRawData
!= -1) {
172 (void) munmap((void *)dictRawData
, (size_t) fileLength
);
175 // Delete any temporary buffer
176 delete [] dictBuffer
;
// Architecture tag inserted right after "com.apple.ICUUserDictionaryCache"
// when loadDictionaryMatcherFor builds the cache file name. Two alternative
// definitions appear here; presumably they are selected by byte order, given
// the "little endian" remark on the second one -- TODO confirm against the
// surrounding preprocessor condition.
180 static const char sArchType
[] = "";
182 static const char sArchType
[] = ".le"; // little endian
190 ICULanguageBreakFactory changes from
191 virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
193 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
194 and CompactTrieDictionary no longer exists. Need to work out new implementation below.
// Returns the dictionary matcher for `script` / `breakType`.
//
// The stock matcher is always fetched first from
// ICULanguageBreakFactory::loadDictionaryMatcherFor(). Inside the
// !TARGET_OS_EMBEDDED section, and only when that matcher is non-NULL, the
// code then:
//   1. globs the "Dictionaries" subdirectory of each user/local/network
//      Library directory;
//   2. globs "*-<scriptName>.txt" word lists inside those directories;
//   3. checks a cache file in the local Caches directory (named from
//      sArchType, the script name and getuid()) against the modification
//      times of those directories and files;
//   4. when needed, rebuilds the cache from the ICU dictionary plus the word
//      lists (via addDictFile) and renames a temp file over the cache;
//   5. maps the cache file and constructs a dictionary from it.
// The early exits visible here return icuDictMatcher unchanged.
//
// NOTE(review): the comments in this file state that CompactTrieDictionary no
// longer exists and that this implementation still needs updating, yet the
// code below uses that class and calls cloneMutable() on a
// DictionaryMatcher*. Confirm whether this section is actually compiled.
198 AppleLanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script
, int32_t breakType
) {
199 DictionaryMatcher
*icuDictMatcher
= ICULanguageBreakFactory::loadDictionaryMatcherFor(script
, breakType
);
200 #if !TARGET_OS_EMBEDDED
202 // need to update loadDictionaryMatcherFor implementation below
203 // We only look for a user dictionary if there is actually an ICU dictionary
204 if (icuDictMatcher
!= NULL
) {
205 UErrorCode status
= U_ZERO_ERROR
;
206 const char *scriptName
= uscript_getName(script
);
// NOTE(review): this 256-byte buffer is handed directly to
// NSGetNextSearchPathEnumeration() below, before strlcat() gets a chance to
// bound anything. The NSSystemDirectories documentation calls for a buffer of
// PATH_MAX bytes -- confirm that 256 cannot be overrun.
207 char path
[256]; // PATH_MAX is overkill in this case
209 char cacheTargetPath
[256];
212 struct stat cacheStat
;
213 struct stat dictStat
;
// Cleared below when the cache file is missing or has an unexpected mode;
// also consulted by the modification-time comparisons.
214 bool cacheGood
= true;
// Flags for the first glob() call; GLOB_APPEND is OR-ed in after the first
// successful call so that later results accumulate.
215 int globFlags
= (GLOB_NOESCAPE
|GLOB_NOSORT
|GLOB_TILDE
);
216 const CompactTrieDictionary
*cacheDict
= NULL
;
218 // Iterate the dictionary directories and accumulate in dirGlob
219 NSSearchPathEnumerationState state
= NSStartSearchPathEnumeration(NSLibraryDirectory
, (NSSearchPathDomainMask
) (NSUserDomainMask
|NSLocalDomainMask
|NSNetworkDomainMask
));
220 while ((state
= NSGetNextSearchPathEnumeration(state
, path
)) != 0) {
221 // First get the directory itself. We should never overflow, but use strlcat anyway
222 // to avoid a crash if we do.
223 strlcat(path
, "/Dictionaries", sizeof(path
));
224 if (!glob(path
, globFlags
, NULL
, &dirGlob
)) {
225 globFlags
|= GLOB_APPEND
;
229 // If there are no Dictionaries directories, ignore any cache file and return the ICU
230 // standard dictionary
231 // TODO: Delete the cache?
232 if (dirGlob
.gl_pathc
== 0) {
234 return icuDictMatcher
;
237 // See if there is a cache file already; get its mod time
238 // TODO: should we be using geteuid() here instead of getuid()?
239 state
= NSStartSearchPathEnumeration(NSCachesDirectory
, NSLocalDomainMask
);
240 state
= NSGetNextSearchPathEnumeration(state
, cachePath
); // Just use first one
241 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash
243 snprintf(cacheTargetPath
, sizeof(cacheTargetPath
), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath
, sArchType
, scriptName
, getuid());
// The mode must equal exactly S_IFREG|S_IRUSR|S_IWUSR (regular file, owner
// read/write, no other bits); a stat() failure or any other mode marks the
// cache as unusable.
244 if (stat(cacheTargetPath
, &cacheStat
) || cacheStat
.st_mode
!= (S_IFREG
|S_IRUSR
|S_IWUSR
)) {
245 cacheGood
= false; // No file or bad permissions or type
248 // Stat the dictionary folders, and glob the dictionary files
// Drop GLOB_APPEND so the first glob into fileGlob starts a fresh result
// list; it is set again after the first successful file glob.
249 globFlags
&= ~GLOB_APPEND
;
250 char **pathsp
= dirGlob
.gl_pathv
;
251 const char *dictpath
;
252 while ((dictpath
= *pathsp
++) != NULL
) {
253 // Stat the directory -- ignore if stat failure
254 if (!stat(dictpath
, &dictStat
)) {
255 // Glob the dictionaries in the directory
// Pattern: <directory>/*-<scriptName>.txt
256 snprintf(path
, sizeof(path
), "%s/*-%s.txt", dictpath
, scriptName
);
257 if (!glob(path
, globFlags
, NULL
, &fileGlob
)) {
258 globFlags
|= GLOB_APPEND
;
260 // If the directory has been modified after the cache file, we need to rebuild;
261 // a dictionary might have been deleted.
// True when the directory's mtime is later than the cache's, comparing
// seconds first and nanoseconds as the tie-breaker.
262 if (cacheGood
&& (dictStat
.st_mtimespec
.tv_sec
> cacheStat
.st_mtimespec
.tv_sec
|| (dictStat
.st_mtimespec
.tv_sec
== cacheStat
.st_mtimespec
.tv_sec
&& dictStat
.st_mtimespec
.tv_nsec
> cacheStat
.st_mtimespec
.tv_nsec
))) {
268 // No longer need the directory glob
271 // If there are no dictionaries, ignore the cache file and return the ICU dictionary
272 // TODO: Delete the cache?
273 if (fileGlob
.gl_pathc
== 0) {
275 return icuDictMatcher
;
278 // Now compare the last modified stamp for the cache against all the dictionaries
279 pathsp
= fileGlob
.gl_pathv
;
280 while (cacheGood
&& (dictpath
= *pathsp
++)) {
281 // Stat the dictionary -- ignore if stat failure
// Same seconds-then-nanoseconds comparison as for the directories above.
282 if (!stat(dictpath
, &dictStat
) && (dictStat
.st_mtimespec
.tv_sec
> cacheStat
.st_mtimespec
.tv_sec
|| (dictStat
.st_mtimespec
.tv_sec
== cacheStat
.st_mtimespec
.tv_sec
&& dictStat
.st_mtimespec
.tv_nsec
> cacheStat
.st_mtimespec
.tv_nsec
))) {
287 // Do we need to build the dictionary cache?
289 // Create a mutable dictionary from the ICU dictionary
290 MutableTrieDictionary
*sum
= icuDictMatcher
->cloneMutable(status
);
291 pathsp
= fileGlob
.gl_pathv
;
292 while (U_SUCCESS(status
) && (dictpath
= *pathsp
++)) {
293 // Add the contents of a file to the sum
// addDictFile() returns void and keeps its own local status, so a failure
// while reading one word list does not stop this loop.
294 addDictFile(sum
, dictpath
);
297 // Create a compact (read-only) dictionary
298 CompactTrieDictionary
compact(*sum
, status
);
301 if (U_SUCCESS(status
)) {
302 // Open a temp file to write out the cache
// cachePath is extended in place into a mkstemp() template, so from here on
// it names the temp file rather than the Caches directory.
303 strlcat(cachePath
, "/temp.XXXXXXXXXX", sizeof(cachePath
));
304 int temp
= mkstemp(cachePath
);
306 status
= U_FILE_ACCESS_ERROR
;
308 size_t dictSize
= compact
.dataSize();
// A single write() is attempted; any result other than the full size is
// treated as failure.
// NOTE(review): write() returns ssize_t, which is compared with a size_t here.
309 if (U_SUCCESS(status
) && write(temp
, compact
.data(), dictSize
) != dictSize
) {
310 status
= U_FILE_ACCESS_ERROR
;
312 // Rename the temp file to the cache. Note that race conditions here are
313 // fine, as the file system operations are atomic. If an outdated version wins
314 // over a newer version, it will get rebuilt at the next app launch due to the
315 // modification time checks above. We don't care that any given app launch gets
316 // the most up-to-date cache (impossible since we can't lock all the Dictionaries
317 // directories), only that the cache (eventually) reflects the current state of
318 // any user dictionaries. That will happen on the next app launch after changes
319 // to the user dictionaries quiesce.
320 if (U_SUCCESS(status
)) {
321 if (rename(cachePath
, cacheTargetPath
)) {
322 status
= U_FILE_ACCESS_ERROR
;
323 (void) unlink(cachePath
); // Clean up the temp file
332 // Done with dictionary paths; release memory allocated by glob()
335 // Map the cache and build the dictionary
336 if (U_SUCCESS(status
)) {
337 int cache
= open(cacheTargetPath
, O_RDONLY
, 0);
// (const void *) -1 is the "not mapped" marker tested after mmap() and before
// munmap() below.
339 const void *cacheData
= (const void *) -1;
341 status
= U_FILE_ACCESS_ERROR
;
343 if (U_SUCCESS(status
)) {
// Seeking to the end yields the cache size; the offset is then rewound to 0.
344 length
= lseek(cache
, 0, SEEK_END
);
345 (void) lseek(cache
, 0, SEEK_SET
);
346 if (length
< 0 || length
> PTRDIFF_MAX
) {
347 status
= U_FILE_ACCESS_ERROR
;
351 // Map the cache. Note: it is left mapped until process exit. This is the normal
352 // behavior anyway, so it shouldn't be an issue.
353 if (U_SUCCESS(status
)) {
354 cacheData
= mmap(0, (size_t) length
, PROT_READ
, MAP_SHARED
, cache
, 0);
355 if ((intptr_t)cacheData
== -1) {
356 status
= U_FILE_ACCESS_ERROR
;
359 // We can close the cache file now that it's mapped (or not)
363 // If all was successful, try to create the dictionary. The constructor will
364 // check the magic number for us.
365 if (U_SUCCESS(status
)) {
366 cacheDict
= new CompactTrieDictionary(cacheData
, status
);
// On failure the mapping is released again, but only if mmap() succeeded.
368 if (U_FAILURE(status
) && (intptr_t)cacheData
!= -1) {
370 (void) munmap((void *)cacheData
, (size_t) length
);
374 // If we were successful, free the ICU dictionary and return ours
375 if (U_SUCCESS(status
)) {
376 delete icuDictMatcher
;
385 return icuDictMatcher
;
390 #endif /* #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED */