]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2005-2011, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: ucasemap.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2005may06 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Case mapping service object and functions using it. | |
17 | */ | |
18 | ||
19 | #include "unicode/utypes.h" | |
20 | #include "unicode/brkiter.h" | |
21 | #include "unicode/ubrk.h" | |
22 | #include "unicode/uloc.h" | |
23 | #include "unicode/ustring.h" | |
24 | #include "unicode/ucasemap.h" | |
25 | #if !UCONFIG_NO_BREAK_ITERATION | |
26 | #include "unicode/utext.h" | |
27 | #endif | |
28 | #include "unicode/utf.h" | |
29 | #include "unicode/utf8.h" | |
30 | #include "unicode/utf16.h" | |
31 | #include "cmemory.h" | |
32 | #include "cstring.h" | |
33 | #include "ucase.h" | |
34 | #include "ustr_imp.h" | |
35 | ||
36 | U_NAMESPACE_USE | |
37 | ||
38 | /* UCaseMap service object -------------------------------------------------- */ | |
39 | ||
40 | U_CAPI UCaseMap * U_EXPORT2 | |
41 | ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { | |
42 | UCaseMap *csm; | |
43 | ||
44 | if(U_FAILURE(*pErrorCode)) { | |
45 | return NULL; | |
46 | } | |
47 | ||
48 | csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap)); | |
49 | if(csm==NULL) { | |
50 | return NULL; | |
51 | } | |
52 | uprv_memset(csm, 0, sizeof(UCaseMap)); | |
53 | ||
54 | csm->csp=ucase_getSingleton(); | |
55 | ucasemap_setLocale(csm, locale, pErrorCode); | |
56 | if(U_FAILURE(*pErrorCode)) { | |
57 | uprv_free(csm); | |
58 | return NULL; | |
59 | } | |
60 | ||
61 | csm->options=options; | |
62 | return csm; | |
63 | } | |
64 | ||
65 | U_CAPI void U_EXPORT2 | |
66 | ucasemap_close(UCaseMap *csm) { | |
67 | if(csm!=NULL) { | |
68 | #if !UCONFIG_NO_BREAK_ITERATION | |
69 | // Do not call ubrk_close() so that we do not depend on all of the BreakIterator code. | |
70 | delete reinterpret_cast<BreakIterator *>(csm->iter); | |
71 | #endif | |
72 | uprv_free(csm); | |
73 | } | |
74 | } | |
75 | ||
76 | U_CAPI const char * U_EXPORT2 | |
77 | ucasemap_getLocale(const UCaseMap *csm) { | |
78 | return csm->locale; | |
79 | } | |
80 | ||
81 | U_CAPI uint32_t U_EXPORT2 | |
82 | ucasemap_getOptions(const UCaseMap *csm) { | |
83 | return csm->options; | |
84 | } | |
85 | ||
86 | U_CAPI void U_EXPORT2 | |
87 | ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { | |
88 | int32_t length; | |
89 | ||
90 | if(U_FAILURE(*pErrorCode)) { | |
91 | return; | |
92 | } | |
93 | ||
94 | length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); | |
95 | if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) { | |
96 | *pErrorCode=U_ZERO_ERROR; | |
97 | /* we only really need the language code for case mappings */ | |
98 | length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); | |
99 | } | |
100 | if(length==sizeof(csm->locale)) { | |
101 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
102 | } | |
103 | csm->locCache=0; | |
104 | if(U_SUCCESS(*pErrorCode)) { | |
105 | ucase_getCaseLocale(csm->locale, &csm->locCache); | |
106 | } else { | |
107 | csm->locale[0]=0; | |
108 | } | |
109 | } | |
110 | ||
111 | U_CAPI void U_EXPORT2 | |
112 | ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode * /*pErrorCode*/) { | |
113 | csm->options=options; | |
114 | } | |
115 | ||
116 | /* UTF-8 string case mappings ----------------------------------------------- */ | |
117 | ||
118 | /* TODO(markus): Move to a new, separate utf8case.c file. */ | |
119 | ||
120 | /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ | |
121 | static inline int32_t | |
122 | appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, | |
123 | int32_t result, const UChar *s) { | |
124 | UChar32 c; | |
125 | int32_t length, destLength; | |
126 | UErrorCode errorCode; | |
127 | ||
128 | /* decode the result */ | |
129 | if(result<0) { | |
130 | /* (not) original code point */ | |
131 | c=~result; | |
132 | length=-1; | |
133 | } else if(result<=UCASE_MAX_STRING_LENGTH) { | |
134 | c=U_SENTINEL; | |
135 | length=result; | |
136 | } else { | |
137 | c=result; | |
138 | length=-1; | |
139 | } | |
140 | ||
141 | if(destIndex<destCapacity) { | |
142 | /* append the result */ | |
143 | if(length<0) { | |
144 | /* code point */ | |
145 | UBool isError=FALSE; | |
146 | U8_APPEND(dest, destIndex, destCapacity, c, isError); | |
147 | if(isError) { | |
148 | /* overflow, nothing written */ | |
149 | destIndex+=U8_LENGTH(c); | |
150 | } | |
151 | } else { | |
152 | /* string */ | |
153 | errorCode=U_ZERO_ERROR; | |
154 | u_strToUTF8( | |
155 | (char *)(dest+destIndex), destCapacity-destIndex, &destLength, | |
156 | s, length, | |
157 | &errorCode); | |
158 | destIndex+=destLength; | |
159 | /* we might have an overflow, but we know the actual length */ | |
160 | } | |
161 | } else { | |
162 | /* preflight */ | |
163 | if(length<0) { | |
164 | destIndex+=U8_LENGTH(c); | |
165 | } else { | |
166 | errorCode=U_ZERO_ERROR; | |
167 | u_strToUTF8( | |
168 | NULL, 0, &destLength, | |
169 | s, length, | |
170 | &errorCode); | |
171 | destIndex+=destLength; | |
172 | } | |
173 | } | |
174 | return destIndex; | |
175 | } | |
176 | ||
177 | static UChar32 U_CALLCONV | |
178 | utf8_caseContextIterator(void *context, int8_t dir) { | |
179 | UCaseContext *csc=(UCaseContext *)context; | |
180 | UChar32 c; | |
181 | ||
182 | if(dir<0) { | |
183 | /* reset for backward iteration */ | |
184 | csc->index=csc->cpStart; | |
185 | csc->dir=dir; | |
186 | } else if(dir>0) { | |
187 | /* reset for forward iteration */ | |
188 | csc->index=csc->cpLimit; | |
189 | csc->dir=dir; | |
190 | } else { | |
191 | /* continue current iteration direction */ | |
192 | dir=csc->dir; | |
193 | } | |
194 | ||
195 | if(dir<0) { | |
196 | if(csc->start<csc->index) { | |
197 | U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c); | |
198 | return c; | |
199 | } | |
200 | } else { | |
201 | if(csc->index<csc->limit) { | |
202 | U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c); | |
203 | return c; | |
204 | } | |
205 | } | |
206 | return U_SENTINEL; | |
207 | } | |
208 | ||
209 | /* | |
210 | * Case-maps [srcStart..srcLimit[ but takes | |
211 | * context [0..srcLength[ into account. | |
212 | */ | |
213 | static int32_t | |
214 | _caseMap(const UCaseMap *csm, UCaseMapFull *map, | |
215 | uint8_t *dest, int32_t destCapacity, | |
216 | const uint8_t *src, UCaseContext *csc, | |
217 | int32_t srcStart, int32_t srcLimit, | |
218 | UErrorCode *pErrorCode) { | |
219 | const UChar *s; | |
220 | UChar32 c, c2 = 0; | |
221 | int32_t srcIndex, destIndex; | |
222 | int32_t locCache; | |
223 | ||
224 | locCache=csm->locCache; | |
225 | ||
226 | /* case mapping loop */ | |
227 | srcIndex=srcStart; | |
228 | destIndex=0; | |
229 | while(srcIndex<srcLimit) { | |
230 | csc->cpStart=srcIndex; | |
231 | U8_NEXT(src, srcIndex, srcLimit, c); | |
232 | csc->cpLimit=srcIndex; | |
233 | if(c<0) { | |
234 | int32_t i=csc->cpStart; | |
235 | while(destIndex<destCapacity && i<srcIndex) { | |
236 | dest[destIndex++]=src[i++]; | |
237 | } | |
238 | continue; | |
239 | } | |
240 | c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache); | |
241 | if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) { | |
242 | /* fast path version of appendResult() for ASCII results */ | |
243 | dest[destIndex++]=(uint8_t)c2; | |
244 | } else { | |
245 | destIndex=appendResult(dest, destIndex, destCapacity, c, s); | |
246 | } | |
247 | } | |
248 | ||
249 | if(destIndex>destCapacity) { | |
250 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
251 | } | |
252 | return destIndex; | |
253 | } | |
254 | ||
255 | #if !UCONFIG_NO_BREAK_ITERATION | |
256 | ||
257 | U_CFUNC int32_t U_CALLCONV | |
258 | ucasemap_internalUTF8ToTitle(const UCaseMap *csm, | |
259 | uint8_t *dest, int32_t destCapacity, | |
260 | const uint8_t *src, int32_t srcLength, | |
261 | UErrorCode *pErrorCode) { | |
262 | const UChar *s; | |
263 | UChar32 c; | |
264 | int32_t prev, titleStart, titleLimit, idx, destIndex, length; | |
265 | UBool isFirstIndex; | |
266 | ||
267 | if(U_FAILURE(*pErrorCode)) { | |
268 | return 0; | |
269 | } | |
270 | ||
271 | // Use the C++ abstract base class to minimize dependencies. | |
272 | // TODO: Change UCaseMap.iter to store a BreakIterator directly. | |
273 | BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter); | |
274 | ||
275 | /* set up local variables */ | |
276 | int32_t locCache=csm->locCache; | |
277 | UCaseContext csc=UCASECONTEXT_INITIALIZER; | |
278 | csc.p=(void *)src; | |
279 | csc.limit=srcLength; | |
280 | destIndex=0; | |
281 | prev=0; | |
282 | isFirstIndex=TRUE; | |
283 | ||
284 | /* titlecasing loop */ | |
285 | while(prev<srcLength) { | |
286 | /* find next index where to titlecase */ | |
287 | if(isFirstIndex) { | |
288 | isFirstIndex=FALSE; | |
289 | idx=bi->first(); | |
290 | } else { | |
291 | idx=bi->next(); | |
292 | } | |
293 | if(idx==UBRK_DONE || idx>srcLength) { | |
294 | idx=srcLength; | |
295 | } | |
296 | ||
297 | /* | |
298 | * Unicode 4 & 5 section 3.13 Default Case Operations: | |
299 | * | |
300 | * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex | |
301 | * #29, "Text Boundaries." Between each pair of word boundaries, find the first | |
302 | * cased character F. If F exists, map F to default_title(F); then map each | |
303 | * subsequent character C to default_lower(C). | |
304 | * | |
305 | * In this implementation, segment [prev..index[ into 3 parts: | |
306 | * a) uncased characters (copy as-is) [prev..titleStart[ | |
307 | * b) first case letter (titlecase) [titleStart..titleLimit[ | |
308 | * c) subsequent characters (lowercase) [titleLimit..index[ | |
309 | */ | |
310 | if(prev<idx) { | |
311 | /* find and copy uncased characters [prev..titleStart[ */ | |
312 | titleStart=titleLimit=prev; | |
313 | U8_NEXT(src, titleLimit, idx, c); | |
314 | if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { | |
315 | /* Adjust the titlecasing index (titleStart) to the next cased character. */ | |
316 | for(;;) { | |
317 | titleStart=titleLimit; | |
318 | if(titleLimit==idx) { | |
319 | /* | |
320 | * only uncased characters in [prev..index[ | |
321 | * stop with titleStart==titleLimit==index | |
322 | */ | |
323 | break; | |
324 | } | |
325 | U8_NEXT(src, titleLimit, idx, c); | |
326 | if(UCASE_NONE!=ucase_getType(csm->csp, c)) { | |
327 | break; /* cased letter at [titleStart..titleLimit[ */ | |
328 | } | |
329 | } | |
330 | length=titleStart-prev; | |
331 | if(length>0) { | |
332 | if((destIndex+length)<=destCapacity) { | |
333 | uprv_memcpy(dest+destIndex, src+prev, length); | |
334 | } | |
335 | destIndex+=length; | |
336 | } | |
337 | } | |
338 | ||
339 | if(titleStart<titleLimit) { | |
340 | /* titlecase c which is from [titleStart..titleLimit[ */ | |
341 | csc.cpStart=titleStart; | |
342 | csc.cpLimit=titleLimit; | |
343 | c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache); | |
344 | destIndex=appendResult(dest, destIndex, destCapacity, c, s); | |
345 | ||
346 | /* Special case Dutch IJ titlecasing */ | |
347 | if ( titleStart+1 < idx && | |
348 | ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH && | |
349 | ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) && | |
350 | ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { | |
351 | c=0x004A; | |
352 | destIndex=appendResult(dest, destIndex, destCapacity, c, s); | |
353 | titleLimit++; | |
354 | } | |
355 | /* lowercase [titleLimit..index[ */ | |
356 | if(titleLimit<idx) { | |
357 | if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { | |
358 | /* Normal operation: Lowercase the rest of the word. */ | |
359 | destIndex+= | |
360 | _caseMap( | |
361 | csm, ucase_toFullLower, | |
362 | dest+destIndex, destCapacity-destIndex, | |
363 | src, &csc, | |
364 | titleLimit, idx, | |
365 | pErrorCode); | |
366 | } else { | |
367 | /* Optionally just copy the rest of the word unchanged. */ | |
368 | length=idx-titleLimit; | |
369 | if((destIndex+length)<=destCapacity) { | |
370 | uprv_memcpy(dest+destIndex, src+titleLimit, length); | |
371 | } | |
372 | destIndex+=length; | |
373 | } | |
374 | } | |
375 | } | |
376 | } | |
377 | ||
378 | prev=idx; | |
379 | } | |
380 | ||
381 | if(destIndex>destCapacity) { | |
382 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
383 | } | |
384 | return destIndex; | |
385 | } | |
386 | ||
387 | #endif | |
388 | ||
389 | static int32_t U_CALLCONV | |
390 | ucasemap_internalUTF8ToLower(const UCaseMap *csm, | |
391 | uint8_t *dest, int32_t destCapacity, | |
392 | const uint8_t *src, int32_t srcLength, | |
393 | UErrorCode *pErrorCode) { | |
394 | UCaseContext csc=UCASECONTEXT_INITIALIZER; | |
395 | csc.p=(void *)src; | |
396 | csc.limit=srcLength; | |
397 | return _caseMap( | |
398 | csm, ucase_toFullLower, | |
399 | dest, destCapacity, | |
400 | src, &csc, 0, srcLength, | |
401 | pErrorCode); | |
402 | } | |
403 | ||
404 | static int32_t U_CALLCONV | |
405 | ucasemap_internalUTF8ToUpper(const UCaseMap *csm, | |
406 | uint8_t *dest, int32_t destCapacity, | |
407 | const uint8_t *src, int32_t srcLength, | |
408 | UErrorCode *pErrorCode) { | |
409 | UCaseContext csc=UCASECONTEXT_INITIALIZER; | |
410 | csc.p=(void *)src; | |
411 | csc.limit=srcLength; | |
412 | return _caseMap( | |
413 | csm, ucase_toFullUpper, | |
414 | dest, destCapacity, | |
415 | src, &csc, 0, srcLength, | |
416 | pErrorCode); | |
417 | } | |
418 | ||
419 | static int32_t | |
420 | utf8_foldCase(const UCaseProps *csp, | |
421 | uint8_t *dest, int32_t destCapacity, | |
422 | const uint8_t *src, int32_t srcLength, | |
423 | uint32_t options, | |
424 | UErrorCode *pErrorCode) { | |
425 | int32_t srcIndex, destIndex; | |
426 | ||
427 | const UChar *s; | |
428 | UChar32 c, c2; | |
429 | int32_t start; | |
430 | ||
431 | /* case mapping loop */ | |
432 | srcIndex=destIndex=0; | |
433 | while(srcIndex<srcLength) { | |
434 | start=srcIndex; | |
435 | U8_NEXT(src, srcIndex, srcLength, c); | |
436 | if(c<0) { | |
437 | while(destIndex<destCapacity && start<srcIndex) { | |
438 | dest[destIndex++]=src[start++]; | |
439 | } | |
440 | continue; | |
441 | } | |
442 | c=ucase_toFullFolding(csp, c, &s, options); | |
443 | if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) { | |
444 | /* fast path version of appendResult() for ASCII results */ | |
445 | dest[destIndex++]=(uint8_t)c2; | |
446 | } else { | |
447 | destIndex=appendResult(dest, destIndex, destCapacity, c, s); | |
448 | } | |
449 | } | |
450 | ||
451 | if(destIndex>destCapacity) { | |
452 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
453 | } | |
454 | return destIndex; | |
455 | } | |
456 | ||
457 | static int32_t U_CALLCONV | |
458 | ucasemap_internalUTF8Fold(const UCaseMap *csm, | |
459 | uint8_t *dest, int32_t destCapacity, | |
460 | const uint8_t *src, int32_t srcLength, | |
461 | UErrorCode *pErrorCode) { | |
462 | return utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode); | |
463 | } | |
464 | ||
465 | U_CFUNC int32_t | |
466 | ucasemap_mapUTF8(const UCaseMap *csm, | |
467 | uint8_t *dest, int32_t destCapacity, | |
468 | const uint8_t *src, int32_t srcLength, | |
469 | UTF8CaseMapper *stringCaseMapper, | |
470 | UErrorCode *pErrorCode) { | |
471 | int32_t destLength; | |
472 | ||
473 | /* check argument values */ | |
474 | if(U_FAILURE(*pErrorCode)) { | |
475 | return 0; | |
476 | } | |
477 | if( destCapacity<0 || | |
478 | (dest==NULL && destCapacity>0) || | |
479 | src==NULL || | |
480 | srcLength<-1 | |
481 | ) { | |
482 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
483 | return 0; | |
484 | } | |
485 | ||
486 | /* get the string length */ | |
487 | if(srcLength==-1) { | |
488 | srcLength=(int32_t)uprv_strlen((const char *)src); | |
489 | } | |
490 | ||
491 | /* check for overlapping source and destination */ | |
492 | if( dest!=NULL && | |
493 | ((src>=dest && src<(dest+destCapacity)) || | |
494 | (dest>=src && dest<(src+srcLength))) | |
495 | ) { | |
496 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
497 | return 0; | |
498 | } | |
499 | ||
500 | destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, pErrorCode); | |
501 | return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode); | |
502 | } | |
503 | ||
504 | /* public API functions */ | |
505 | ||
506 | U_CAPI int32_t U_EXPORT2 | |
507 | ucasemap_utf8ToLower(const UCaseMap *csm, | |
508 | char *dest, int32_t destCapacity, | |
509 | const char *src, int32_t srcLength, | |
510 | UErrorCode *pErrorCode) { | |
511 | return ucasemap_mapUTF8(csm, | |
512 | (uint8_t *)dest, destCapacity, | |
513 | (const uint8_t *)src, srcLength, | |
514 | ucasemap_internalUTF8ToLower, pErrorCode); | |
515 | } | |
516 | ||
517 | U_CAPI int32_t U_EXPORT2 | |
518 | ucasemap_utf8ToUpper(const UCaseMap *csm, | |
519 | char *dest, int32_t destCapacity, | |
520 | const char *src, int32_t srcLength, | |
521 | UErrorCode *pErrorCode) { | |
522 | return ucasemap_mapUTF8(csm, | |
523 | (uint8_t *)dest, destCapacity, | |
524 | (const uint8_t *)src, srcLength, | |
525 | ucasemap_internalUTF8ToUpper, pErrorCode); | |
526 | } | |
527 | ||
528 | U_CAPI int32_t U_EXPORT2 | |
529 | ucasemap_utf8FoldCase(const UCaseMap *csm, | |
530 | char *dest, int32_t destCapacity, | |
531 | const char *src, int32_t srcLength, | |
532 | UErrorCode *pErrorCode) { | |
533 | return ucasemap_mapUTF8(csm, | |
534 | (uint8_t *)dest, destCapacity, | |
535 | (const uint8_t *)src, srcLength, | |
536 | ucasemap_internalUTF8Fold, pErrorCode); | |
537 | } |