ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucase.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
374ca955
A
3/*
4*******************************************************************************
5*
b331163b 6* Copyright (C) 2004-2014, International Business Machines
374ca955
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
4388f060 10* file name: ucase.cpp
f3c0d7a5 11* encoding: UTF-8
374ca955
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004aug30
16* created by: Markus W. Scherer
17*
18* Low-level Unicode character/string case mapping code.
19* Much code moved here (and modified) from uchar.c.
20*/
21
22#include "unicode/utypes.h"
4388f060 23#include "unicode/unistr.h"
374ca955
A
24#include "unicode/uset.h"
25#include "unicode/udata.h" /* UDataInfo */
4388f060 26#include "unicode/utf16.h"
374ca955
A
27#include "ucmndata.h" /* DataHeader */
28#include "udatamem.h"
29#include "umutex.h"
30#include "uassert.h"
31#include "cmemory.h"
729e4ab9 32#include "utrie2.h"
374ca955 33#include "ucase.h"
374ca955
A
34
/* Bundle of the case-property data that the functions in this file read via ucase_props_singleton. */
35struct UCaseProps {
36 UDataMemory *mem;
37 const int32_t *indexes;
38 const uint16_t *exceptions; /* exception words, addressed through GET_EXCEPTIONS() */
4388f060 39 const uint16_t *unfold; /* reverse case folding table; ucase_addStringCaseClosure() treats NULL as "no data" */
374ca955 40
729e4ab9 41 UTrie2 trie; /* 16-bit properties per code point, read with UTRIE2_GET16() */
374ca955
A
42 uint8_t formatVersion[4];
43};
44
4388f060
A
45/* ucase_props_data.h is machine-generated by gencase --csource */
46#define INCLUDED_FROM_UCASE_CPP
47#include "ucase_props_data.h"
73c04bcf 48
374ca955
A
49/* set of property starts for UnicodeSet ------------------------------------ */
50
51static UBool U_CALLCONV
4388f060 52_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
374ca955 53 /* add the start code point to the USet */
73c04bcf 54 const USetAdder *sa=(const USetAdder *)context;
374ca955
A
55 sa->add(sa->set, start);
56 return TRUE;
57}
58
46f4442e 59U_CFUNC void U_EXPORT2
f3c0d7a5 60ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
374ca955
A
61 if(U_FAILURE(*pErrorCode)) {
62 return;
63 }
64
65 /* add the start code point of each same-value range of the trie */
f3c0d7a5 66 utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
374ca955
A
67
68 /* add code points with hardcoded properties, plus the ones following them */
69
70 /* (none right now, see comment below) */
71
72 /*
73 * Omit code points with hardcoded specialcasing properties
74 * because we do not build property UnicodeSets for them right now.
75 */
76}
77
78/* data access primitives --------------------------------------------------- */
79
0f5d89e8
A
80U_CFUNC const UTrie2 * U_EXPORT2
81ucase_getTrie() {
82 return &ucase_props_singleton.trie;
83}
374ca955 84
/* Address of the exceptions data for props: (csp)->exceptions plus the index obtained by shifting props right by UCASE_EXC_SHIFT. */
0f5d89e8 85#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
374ca955
A
86
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 * The 256 entries are generated two bits at a time: each step appends the
 * bit counts 0, 1, 1, 2 of the next two bits to the count so far.
 */
#define UCASE_FLAGS_BITS2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_FLAGS_BITS4(n) UCASE_FLAGS_BITS2(n), UCASE_FLAGS_BITS2((n)+1), UCASE_FLAGS_BITS2((n)+1), UCASE_FLAGS_BITS2((n)+2)
#define UCASE_FLAGS_BITS6(n) UCASE_FLAGS_BITS4(n), UCASE_FLAGS_BITS4((n)+1), UCASE_FLAGS_BITS4((n)+1), UCASE_FLAGS_BITS4((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_FLAGS_BITS6(0), UCASE_FLAGS_BITS6(1), UCASE_FLAGS_BITS6(1), UCASE_FLAGS_BITS6(2)
};
#undef UCASE_FLAGS_BITS6
#undef UCASE_FLAGS_BITS4
#undef UCASE_FLAGS_BITS2
106
729e4ab9
A
/* HAS_SLOT: nonzero if bit idx is set in flags. */
107#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* SLOT_OFFSET: flagsOffset[] entry for the bits of flags below bit idx, i.e. how many lower-numbered bits are set. */
108#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
374ca955
A
109
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * If UCASE_EXC_DOUBLE_SLOTS is not set in excWord, each slot is one uint16_t;
 * otherwise each slot is two uint16_t units which are combined high unit first.
 *
 * The expansion is a single statement (do { ... } while(0)) so that
 * "GET_SLOT_VALUE(...);" can be used anywhere a statement is allowed,
 * including as the unbraced body of an if/else.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
128
129/* simple case mappings ----------------------------------------------------- */
130
131U_CAPI UChar32 U_EXPORT2
f3c0d7a5
A
132ucase_tolower(UChar32 c) {
133 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8
A
134 if(!UCASE_HAS_EXCEPTION(props)) {
135 if(UCASE_IS_UPPER_OR_TITLE(props)) {
73c04bcf 136 c+=UCASE_GET_DELTA(props);
374ca955
A
137 }
138 } else {
f3c0d7a5 139 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955 140 uint16_t excWord=*pe++;
0f5d89e8
A
141 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
142 int32_t delta;
143 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
144 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
145 }
374ca955
A
146 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
147 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
148 }
149 }
150 return c;
151}
152
153U_CAPI UChar32 U_EXPORT2
f3c0d7a5
A
154ucase_toupper(UChar32 c) {
155 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8 156 if(!UCASE_HAS_EXCEPTION(props)) {
73c04bcf
A
157 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
158 c+=UCASE_GET_DELTA(props);
374ca955
A
159 }
160 } else {
f3c0d7a5 161 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955 162 uint16_t excWord=*pe++;
0f5d89e8
A
163 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
164 int32_t delta;
165 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
166 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
167 }
374ca955
A
168 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
169 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
170 }
171 }
172 return c;
173}
174
175U_CAPI UChar32 U_EXPORT2
f3c0d7a5
A
176ucase_totitle(UChar32 c) {
177 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8 178 if(!UCASE_HAS_EXCEPTION(props)) {
73c04bcf
A
179 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
180 c+=UCASE_GET_DELTA(props);
374ca955
A
181 }
182 } else {
f3c0d7a5 183 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955 184 uint16_t excWord=*pe++;
0f5d89e8
A
185 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
186 int32_t delta;
187 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
188 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
189 }
729e4ab9 190 int32_t idx;
374ca955 191 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
729e4ab9 192 idx=UCASE_EXC_TITLE;
374ca955 193 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
729e4ab9 194 idx=UCASE_EXC_UPPER;
374ca955
A
195 } else {
196 return c;
197 }
729e4ab9 198 GET_SLOT_VALUE(excWord, idx, pe, c);
374ca955
A
199 }
200 return c;
201}
202
46f4442e
A
203static const UChar iDot[2] = { 0x69, 0x307 };
204static const UChar jDot[2] = { 0x6a, 0x307 };
205static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
206static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
207static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
208static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
209
210
211U_CFUNC void U_EXPORT2
f3c0d7a5 212ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
73c04bcf
A
213 uint16_t props;
214
215 /*
216 * Hardcode the case closure of i and its relatives and ignore the
217 * data file data for these characters.
218 * The Turkic dotless i and dotted I with their case mapping conditions
219 * and case folding option make the related characters behave specially.
220 * This code matches their closure behavior to their case folding behavior.
221 */
73c04bcf
A
222
223 switch(c) {
224 case 0x49:
225 /* regular i and I are in one equivalence class */
226 sa->add(sa->set, 0x69);
227 return;
228 case 0x69:
229 sa->add(sa->set, 0x49);
230 return;
231 case 0x130:
232 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
233 sa->addString(sa->set, iDot, 2);
234 return;
235 case 0x131:
236 /* dotless i is in a class by itself */
237 return;
238 default:
239 /* otherwise use the data file data */
240 break;
241 }
242
f3c0d7a5 243 props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8 244 if(!UCASE_HAS_EXCEPTION(props)) {
73c04bcf
A
245 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
246 /* add the one simple case mapping, no matter what type it is */
247 int32_t delta=UCASE_GET_DELTA(props);
248 if(delta!=0) {
249 sa->add(sa->set, c+delta);
250 }
251 }
252 } else {
253 /*
254 * c has exceptions, so there may be multiple simple and/or
255 * full case mappings. Add them all.
256 */
f3c0d7a5 257 const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
73c04bcf
A
258 const UChar *closure;
259 uint16_t excWord=*pe++;
729e4ab9 260 int32_t idx, closureLength, fullLength, length;
73c04bcf
A
261
262 pe0=pe;
263
264 /* add all simple case mappings */
729e4ab9
A
265 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
266 if(HAS_SLOT(excWord, idx)) {
73c04bcf 267 pe=pe0;
729e4ab9 268 GET_SLOT_VALUE(excWord, idx, pe, c);
73c04bcf
A
269 sa->add(sa->set, c);
270 }
271 }
0f5d89e8
A
272 if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
273 pe=pe0;
274 int32_t delta;
275 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
276 sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
277 }
73c04bcf
A
278
279 /* get the closure string pointer & length */
280 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
281 pe=pe0;
282 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
283 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
284 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
285 } else {
286 closureLength=0;
287 closure=NULL;
288 }
289
290 /* add the full case folding */
291 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
292 pe=pe0;
293 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
294
295 /* start of full case mapping strings */
296 ++pe;
297
298 fullLength&=0xffff; /* bits 16 and higher are reserved */
299
300 /* skip the lowercase result string */
301 pe+=fullLength&UCASE_FULL_LOWER;
302 fullLength>>=4;
303
304 /* add the full case folding string */
305 length=fullLength&0xf;
306 if(length!=0) {
307 sa->addString(sa->set, (const UChar *)pe, length);
308 pe+=length;
309 }
310
311 /* skip the uppercase and titlecase strings */
312 fullLength>>=4;
313 pe+=fullLength&0xf;
314 fullLength>>=4;
315 pe+=fullLength;
316
317 closure=(const UChar *)pe; /* behind full case mappings */
318 }
319
320 /* add each code point in the closure string */
729e4ab9
A
321 for(idx=0; idx<closureLength;) {
322 U16_NEXT_UNSAFE(closure, idx, c);
73c04bcf
A
323 sa->add(sa->set, c);
324 }
325 }
326}
327
328/*
329 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
330 * must be length>0 and max>0 and length<=max
331 */
4388f060 332static inline int32_t
73c04bcf
A
333strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
334 int32_t c1, c2;
335
336 max-=length; /* we require length<=max, so no need to decrement max in the loop */
337 do {
338 c1=*s++;
339 c2=*t++;
340 if(c2==0) {
341 return 1; /* reached the end of t but not of s */
342 }
343 c1-=c2;
344 if(c1!=0) {
345 return c1; /* return difference result */
346 }
347 } while(--length>0);
348 /* ends with length==0 */
349
350 if(max==0 || *t==0) {
351 return 0; /* equal to length of both strings */
352 } else {
353 return -max; /* return lengh difference */
354 }
355}
356
46f4442e 357U_CFUNC UBool U_EXPORT2
f3c0d7a5 358ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
73c04bcf
A
359 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
360
f3c0d7a5 361 if(ucase_props_singleton.unfold==NULL || s==NULL) {
73c04bcf
A
362 return FALSE; /* no reverse case folding data, or no string */
363 }
364 if(length<=1) {
365 /* the string is too short to find any match */
366 /*
367 * more precise would be:
368 * if(!u_strHasMoreChar32Than(s, length, 1))
369 * but this does not make much practical difference because
370 * a single supplementary code point would just not be found
371 */
372 return FALSE;
373 }
374
f3c0d7a5 375 const uint16_t *unfold=ucase_props_singleton.unfold;
73c04bcf
A
376 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
377 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
378 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
379 unfold+=unfoldRowWidth;
380
381 if(length>unfoldStringWidth) {
382 /* the string is too long to find any match */
383 return FALSE;
384 }
385
386 /* do a binary search for the string */
387 start=0;
388 limit=unfoldRows;
389 while(start<limit) {
390 i=(start+limit)/2;
4388f060 391 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
73c04bcf
A
392 result=strcmpMax(s, length, p, unfoldStringWidth);
393
394 if(result==0) {
395 /* found the string: add each code point, and its case closure */
396 UChar32 c;
397
398 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
399 U16_NEXT_UNSAFE(p, i, c);
400 sa->add(sa->set, c);
f3c0d7a5 401 ucase_addCaseClosure(c, sa);
73c04bcf
A
402 }
403 return TRUE;
404 } else if(result<0) {
405 limit=i;
406 } else /* result>0 */ {
407 start=i+1;
408 }
409 }
410
411 return FALSE; /* string not found */
412}
413
4388f060
A
414U_NAMESPACE_BEGIN
415
// Sets up iteration over the unfold table of ucase_props_singleton:
// reads the row count, row width and string width from the table's first row,
// starts at row 0 with the code point index just behind the string column,
// and then moves the table pointer past that first (header) row.
// NOTE(review): unlike ucase_addStringCaseClosure(), the unfold pointer is not checked for NULL here.
416FullCaseFoldingIterator::FullCaseFoldingIterator()
417 : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
418 unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
419 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
420 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
421 currentRow(0),
422 rowCpIndex(unfoldStringWidth) {
423 unfold+=unfoldRowWidth;
424}
425
426UChar32
427FullCaseFoldingIterator::next(UnicodeString &full) {
428 // Advance past the last-delivered code point.
429 const UChar *p=unfold+(currentRow*unfoldRowWidth);
430 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
431 ++currentRow;
432 p+=unfoldRowWidth;
433 rowCpIndex=unfoldStringWidth;
434 }
435 if(currentRow>=unfoldRows) { return U_SENTINEL; }
436 // Set "full" to the NUL-terminated string in the first unfold column.
437 int32_t length=unfoldStringWidth;
438 while(length>0 && p[length-1]==0) { --length; }
439 full.setTo(FALSE, p, length);
440 // Return the code point.
441 UChar32 c;
442 U16_NEXT_UNSAFE(p, rowCpIndex, c);
443 return c;
444}
445
0f5d89e8
A
446namespace LatinCase {
447
448const int8_t TO_LOWER_NORMAL[LIMIT] = {
449 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
450 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
451 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453
454 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
455 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
456 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458
459 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
460 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
461 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
462 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
463
464 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
465 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
466 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
467 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468
469 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
470 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
471 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
472 EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
473
474 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
475 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
476 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
477 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
478};
479
480const int8_t TO_LOWER_TR_LT[LIMIT] = {
481 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
482 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
483 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485
486 0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
487 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490
491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
495
496 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
497 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
498 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
499 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
500
501 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
502 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
503 1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
504 EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
505
506 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
507 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
508 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
509 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
510};
511
512const int8_t TO_UPPER_NORMAL[LIMIT] = {
513 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
514 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
515 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
516 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
517
518 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
519 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
520 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
521 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
522
523 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
524 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
526 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
527
528 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
529 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
530 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
531 -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
532
533 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
534 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
535 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
536 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
537
538 -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
539 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
540 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
541 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
542};
543
544const int8_t TO_UPPER_TR[LIMIT] = {
545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
546 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
548 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549
550 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552 0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
553 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
554
555 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
556 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
557 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
560 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
562 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
563 -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
564
565 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
566 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
567 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
568 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
569
570 -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
571 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
572 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
573 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
574};
575
576} // namespace LatinCase
577
4388f060
A
578U_NAMESPACE_END
579
374ca955
A
580/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
581U_CAPI int32_t U_EXPORT2
f3c0d7a5
A
582ucase_getType(UChar32 c) {
583 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
73c04bcf 584 return UCASE_GET_TYPE(props);
374ca955
A
585}
586
729e4ab9 587/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
374ca955 588U_CAPI int32_t U_EXPORT2
f3c0d7a5
A
589ucase_getTypeOrIgnorable(UChar32 c) {
590 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
4388f060 591 return UCASE_GET_TYPE_AND_IGNORABLE(props);
374ca955
A
592}
593
594/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
4388f060 595static inline int32_t
f3c0d7a5
A
596getDotType(UChar32 c) {
597 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8 598 if(!UCASE_HAS_EXCEPTION(props)) {
374ca955
A
599 return props&UCASE_DOT_MASK;
600 } else {
f3c0d7a5 601 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955
A
602 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
603 }
604}
605
606U_CAPI UBool U_EXPORT2
f3c0d7a5
A
607ucase_isSoftDotted(UChar32 c) {
608 return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
374ca955
A
609}
610
611U_CAPI UBool U_EXPORT2
f3c0d7a5
A
612ucase_isCaseSensitive(UChar32 c) {
613 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8
A
614 if(!UCASE_HAS_EXCEPTION(props)) {
615 return (UBool)((props&UCASE_SENSITIVE)!=0);
616 } else {
617 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
618 return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
619 }
374ca955
A
620}
621
374ca955
A
622/* string casing ------------------------------------------------------------ */
623
624/*
625 * These internal functions form the core of string case mappings.
626 * They map single code points to result code points or strings and take
627 * all necessary conditions (context, locale ID, options) into account.
628 *
629 * They do not iterate over the source or write to the destination
630 * so that the same functions are useful for non-standard string storage,
631 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
632 * For the same reason, the "surrounding text" context is passed in as a
633 * UCaseContextIterator which does not make any assumptions about
634 * the underlying storage.
635 *
636 * This section contains helper functions that check for conditions
637 * in the input text surrounding the current code point
638 * according to SpecialCasing.txt.
639 *
640 * Each helper function gets the index
641 * - after the current code point if it looks at following text
642 * - before the current code point if it looks at preceding text
643 *
644 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
645 *
646 * Final_Sigma
647 * C is preceded by a sequence consisting of
648 * a cased letter and a case-ignorable sequence,
649 * and C is not followed by a sequence consisting of
650 * an ignorable sequence and then a cased letter.
651 *
652 * More_Above
653 * C is followed by one or more characters of combining class 230 (ABOVE)
654 * in the combining character sequence.
655 *
656 * After_Soft_Dotted
657 * The last preceding character with combining class of zero before C
658 * was Soft_Dotted,
659 * and there is no intervening combining character class 230 (ABOVE).
660 *
661 * Before_Dot
662 * C is followed by combining dot above (U+0307).
663 * Any sequence of characters with a combining class that is neither 0 nor 230
664 * may intervene between the current character and the combining dot above.
665 *
666 * The erratum from 2002-10-31 adds the condition
667 *
668 * After_I
669 * The last preceding base character was an uppercase I, and there is no
670 * intervening combining character class 230 (ABOVE).
671 *
672 * (See Jitterbug 2344 and the comments on After_I below.)
673 *
674 * Helper definitions in Unicode 3.2 UAX 21:
675 *
676 * D1. A character C is defined to be cased
677 * if it meets any of the following criteria:
678 *
679 * - The general category of C is Titlecase Letter (Lt)
680 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
681 * - Given D = NFD(C), then it is not the case that:
682 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
683 * (This third criterion does not add any characters to the list
684 * for Unicode 3.2. Ignored.)
685 *
686 * D2. A character C is defined to be case-ignorable
687 * if it meets either of the following criteria:
688 *
689 * - The general category of C is
690 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
691 * Letter Modifier (Lm), or Symbol Modifier (Sk)
692 * - C is one of the following characters
693 * U+0027 APOSTROPHE
694 * U+00AD SOFT HYPHEN (SHY)
695 * U+2019 RIGHT SINGLE QUOTATION MARK
696 * (the preferred character for apostrophe)
697 *
698 * D3. A case-ignorable sequence is a sequence of
699 * zero or more case-ignorable characters.
700 */
701
/* is_<letter>(c): nonzero if the char c is that letter in lowercase or uppercase. */
46f4442e 702#define is_d(c) ((c)=='d' || (c)=='D')
374ca955
A
703#define is_e(c) ((c)=='e' || (c)=='E')
704#define is_i(c) ((c)=='i' || (c)=='I')
705#define is_l(c) ((c)=='l' || (c)=='L')
706#define is_r(c) ((c)=='r' || (c)=='R')
707#define is_t(c) ((c)=='t' || (c)=='T')
708#define is_u(c) ((c)=='u' || (c)=='U')
709#define is_z(c) ((c)=='z' || (c)=='Z')
710
711/* separator? */
/* is_sep(c): nonzero if c is '_', '-' or the terminating NUL. */
712#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
713
73c04bcf 714/**
374ca955
A
715 * Requires non-NULL locale ID but otherwise does the equivalent of
716 * checking for language codes as if uloc_getLanguage() were called:
717 * Accepts both 2- and 3-letter codes and accepts case variants.
718 */
73c04bcf 719U_CFUNC int32_t
f3c0d7a5 720ucase_getCaseLocale(const char *locale) {
374ca955
A
721 /*
722 * This function used to use uloc_getLanguage(), but the current code
723 * removes the dependency of this low-level code on uloc implementation code
724 * and is faster because not the whole locale ID has to be
725 * examined and copied/transformed.
726 *
727 * Because this code does not want to depend on uloc, the caller must
728 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
729 */
f3c0d7a5
A
730 char c=*locale++;
731 // Fastpath for English "en" which is often used for default (=root locale) case mappings,
732 // and for Chinese "zh": Very common but no special case mapping behavior.
733 // Then check lowercase vs. uppercase to reduce the number of comparisons
734 // for other locales without special behavior.
735 if(c=='e') {
736 /* el or ell? */
374ca955 737 c=*locale++;
f3c0d7a5 738 if(is_l(c)) {
374ca955 739 c=*locale++;
f3c0d7a5
A
740 if(is_l(c)) {
741 c=*locale;
742 }
374ca955 743 if(is_sep(c)) {
f3c0d7a5 744 return UCASE_LOC_GREEK;
374ca955
A
745 }
746 }
f3c0d7a5
A
747 // en, es, ... -> root
748 } else if(c=='z') {
749 return UCASE_LOC_ROOT;
750#if U_CHARSET_FAMILY==U_ASCII_FAMILY
751 } else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
752#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
753 } else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
754#else
755# error Unknown charset family!
756#endif
757 // lowercase c
758 if(c=='t') {
759 /* tr or tur? */
374ca955 760 c=*locale++;
f3c0d7a5
A
761 if(is_u(c)) {
762 c=*locale++;
763 }
764 if(is_r(c)) {
374ca955 765 c=*locale;
f3c0d7a5
A
766 if(is_sep(c)) {
767 return UCASE_LOC_TURKISH;
768 }
374ca955 769 }
f3c0d7a5
A
770 } else if(c=='a') {
771 /* az or aze? */
772 c=*locale++;
773 if(is_z(c)) {
774 c=*locale++;
775 if(is_e(c)) {
776 c=*locale;
777 }
778 if(is_sep(c)) {
779 return UCASE_LOC_TURKISH;
780 }
374ca955 781 }
f3c0d7a5
A
782 } else if(c=='l') {
783 /* lt or lit? */
374ca955 784 c=*locale++;
f3c0d7a5
A
785 if(is_i(c)) {
786 c=*locale++;
787 }
788 if(is_t(c)) {
789 c=*locale;
790 if(is_sep(c)) {
791 return UCASE_LOC_LITHUANIAN;
792 }
793 }
794 } else if(c=='n') {
795 /* nl or nld? */
796 c=*locale++;
797 if(is_l(c)) {
798 c=*locale++;
799 if(is_d(c)) {
800 c=*locale;
801 }
802 if(is_sep(c)) {
803 return UCASE_LOC_DUTCH;
804 }
46f4442e
A
805 }
806 }
f3c0d7a5
A
807 } else {
808 // uppercase c
809 // Same code as for lowercase c but also check for 'E'.
810 if(c=='T') {
811 /* tr or tur? */
46f4442e 812 c=*locale++;
f3c0d7a5
A
813 if(is_u(c)) {
814 c=*locale++;
815 }
816 if(is_r(c)) {
46f4442e 817 c=*locale;
f3c0d7a5
A
818 if(is_sep(c)) {
819 return UCASE_LOC_TURKISH;
820 }
46f4442e 821 }
f3c0d7a5
A
822 } else if(c=='A') {
823 /* az or aze? */
824 c=*locale++;
825 if(is_z(c)) {
826 c=*locale++;
827 if(is_e(c)) {
828 c=*locale;
829 }
830 if(is_sep(c)) {
831 return UCASE_LOC_TURKISH;
832 }
833 }
834 } else if(c=='L') {
835 /* lt or lit? */
836 c=*locale++;
837 if(is_i(c)) {
838 c=*locale++;
839 }
840 if(is_t(c)) {
841 c=*locale;
842 if(is_sep(c)) {
843 return UCASE_LOC_LITHUANIAN;
844 }
845 }
846 } else if(c=='E') {
847 /* el or ell? */
848 c=*locale++;
849 if(is_l(c)) {
850 c=*locale++;
851 if(is_l(c)) {
852 c=*locale;
853 }
854 if(is_sep(c)) {
855 return UCASE_LOC_GREEK;
856 }
857 }
858 } else if(c=='N') {
859 /* nl or nld? */
860 c=*locale++;
861 if(is_l(c)) {
862 c=*locale++;
863 if(is_d(c)) {
864 c=*locale;
865 }
866 if(is_sep(c)) {
867 return UCASE_LOC_DUTCH;
868 }
374ca955
A
869 }
870 }
871 }
f3c0d7a5 872 return UCASE_LOC_ROOT;
374ca955
A
873}
874
729e4ab9
A
875/*
876 * Is followed by
877 * {case-ignorable}* cased
878 * ?
879 * (dir determines looking forward/backward)
880 * If a character is case-ignorable, it is skipped regardless of whether
881 * it is also cased or not.
882 */
374ca955 883static UBool
f3c0d7a5 884isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
374ca955 885 UChar32 c;
374ca955
A
886
887 if(iter==NULL) {
888 return FALSE;
889 }
890
891 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
f3c0d7a5 892 int32_t type=ucase_getTypeOrIgnorable(c);
729e4ab9 893 if(type&4) {
374ca955 894 /* case-ignorable, continue with the loop */
729e4ab9
A
895 } else if(type!=UCASE_NONE) {
896 return TRUE; /* followed by cased letter */
374ca955 897 } else {
729e4ab9 898 return FALSE; /* uncased and not case-ignorable */
374ca955
A
899 }
900 }
901
902 return FALSE; /* not followed by cased letter */
903}
904
905/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
906static UBool
f3c0d7a5 907isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
374ca955
A
908 UChar32 c;
909 int32_t dotType;
910 int8_t dir;
911
912 if(iter==NULL) {
913 return FALSE;
914 }
915
916 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
f3c0d7a5 917 dotType=getDotType(c);
374ca955
A
918 if(dotType==UCASE_SOFT_DOTTED) {
919 return TRUE; /* preceded by TYPE_i */
920 } else if(dotType!=UCASE_OTHER_ACCENT) {
921 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
922 }
923 }
924
925 return FALSE; /* not preceded by TYPE_i */
926}
927
928/*
929 * See Jitterbug 2344:
930 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
931 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
932 * we made those releases compatible with Unicode 3.2 which had not fixed
933 * a related bug in SpecialCasing.txt.
934 *
935 * From the Jitterbug 2344 text:
936 * ... this bug is listed as a Unicode erratum
937 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
938 * <quote>
939 * There are two errors in SpecialCasing.txt.
940 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
941 * 2. An incorrect context definition. Correct as follows:
942 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
943 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
944 * ---
945 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
946 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
947 * where the context After_I is defined as:
948 * The last preceding base character was an uppercase I, and there is no
949 * intervening combining character class 230 (ABOVE).
950 * </quote>
951 *
952 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
953 *
954 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
955 * # This matches the behavior of the canonically equivalent I-dot_above
956 *
957 * See also the description in this place in older versions of uchar.c (revision 1.100).
958 *
959 * Markus W. Scherer 2003-feb-15
960 */
961
962/* Is preceded by base character 'I' with no intervening cc=230 ? */
963static UBool
f3c0d7a5 964isPrecededBy_I(UCaseContextIterator *iter, void *context) {
374ca955
A
965 UChar32 c;
966 int32_t dotType;
967 int8_t dir;
968
969 if(iter==NULL) {
970 return FALSE;
971 }
972
973 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
974 if(c==0x49) {
975 return TRUE; /* preceded by I */
976 }
f3c0d7a5 977 dotType=getDotType(c);
374ca955
A
978 if(dotType!=UCASE_OTHER_ACCENT) {
979 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
980 }
981 }
982
983 return FALSE; /* not preceded by I */
984}
985
986/* Is followed by one or more cc==230 ? */
987static UBool
f3c0d7a5 988isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
374ca955
A
989 UChar32 c;
990 int32_t dotType;
991 int8_t dir;
992
993 if(iter==NULL) {
994 return FALSE;
995 }
996
997 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
f3c0d7a5 998 dotType=getDotType(c);
374ca955
A
999 if(dotType==UCASE_ABOVE) {
1000 return TRUE; /* at least one cc==230 following */
1001 } else if(dotType!=UCASE_OTHER_ACCENT) {
1002 return FALSE; /* next base character, no more cc==230 following */
1003 }
1004 }
1005
1006 return FALSE; /* no more cc==230 following */
1007}
1008
1009/* Is followed by a dot above (without cc==230 in between) ? */
1010static UBool
f3c0d7a5 1011isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
374ca955
A
1012 UChar32 c;
1013 int32_t dotType;
1014 int8_t dir;
1015
1016 if(iter==NULL) {
1017 return FALSE;
1018 }
1019
1020 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1021 if(c==0x307) {
1022 return TRUE;
1023 }
f3c0d7a5 1024 dotType=getDotType(c);
374ca955
A
1025 if(dotType!=UCASE_OTHER_ACCENT) {
1026 return FALSE; /* next base character or cc==230 in between */
1027 }
1028 }
1029
1030 return FALSE; /* no dot above following */
1031}
1032
1033U_CAPI int32_t U_EXPORT2
f3c0d7a5 1034ucase_toFullLower(UChar32 c,
374ca955
A
1035 UCaseContextIterator *iter, void *context,
1036 const UChar **pString,
f3c0d7a5
A
1037 int32_t loc) {
1038 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1039 U_ASSERT(c >= 0);
729e4ab9 1040 UChar32 result=c;
f3c0d7a5 1041 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8
A
1042 if(!UCASE_HAS_EXCEPTION(props)) {
1043 if(UCASE_IS_UPPER_OR_TITLE(props)) {
73c04bcf 1044 result=c+UCASE_GET_DELTA(props);
374ca955
A
1045 }
1046 } else {
f3c0d7a5 1047 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
374ca955
A
1048 uint16_t excWord=*pe++;
1049 int32_t full;
1050
1051 pe2=pe;
1052
1053 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1054 /* use hardcoded conditions and mappings */
374ca955
A
1055
1056 /*
1057 * Test for conditional mappings first
1058 * (otherwise the unconditional default mappings are always taken),
1059 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1060 * then get the UnicodeData.txt mappings.
1061 */
46f4442e 1062 if( loc==UCASE_LOC_LITHUANIAN &&
374ca955
A
1063 /* base characters, find accents above */
1064 (((c==0x49 || c==0x4a || c==0x12e) &&
f3c0d7a5 1065 isFollowedByMoreAbove(iter, context)) ||
374ca955
A
1066 /* precomposed with accent above, no need to find one */
1067 (c==0xcc || c==0xcd || c==0x128))
1068 ) {
1069 /*
1070 # Lithuanian
1071
1072 # Lithuanian retains the dot in a lowercase i when followed by accents.
1073
1074 # Introduce an explicit dot above when lowercasing capital I's and J's
1075 # whenever there are more accents above.
1076 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1077
1078 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1079 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1080 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1081 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1082 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1083 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1084 */
1085 switch(c) {
1086 case 0x49: /* LATIN CAPITAL LETTER I */
1087 *pString=iDot;
1088 return 2;
1089 case 0x4a: /* LATIN CAPITAL LETTER J */
1090 *pString=jDot;
1091 return 2;
1092 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1093 *pString=iOgonekDot;
1094 return 2;
1095 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1096 *pString=iDotGrave;
1097 return 3;
1098 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1099 *pString=iDotAcute;
1100 return 3;
1101 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1102 *pString=iDotTilde;
1103 return 3;
1104 default:
1105 return 0; /* will not occur */
1106 }
1107 /* # Turkish and Azeri */
46f4442e 1108 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
374ca955
A
1109 /*
1110 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1111 # The following rules handle those cases.
1112
1113 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1114 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1115 */
1116 return 0x69;
f3c0d7a5 1117 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
374ca955
A
1118 /*
1119 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1120 # This matches the behavior of the canonically equivalent I-dot_above
1121
1122 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1123 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1124 */
0f5d89e8 1125 *pString=nullptr;
374ca955 1126 return 0; /* remove the dot (continue without output) */
f3c0d7a5 1127 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
374ca955
A
1128 /*
1129 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1130
1131 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1132 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1133 */
1134 return 0x131;
1135 } else if(c==0x130) {
1136 /*
1137 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1138
1139 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1140 */
1141 *pString=iDot;
1142 return 2;
1143 } else if( c==0x3a3 &&
f3c0d7a5
A
1144 !isFollowedByCasedLetter(iter, context, 1) &&
1145 isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
374ca955
A
1146 ) {
1147 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1148 /*
1149 # Special case for final form of sigma
1150
1151 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1152 */
1153 return 0x3c2; /* greek small final sigma */
1154 } else {
1155 /* no known conditional special case mapping, use a normal mapping */
1156 }
1157 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1158 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1159 full&=UCASE_FULL_LOWER;
1160 if(full!=0) {
1161 /* set the output pointer to the lowercase mapping */
4388f060 1162 *pString=reinterpret_cast<const UChar *>(pe+1);
374ca955
A
1163
1164 /* return the string length */
1165 return full;
1166 }
1167 }
1168
0f5d89e8
A
1169 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1170 int32_t delta;
1171 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1172 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1173 }
374ca955
A
1174 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1175 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1176 }
1177 }
1178
1179 return (result==c) ? ~result : result;
1180}
1181
1182/* internal */
1183static int32_t
f3c0d7a5 1184toUpperOrTitle(UChar32 c,
374ca955
A
1185 UCaseContextIterator *iter, void *context,
1186 const UChar **pString,
f3c0d7a5 1187 int32_t loc,
374ca955 1188 UBool upperNotTitle) {
f3c0d7a5
A
1189 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1190 U_ASSERT(c >= 0);
729e4ab9 1191 UChar32 result=c;
f3c0d7a5 1192 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8 1193 if(!UCASE_HAS_EXCEPTION(props)) {
73c04bcf
A
1194 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1195 result=c+UCASE_GET_DELTA(props);
374ca955
A
1196 }
1197 } else {
f3c0d7a5 1198 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
374ca955 1199 uint16_t excWord=*pe++;
729e4ab9 1200 int32_t full, idx;
374ca955
A
1201
1202 pe2=pe;
1203
1204 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1205 /* use hardcoded conditions and mappings */
46f4442e 1206 if(loc==UCASE_LOC_TURKISH && c==0x69) {
374ca955
A
1207 /*
1208 # Turkish and Azeri
1209
1210 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1211 # The following rules handle those cases.
1212
1213 # When uppercasing, i turns into a dotted capital I
1214
1215 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1216 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1217 */
1218 return 0x130;
f3c0d7a5 1219 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
374ca955
A
1220 /*
1221 # Lithuanian
1222
1223 # Lithuanian retains the dot in a lowercase i when followed by accents.
1224
1225 # Remove DOT ABOVE after "i" with upper or titlecase
1226
1227 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1228 */
0f5d89e8 1229 *pString=nullptr;
374ca955
A
1230 return 0; /* remove the dot (continue without output) */
1231 } else {
1232 /* no known conditional special case mapping, use a normal mapping */
1233 }
1234 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1235 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1236
1237 /* start of full case mapping strings */
1238 ++pe;
1239
1240 /* skip the lowercase and case-folding result strings */
1241 pe+=full&UCASE_FULL_LOWER;
1242 full>>=4;
1243 pe+=full&0xf;
1244 full>>=4;
1245
1246 if(upperNotTitle) {
1247 full&=0xf;
1248 } else {
1249 /* skip the uppercase result string */
1250 pe+=full&0xf;
1251 full=(full>>4)&0xf;
1252 }
1253
1254 if(full!=0) {
1255 /* set the output pointer to the result string */
4388f060 1256 *pString=reinterpret_cast<const UChar *>(pe);
374ca955
A
1257
1258 /* return the string length */
1259 return full;
1260 }
1261 }
1262
0f5d89e8
A
1263 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1264 int32_t delta;
1265 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1266 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1267 }
374ca955 1268 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
729e4ab9 1269 idx=UCASE_EXC_TITLE;
374ca955
A
1270 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1271 /* here, titlecase is same as uppercase */
729e4ab9 1272 idx=UCASE_EXC_UPPER;
374ca955
A
1273 } else {
1274 return ~c;
1275 }
729e4ab9 1276 GET_SLOT_VALUE(excWord, idx, pe2, result);
374ca955
A
1277 }
1278
1279 return (result==c) ? ~result : result;
1280}
1281
1282U_CAPI int32_t U_EXPORT2
f3c0d7a5 1283ucase_toFullUpper(UChar32 c,
374ca955
A
1284 UCaseContextIterator *iter, void *context,
1285 const UChar **pString,
f3c0d7a5
A
1286 int32_t caseLocale) {
1287 return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
374ca955
A
1288}
1289
1290U_CAPI int32_t U_EXPORT2
f3c0d7a5 1291ucase_toFullTitle(UChar32 c,
374ca955
A
1292 UCaseContextIterator *iter, void *context,
1293 const UChar **pString,
f3c0d7a5
A
1294 int32_t caseLocale) {
1295 return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
374ca955
A
1296}
1297
1298/* case folding ------------------------------------------------------------- */
1299
1300/*
1301 * Case folding is similar to lowercasing.
1302 * The result may be a simple mapping, i.e., a single code point, or
1303 * a full mapping, i.e., a string.
1304 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1305 * then only the lowercase mapping is stored.
1306 *
1307 * Some special cases are hardcoded because their conditions cannot be
1308 * parsed and processed from CaseFolding.txt.
1309 *
1310 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1311
1312# C: common case folding, common mappings shared by both simple and full mappings.
1313# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1314# S: simple case folding, mappings to single characters where different from F.
1315# T: special case for uppercase I and dotted uppercase I
1316# - For non-Turkic languages, this mapping is normally not used.
1317# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1318#
1319# Usage:
1320# A. To do a simple case folding, use the mappings with status C + S.
1321# B. To do a full case folding, use the mappings with status C + F.
1322#
1323# The mappings with status T can be used or omitted depending on the desired case-folding
1324# behavior. (The default option is to exclude them.)
1325
1326 * Unicode 3.2 has 'T' mappings as follows:
1327
13280049; T; 0131; # LATIN CAPITAL LETTER I
13290130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1330
1331 * while the default mappings for these code points are:
1332
13330049; C; 0069; # LATIN CAPITAL LETTER I
13340130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1335
73c04bcf 1336 * U+0130 has no simple case folding (simple-case-folds to itself).
374ca955
A
1337 */
1338
1339/* return the simple case folding mapping for c */
1340U_CAPI UChar32 U_EXPORT2
f3c0d7a5
A
1341ucase_fold(UChar32 c, uint32_t options) {
1342 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8
A
1343 if(!UCASE_HAS_EXCEPTION(props)) {
1344 if(UCASE_IS_UPPER_OR_TITLE(props)) {
73c04bcf 1345 c+=UCASE_GET_DELTA(props);
374ca955
A
1346 }
1347 } else {
f3c0d7a5 1348 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955 1349 uint16_t excWord=*pe++;
729e4ab9 1350 int32_t idx;
374ca955
A
1351 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1352 /* special case folding mappings, hardcoded */
1353 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1354 /* default mappings */
1355 if(c==0x49) {
1356 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1357 return 0x69;
1358 } else if(c==0x130) {
73c04bcf
A
1359 /* no simple case folding for U+0130 */
1360 return c;
374ca955
A
1361 }
1362 } else {
1363 /* Turkic mappings */
1364 if(c==0x49) {
1365 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1366 return 0x131;
1367 } else if(c==0x130) {
1368 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1369 return 0x69;
1370 }
1371 }
1372 }
0f5d89e8
A
1373 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1374 return c;
1375 }
1376 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1377 int32_t delta;
1378 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1379 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1380 }
374ca955 1381 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
729e4ab9 1382 idx=UCASE_EXC_FOLD;
374ca955 1383 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
729e4ab9 1384 idx=UCASE_EXC_LOWER;
374ca955
A
1385 } else {
1386 return c;
1387 }
729e4ab9 1388 GET_SLOT_VALUE(excWord, idx, pe, c);
374ca955
A
1389 }
1390 return c;
1391}
1392
1393/*
1394 * Issue for canonical caseless match (UAX #21):
1395 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1396 * canonical equivalence, unlike default-option casefolding.
1397 * For example, I-grave and I + grave fold to strings that are not canonically
1398 * equivalent.
1399 * For more details, see the comment in unorm_compare() in unorm.cpp
1400 * and the intermediate prototype changes for Jitterbug 2021.
1401 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1402 *
1403 * This did not get fixed because it appears that it is not possible to fix
1404 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1405 * together in a way that they still fold to common result strings.
1406 */
1407
1408U_CAPI int32_t U_EXPORT2
f3c0d7a5 1409ucase_toFullFolding(UChar32 c,
374ca955 1410 const UChar **pString,
f3c0d7a5
A
1411 uint32_t options) {
1412 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1413 U_ASSERT(c >= 0);
729e4ab9 1414 UChar32 result=c;
f3c0d7a5 1415 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
0f5d89e8
A
1416 if(!UCASE_HAS_EXCEPTION(props)) {
1417 if(UCASE_IS_UPPER_OR_TITLE(props)) {
73c04bcf 1418 result=c+UCASE_GET_DELTA(props);
374ca955
A
1419 }
1420 } else {
f3c0d7a5 1421 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
374ca955 1422 uint16_t excWord=*pe++;
729e4ab9 1423 int32_t full, idx;
374ca955
A
1424
1425 pe2=pe;
1426
1427 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1428 /* use hardcoded conditions and mappings */
1429 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1430 /* default mappings */
1431 if(c==0x49) {
1432 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1433 return 0x69;
1434 } else if(c==0x130) {
1435 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1436 *pString=iDot;
1437 return 2;
1438 }
1439 } else {
1440 /* Turkic mappings */
1441 if(c==0x49) {
1442 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1443 return 0x131;
1444 } else if(c==0x130) {
1445 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1446 return 0x69;
1447 }
1448 }
1449 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1450 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1451
1452 /* start of full case mapping strings */
1453 ++pe;
1454
1455 /* skip the lowercase result string */
1456 pe+=full&UCASE_FULL_LOWER;
1457 full=(full>>4)&0xf;
1458
1459 if(full!=0) {
1460 /* set the output pointer to the result string */
4388f060 1461 *pString=reinterpret_cast<const UChar *>(pe);
374ca955
A
1462
1463 /* return the string length */
1464 return full;
1465 }
1466 }
1467
0f5d89e8
A
1468 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1469 return ~c;
1470 }
1471 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1472 int32_t delta;
1473 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1474 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1475 }
374ca955 1476 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
729e4ab9 1477 idx=UCASE_EXC_FOLD;
374ca955 1478 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
729e4ab9 1479 idx=UCASE_EXC_LOWER;
374ca955
A
1480 } else {
1481 return ~c;
1482 }
729e4ab9 1483 GET_SLOT_VALUE(excWord, idx, pe2, result);
374ca955
A
1484 }
1485
1486 return (result==c) ? ~result : result;
1487}
73c04bcf
A
1488
1489/* case mapping properties API ---------------------------------------------- */
1490
73c04bcf
A
1491/* public API (see uchar.h) */
1492
1493U_CAPI UBool U_EXPORT2
1494u_isULowercase(UChar32 c) {
f3c0d7a5 1495 return (UBool)(UCASE_LOWER==ucase_getType(c));
73c04bcf
A
1496}
1497
1498U_CAPI UBool U_EXPORT2
1499u_isUUppercase(UChar32 c) {
f3c0d7a5 1500 return (UBool)(UCASE_UPPER==ucase_getType(c));
73c04bcf
A
1501}
1502
1503/* Transforms the Unicode character to its lower case equivalent.*/
1504U_CAPI UChar32 U_EXPORT2
1505u_tolower(UChar32 c) {
f3c0d7a5 1506 return ucase_tolower(c);
73c04bcf
A
1507}
1508
1509/* Transforms the Unicode character to its upper case equivalent.*/
1510U_CAPI UChar32 U_EXPORT2
1511u_toupper(UChar32 c) {
f3c0d7a5 1512 return ucase_toupper(c);
73c04bcf
A
1513}
1514
1515/* Transforms the Unicode character to its title case equivalent.*/
1516U_CAPI UChar32 U_EXPORT2
1517u_totitle(UChar32 c) {
f3c0d7a5 1518 return ucase_totitle(c);
73c04bcf
A
1519}
1520
1521/* return the simple case folding mapping for c */
1522U_CAPI UChar32 U_EXPORT2
1523u_foldCase(UChar32 c, uint32_t options) {
f3c0d7a5 1524 return ucase_fold(c, options);
73c04bcf
A
1525}
1526
1527U_CFUNC int32_t U_EXPORT2
1528ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1529 /* case mapping properties */
729e4ab9 1530 const UChar *resultString;
73c04bcf
A
1531 switch(which) {
1532 case UCHAR_LOWERCASE:
f3c0d7a5 1533 return (UBool)(UCASE_LOWER==ucase_getType(c));
73c04bcf 1534 case UCHAR_UPPERCASE:
f3c0d7a5 1535 return (UBool)(UCASE_UPPER==ucase_getType(c));
73c04bcf 1536 case UCHAR_SOFT_DOTTED:
f3c0d7a5 1537 return ucase_isSoftDotted(c);
73c04bcf 1538 case UCHAR_CASE_SENSITIVE:
f3c0d7a5 1539 return ucase_isCaseSensitive(c);
729e4ab9 1540 case UCHAR_CASED:
f3c0d7a5 1541 return (UBool)(UCASE_NONE!=ucase_getType(c));
729e4ab9 1542 case UCHAR_CASE_IGNORABLE:
f3c0d7a5 1543 return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
729e4ab9
A
1544 /*
1545 * Note: The following Changes_When_Xyz are defined as testing whether
1546 * the NFD form of the input changes when Xyz-case-mapped.
1547 * However, this simpler implementation of these properties,
1548 * ignoring NFD, passes the tests.
1549 * The implementation needs to be changed if the tests start failing.
1550 * When that happens, optimizations should be used to work with the
1551 * per-single-code point ucase_toFullXyz() functions unless
1552 * the NFD form has more than one code point,
1553 * and the property starts set needs to be the union of the
1554 * start sets for normalization and case mappings.
1555 */
1556 case UCHAR_CHANGES_WHEN_LOWERCASED:
f3c0d7a5 1557 return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
729e4ab9 1558 case UCHAR_CHANGES_WHEN_UPPERCASED:
f3c0d7a5 1559 return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
729e4ab9 1560 case UCHAR_CHANGES_WHEN_TITLECASED:
f3c0d7a5 1561 return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
729e4ab9
A
1562 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1563 case UCHAR_CHANGES_WHEN_CASEMAPPED:
729e4ab9 1564 return (UBool)(
f3c0d7a5
A
1565 ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1566 ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1567 ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
73c04bcf
A
1568 default:
1569 return FALSE;
1570 }
1571}