1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
5 // created: 2017feb08 Markus W. Scherer
7 #ifndef __UCASEMAP_IMP_H__
8 #define __UCASEMAP_IMP_H__
10 #include "unicode/utypes.h"
11 #include "unicode/ucasemap.h"
12 #include "unicode/uchar.h"
/**
 * Bit mask for the titlecasing iterator options bit field.
 * The mask 0xe0 covers three adjacent bits (bits 5..7), i.e. a field with 8 possible values.
 * Currently only 3 out of 8 values are used:
 * 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
 * See stringoptions.h.
 */
#define U_TITLECASE_ITERATOR_MASK 0xe0
/**
 * Bit mask for the titlecasing index adjustment options bit set.
 * The mask 0x600 covers exactly two bits (0x200 and 0x400).
 * Currently two bits are defined:
 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
 * See stringoptions.h.
 */
#define U_TITLECASE_ADJUSTMENT_MASK 0x600
/**
 * Internal API, used by u_strcasecmp() etc.
 * Compares strings case-insensitively,
 * in code point order or code unit order.
 */
// Declaration of u_strcmpFold. The visible parameters are two UChar strings,
// each paired with an explicit int32_t length, plus a UErrorCode pointer.
// NOTE(review): no return type or linkage specifier is visible in this extract,
// and the embedded listing numbers skip 41, so a line (possibly another
// parameter) appears to have been dropped. Confirm against the original header.
39 u_strcmpFold(const UChar
*s1
, int32_t length1
,
40 const UChar
*s2
, int32_t length2
,
42 UErrorCode
*pErrorCode
);
/**
 * Internal API, used for detecting the length of
 * a shared prefix case-insensitively.
 * @param s1            input string 1
 * @param length1       length of string 1, or -1 (NUL-terminated)
 * @param s2            input string 2
 * @param length2       length of string 2, or -1 (NUL-terminated)
 * @param options       compare options
 * @param matchLen1     (output) length of partial prefix match in s1
 * @param matchLen2     (output) length of partial prefix match in s2
 * @param pErrorCode    receives error status
 */
// Declaration of u_caseInsensitivePrefixMatch. The match lengths are reported
// through the two int32_t output pointers and the status through pErrorCode.
// NOTE(review): the parameter documentation above lists an `options` parameter
// that is not present in the visible parameter list, and the embedded listing
// numbers skip 59. The return type is not visible either. Confirm against the
// original header before relying on this signature.
57 u_caseInsensitivePrefixMatch(const UChar
*s1
, int32_t length1
,
58 const UChar
*s2
, int32_t length2
,
60 int32_t *matchLen1
, int32_t *matchLen2
,
61 UErrorCode
*pErrorCode
);
// Forward declarations, so the declarations below can use these classes by
// pointer or reference; the trailing comments name the headers that define them.
// NOTE(review): later declarations spell the type as icu::BreakIterator, so these
// presumably sit inside the icu namespace, but no namespace opener is visible in
// this extract (the embedded listing numbers skip 62-66 and 68). Confirm.
67 class BreakIterator
; // unicode/brkiter.h
69 class Locale
; // unicode/locid.h
71 /** Returns TRUE if the options are valid. Otherwise FALSE, and sets an error. */
72 inline UBool
ustrcase_checkTitleAdjustmentOptions(uint32_t options
, UErrorCode
&errorCode
) {
73 if (U_FAILURE(errorCode
)) { return FALSE
; }
74 if ((options
& U_TITLECASE_ADJUSTMENT_MASK
) == U_TITLECASE_ADJUSTMENT_MASK
) {
75 // Both options together.
76 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
82 inline UBool
ustrcase_isLNS(UChar32 c
) {
83 // Letter, number, symbol,
84 // or a private use code point because those are typically used as letters or numbers.
85 // Consider modifier letters only if they are cased.
86 const uint32_t LNS
= (U_GC_L_MASK
|U_GC_N_MASK
|U_GC_S_MASK
|U_GC_CO_MASK
) & ~U_GC_LM_MASK
;
87 int gc
= u_charType(c
);
88 return (U_MASK(gc
) & LNS
) != 0 || (gc
== U_MODIFIER_LETTER
&& ucase_getType(c
) != UCASE_NONE
);
91 #if !UCONFIG_NO_BREAK_ITERATION
93 /** Returns nullptr if error. Pass in either locale or locID, not both. */
// Declaration of ustrcase_getTitleBreakIterator, which returns a BreakIterator
// pointer (nullptr on error, per the note above). The caller supplies either
// `locale` or `locID`, the option bits, and optionally an existing iterator.
// NOTE(review): `ownedIter` is a LocalPointer<BreakIterator> reference; it
// presumably receives ownership of an iterator created by this call. Verify
// against the implementation.
// NOTE(review): this sits under `#if !UCONFIG_NO_BREAK_ITERATION`, but the matching
// #endif is not visible in this extract (embedded listing numbers skip 98-102).
95 BreakIterator
*ustrcase_getTitleBreakIterator(
96 const Locale
*locale
, const char *locID
, uint32_t options
, BreakIterator
*iter
,
97 LocalPointer
<BreakIterator
> &ownedIter
, UErrorCode
&errorCode
);
103 #include "unicode/unistr.h" // for UStringCaseMapper
/*
 * Internal string casing functions implementing
 * ustring.h/ustrcase.cpp and UnicodeString case mapping functions.
 */
// UCaseMap: struct derived from icu::UMemory. The visible members are a
// constructor taking a locale ID, option bits and an error-code pointer, and,
// when break iteration is compiled in, an owned icu::BreakIterator pointer.
// NOTE(review): the definition is truncated in this extract. The #endif for the
// conditional member, any further members, and the closing "};" are not
// visible (embedded listing numbers skip 113-114 and 117-122). Do not treat
// this as the complete layout.
110 struct UCaseMap
: public icu::UMemory
{
111 /** Implements most of ucasemap_open(). */
112 UCaseMap(const char *localeID
, uint32_t opts
, UErrorCode
*pErrorCode
);
115 #if !UCONFIG_NO_BREAK_ITERATION
116 icu::BreakIterator
*iter
; /* We adopt the iterator, so we own it. */
/*
 * Helper macros for declaring and passing the optional BreakIterator argument.
 * Each non-empty expansion ends in a comma so the macro can be dropped straight
 * into a parameter or argument list:
 *   _PARAM   named parameter in a declaration
 *   _UNUSED  unnamed parameter in a declaration
 *   (plain)  forwards the parameter named `iter` in a call
 *   _NULL    passes a null iterator in a call
 * When UCONFIG_NO_BREAK_ITERATION is set, all four expand to nothing.
 */
#if UCONFIG_NO_BREAK_ITERATION
#   define UCASEMAP_BREAK_ITERATOR_PARAM
#   define UCASEMAP_BREAK_ITERATOR_UNUSED
#   define UCASEMAP_BREAK_ITERATOR
#   define UCASEMAP_BREAK_ITERATOR_NULL
#else
#   define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
#   define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
#   define UCASEMAP_BREAK_ITERATOR iter,
#   define UCASEMAP_BREAK_ITERATOR_NULL NULL,
#endif
// Declaration of ustrcase_getCaseLocale, taking a locale ID as a C string.
// NOTE(review): the return type is not visible in this extract. The case-mapping
// functions below take an `int32_t caseLocale`, which is presumably what this
// returns. Confirm against the original header.
136 ustrcase_getCaseLocale(const char *locale
);
138 // TODO: swap src / dest if approved for new public api
139 /** Implements UStringCaseMapper. */
// Lowercasing function declared with U_CFUNC linkage and U_CALLCONV; returns int32_t.
// Parameters: case-locale value, option bits, the optional break-iterator
// parameter (UCASEMAP_BREAK_ITERATOR_PARAM expands to nothing when
// UCONFIG_NO_BREAK_ITERATION is set), dest/destCapacity, src/srcLength, and
// the error code by reference.
// NOTE(review): the embedded listing numbers skip 144 inside the parameter
// list, so one parameter line may be missing from this extract. Confirm.
140 U_CFUNC
int32_t U_CALLCONV
141 ustrcase_internalToLower(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
142 UChar
*dest
, int32_t destCapacity
,
143 const UChar
*src
, int32_t srcLength
,
145 UErrorCode
&errorCode
);
147 /** Implements UStringCaseMapper. */
// Uppercasing function with the same visible parameter list as
// ustrcase_internalToLower: case-locale value, option bits, optional
// break-iterator parameter, dest/destCapacity, src/srcLength, error code.
// Declared with U_CFUNC linkage and U_CALLCONV; returns int32_t.
// NOTE(review): the embedded listing numbers skip 152 inside the parameter
// list, so one parameter line may be missing from this extract. Confirm.
148 U_CFUNC
int32_t U_CALLCONV
149 ustrcase_internalToUpper(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
150 UChar
*dest
, int32_t destCapacity
,
151 const UChar
*src
, int32_t srcLength
,
153 UErrorCode
&errorCode
);
155 #if !UCONFIG_NO_BREAK_ITERATION
157 /** Implements UStringCaseMapper. */
// Titlecasing function. Unlike the lower/upper/fold declarations, it names the
// icu::BreakIterator parameter directly rather than via
// UCASEMAP_BREAK_ITERATOR_PARAM, and it is declared only under
// `#if !UCONFIG_NO_BREAK_ITERATION` (the line above).
// Declared with U_CFUNC linkage and U_CALLCONV; returns int32_t.
// NOTE(review): the embedded listing numbers skip 163 inside the parameter
// list, and the matching #endif is not visible in this extract. Confirm.
158 U_CFUNC
int32_t U_CALLCONV
159 ustrcase_internalToTitle(int32_t caseLocale
, uint32_t options
,
160 icu::BreakIterator
*iter
,
161 UChar
*dest
, int32_t destCapacity
,
162 const UChar
*src
, int32_t srcLength
,
164 UErrorCode
&errorCode
);
168 /** Implements UStringCaseMapper. */
// Case-folding function with the same visible parameter list as
// ustrcase_internalToLower: case-locale value, option bits, optional
// break-iterator parameter, dest/destCapacity, src/srcLength, error code.
// Declared with U_CFUNC linkage and U_CALLCONV; returns int32_t.
// NOTE(review): the embedded listing numbers skip 173 inside the parameter
// list, so one parameter line may be missing from this extract. Confirm.
169 U_CFUNC
int32_t U_CALLCONV
170 ustrcase_internalFold(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
171 UChar
*dest
, int32_t destCapacity
,
172 const UChar
*src
, int32_t srcLength
,
174 UErrorCode
&errorCode
);
/**
 * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
 * Implements argument checking.
 */
// Declaration of ustrcase_map. Besides the usual case-locale value, option bits,
// optional break-iterator parameter, dest/destCapacity and src/srcLength, it
// takes a pointer to the UStringCaseMapper function that is to be applied.
// NOTE(review): no return type or linkage specifier is visible in this extract,
// and the embedded listing numbers skip 185 inside the parameter list, so a
// parameter line may be missing. Confirm against the original header.
181 ustrcase_map(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
182 UChar
*dest
, int32_t destCapacity
,
183 const UChar
*src
, int32_t srcLength
,
184 UStringCaseMapper
*stringCaseMapper
,
186 UErrorCode
&errorCode
);
/**
 * Common string case mapping implementation for old-fashioned u_strToXyz() functions
 * that allow the source string to overlap the destination buffer.
 * Implements argument checking and internally works with an intermediate buffer if necessary.
 */
// Declaration of ustrcase_mapWithOverlap. The visible parameter list is the
// same as ustrcase_map: case-locale value, option bits, optional break-iterator
// parameter, dest/destCapacity, src/srcLength, the UStringCaseMapper to apply,
// and the error code by reference.
// NOTE(review): no return type or linkage specifier is visible in this extract.
// Confirm against the original header.
194 ustrcase_mapWithOverlap(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
195 UChar
*dest
, int32_t destCapacity
,
196 const UChar
*src
, int32_t srcLength
,
197 UStringCaseMapper
*stringCaseMapper
,
198 UErrorCode
&errorCode
);
/**
 * UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
 * UTF-8 version of UStringCaseMapper.
 * All error checking must be done.
 * The UCaseMap must be fully initialized, with locale and/or iter set as needed.
 */
// Function type (not a pointer type; users write `UTF8CaseMapper *`) for UTF-8
// case mappers. Such a function returns void and writes its output through an
// icu::ByteSink; `edits` is passed as a pointer.
// The icu::BreakIterator parameter is part of the signature only when break
// iteration is compiled in.
// NOTE(review): the #endif closing the conditional parameter is not visible in
// this extract (embedded listing numbers skip 210). Confirm.
206 typedef void U_CALLCONV
207 UTF8CaseMapper(int32_t caseLocale
, uint32_t options
,
208 #if !UCONFIG_NO_BREAK_ITERATION
209 icu::BreakIterator
*iter
,
211 const uint8_t *src
, int32_t srcLength
,
212 icu::ByteSink
&sink
, icu::Edits
*edits
,
213 UErrorCode
&errorCode
);
215 #if !UCONFIG_NO_BREAK_ITERATION
217 /** Implements UTF8CaseMapper. */
// UTF-8 titlecasing function whose parameter list matches the UTF8CaseMapper
// function type with the break-iterator parameter present. It is declared only
// under `#if !UCONFIG_NO_BREAK_ITERATION` (the line above).
// Declared with U_CFUNC linkage and U_CALLCONV; returns void and writes through `sink`.
// NOTE(review): the matching #endif is not visible in this extract. Confirm.
218 U_CFUNC
void U_CALLCONV
219 ucasemap_internalUTF8ToTitle(int32_t caseLocale
, uint32_t options
,
220 icu::BreakIterator
*iter
,
221 const uint8_t *src
, int32_t srcLength
,
222 icu::ByteSink
&sink
, icu::Edits
*edits
,
223 UErrorCode
&errorCode
);
// ByteSink overload of ucasemap_mapUTF8: takes the source as `const char *` with
// a length, the UTF8CaseMapper to apply, an output icu::ByteSink, an icu::Edits
// pointer, and the error code by reference.
// NOTE(review): no return type or linkage specifier is visible in this extract
// (embedded listing numbers skip 224-227). A second declaration of the same
// name with a dest buffer follows, so this is presumably a C++ overload.
// Confirm against the original header.
228 ucasemap_mapUTF8(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
229 const char *src
, int32_t srcLength
,
230 UTF8CaseMapper
*stringCaseMapper
,
231 icu::ByteSink
&sink
, icu::Edits
*edits
,
232 UErrorCode
&errorCode
);
/**
 * Implements argument checking and buffer handling
 * for UTF-8 string case mapping as a common function.
 */
// Buffer overload of ucasemap_mapUTF8: takes dest/destCapacity instead of an
// icu::ByteSink, plus src/srcLength, the UTF8CaseMapper to apply, and the
// error code by reference.
// NOTE(review): no return type or linkage specifier is visible in this extract,
// and the embedded listing numbers skip 243 inside the parameter list, so a
// parameter line may be missing. Confirm against the original header.
239 ucasemap_mapUTF8(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
240 char *dest
, int32_t destCapacity
,
241 const char *src
, int32_t srcLength
,
242 UTF8CaseMapper
*stringCaseMapper
,
244 UErrorCode
&errorCode
);
247 namespace GreekUpper
{
250 static const uint32_t UPPER_MASK
= 0x3ff;
251 static const uint32_t HAS_VOWEL
= 0x1000;
252 static const uint32_t HAS_YPOGEGRAMMENI
= 0x2000;
253 static const uint32_t HAS_ACCENT
= 0x4000;
254 static const uint32_t HAS_DIALYTIKA
= 0x8000;
255 // Further bits during data building and processing, not stored in the data map.
256 static const uint32_t HAS_COMBINING_DIALYTIKA
= 0x10000;
257 static const uint32_t HAS_OTHER_GREEK_DIACRITIC
= 0x20000;
259 static const uint32_t HAS_VOWEL_AND_ACCENT
= HAS_VOWEL
| HAS_ACCENT
;
260 static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA
=
261 HAS_VOWEL_AND_ACCENT
| HAS_DIALYTIKA
;
262 static const uint32_t HAS_EITHER_DIALYTIKA
= HAS_DIALYTIKA
| HAS_COMBINING_DIALYTIKA
;
265 static const uint32_t AFTER_CASED
= 1;
266 static const uint32_t AFTER_VOWEL_WITH_ACCENT
= 2;
268 uint32_t getLetterData(UChar32 c
);
271 * Returns a non-zero value for each of the Greek combining diacritics
272 * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
273 * plus some perispomeni look-alikes.
275 uint32_t getDiacriticData(UChar32 c
);
277 } // namespace GreekUpper
280 #endif // __cplusplus
282 #endif // __UCASEMAP_IMP_H__