1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 08/13/2001 synwee Creation.
9 **********************************************************************
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/normalizer2.h"
19 #include "unicode/ucol.h"
20 #include "unicode/ucoleitr.h"
21 #include "unicode/ubrk.h"
/*
 * Bit layout of a 32-bit collation element (CE), as encoded by the masks
 * and shifts below:
 *   bits 31..16  primary weight
 *   bits 15..8   secondary weight
 *   bits  7..0   tertiary weight
 */

/* mask that keeps only the primary weight */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask that keeps only the secondary weight */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask that keeps only the tertiary weight */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* right shift that moves the primary weight down to bit 0 */
#define UCOL_PRIMARYORDERSHIFT 16
/* right shift that moves the secondary weight down to bit 0 */
#define UCOL_SECONDARYORDERSHIFT 8

/* the constant 0, named for use as the "ignorable" weight value */
#define UCOL_IGNORABLE 0

/*
 * Extract the individual weights from a CE.
 * The primary accessor shifts first and masks afterwards: the trailing
 * 0xffff discards any sign-extension bits when `order` is a signed type,
 * and keeps the result type the same as a plain `order >> n` expression.
 */
#define UCOL_PRIMARYORDER(order) (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

/* bit pattern tested by isContinuation() */
#define UCOL_CONTINUATION_MARKER 0xC0

/* true only when every bit of UCOL_CONTINUATION_MARKER is set in CE */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
46 * This indicates an error has occurred during processing or there are no more CEs
// Fallback definition of the processed-CE "null order" sentinel: it is only
// defined here when UCOL_PROCESSED_NULLORDER has not been defined already.
// The value is U_INT64_MAX converted to int64_t.
// NOTE(review): the matching #endif is not visible in this section -- confirm it is present.
49 #ifndef UCOL_PROCESSED_NULLORDER
50 #define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
// Forward declaration only: this file refers to the type through pointers
// (see the UCollationPCE members and constructors that follow).
55 class CollationElementIterator
;
// NOTE(review): these are member declarations whose enclosing type header is
// not visible in this section.

// Fixed-size array of 16 PCEI entries.
67 PCEI defaultBuffer
[16];
// Const query returning a UBool.
// NOTE(review): presumably reports whether the buffer holds no entries -- confirm.
76 UBool
isEmpty() const;
// Takes a 64-bit CE, a low/high index pair and an UErrorCode reference.
// NOTE(review): presumably stores one entry and reports failure through
// errorCode -- confirm against the implementation.
77 void put(uint64_t ce
, int32_t ixLow
, int32_t ixHigh
, UErrorCode
&errorCode
);
// UMemory-derived helper that holds a CollationElementIterator pointer and a
// collation strength. It can be constructed or (re)initialised either from a
// C-API UCollationElements handle or from a C++ CollationElementIterator.
// NOTE(review): access specifiers and the end of the class are not visible in
// this section.
81 class UCollationPCE
: public UMemory
{
// Pointer to the collation element iterator this object works with.
// NOTE(review): ownership is not visible here -- confirm.
84 CollationElementIterator
*cei
;
// Collation strength value.
85 UCollationStrength strength
;
// Construct from a C-API collation elements handle.
91 UCollationPCE(UCollationElements
*elems
);
// Construct from a C++ collation element iterator.
92 UCollationPCE(CollationElementIterator
*iter
);
// init() overload taking the C-API collation elements handle.
95 void init(UCollationElements
*elems
);
// init() overload taking the C++ collation element iterator.
96 void init(CollationElementIterator
*iter
);
99 * Get the processed ordering priority of the next collation element in the text.
100 * A single character may contain more than one collation element.
102 * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
103 * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
104 * @param status A pointer to an UErrorCode to receive any errors.
105 * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
106 * if an error has occured or if the end of string has been reached
108 int64_t nextProcessed(int32_t *ixLow
, int32_t *ixHigh
, UErrorCode
*status
);
110 * Get the processed ordering priority of the previous collation element in the text.
111 * A single character may contain more than one collation element.
113 * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
114 * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
115 * @param status A pointer to an UErrorCode to receive any errors. Noteably
116 * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
117 * buffer has been exhausted.
118 * @return The previous collation elements ordering, otherwise returns
119 * UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
120 * string has been reached.
122 int64_t previousProcessed(int32_t *ixLow
, int32_t *ixHigh
, UErrorCode
*status
);
// init() overload taking a Collator reference.
// NOTE(review): presumably derives per-collator settings (such as strength)
// from coll -- confirm against the implementation.
125 void init(const Collator
&coll
);
// Maps a 32-bit CE to a 64-bit value.
// NOTE(review): presumably produces the "processed" CE handed out by
// nextProcessed()/previousProcessed() -- confirm.
126 uint64_t processCE(uint32_t ce
);
// Element count of the fixed-size buffers declared later in this file
// (cesBuffer, pcesBuffer, canonicalPrefixAccents, canonicalSuffixAccents).
131 #define INITIAL_ARRAY_SIZE_ 256
// Element count of the shift and backShift tables declared later in this file.
132 #define MAX_TABLE_SIZE_ 257
135 // required since collation element iterator does not have a getText API
// NOTE(review): the declaration that the comment above describes, and the
// header of the enclosing type, are not visible in this section.
137 int32_t textLength
; // exact length
// Boolean flag.
// NOTE(review): presumably selects canonical matching -- confirm.
139 UBool isCanonicalMatch
;
// 16-bit value.
// NOTE(review): presumably holds an element-comparison mode -- confirm.
140 int16_t elementComparisonType
;
141 UBreakIterator
*internalBreakIter
; //internal character breakiterator
// A second break iterator pointer, separate from internalBreakIter.
// NOTE(review): presumably the caller-supplied break iterator -- confirm.
142 UBreakIterator
*breakIter
;
143 // value USEARCH_DONE is the default value
144 // if we are not at the start of the text or the end of the text,
145 // depending on the iteration direction and matchedIndex is USEARCH_DONE
146 // it means that we can't find any more matches in that particular direction
147 int32_t matchedIndex
;
// NOTE(review): presumably the length of the match located at matchedIndex -- confirm.
148 int32_t matchedLength
;
// Boolean flag.
// NOTE(review): presumably TRUE while the search iterates forward -- confirm.
149 UBool isForwardSearching
;
// NOTE(review): the header of the enclosing type is not visible in this
// section; these look like the members of struct UPattern, which
// UStringSearch embeds by value -- confirm.
155 int32_t textLength
; // exact length
156 // length required for backwards ce comparison
// NOTE(review): the declaration that the comment above describes is not
// visible in this section.

// Fixed buffer of INITIAL_ARRAY_SIZE_ int32_t values (CEs, going by the name).
159 int32_t cesBuffer
[INITIAL_ARRAY_SIZE_
];
// Fixed buffer of INITIAL_ARRAY_SIZE_ int64_t values (processed CEs, going by the name).
162 int64_t pcesBuffer
[INITIAL_ARRAY_SIZE_
];
// Boolean flags.
// NOTE(review): presumably record whether the pattern starts / ends with
// accents -- confirm.
163 UBool hasPrefixAccents
;
164 UBool hasSuffixAccents
;
// NOTE(review): presumably the shift applied when no table entry applies -- confirm.
165 int16_t defaultShiftSize
;
// Two tables of MAX_TABLE_SIZE_ 16-bit entries each.
// NOTE(review): presumably forward and backward shift tables -- confirm.
166 int16_t shift
[MAX_TABLE_SIZE_
];
167 int16_t backShift
[MAX_TABLE_SIZE_
];
// State bundle for one string search. It holds a pointer to the USearch
// data, the UPattern data by value, the collator and Normalizer2 in use,
// collation element iterators, the collation strength and variable-top
// values, and two fixed buffers for canonical prefix/suffix accents.
// NOTE(review): the closing brace of this struct is not visible in this section.
170 struct UStringSearch
{
// Pointer to the separately declared USearch data.
171 struct USearch
*search
;
// Pattern data, stored by value.
172 struct UPattern pattern
;
// Collator used by this search; not modified through this pointer.
173 const UCollator
*collator
;
// Normalizer2 instance; NOTE(review): presumably the NFD normalizer, going by the name -- confirm.
174 const icu::Normalizer2
*nfd
;
175 // positions within the collation element iterator are used to determine
176 // if we are at the start of the text.
177 UCollationElements
*textIter
;
// Processed-CE helper; NOTE(review): presumably wraps textIter -- confirm.
178 icu::UCollationPCE
*textProcessedIter
;
179 // utility collation element, used throughout program for temporary
181 UCollationElements
*utilIter
;
// Collation strength value kept with the search.
183 UCollationStrength strength
;
// 32-bit variable-top value; NOTE(review): presumably cached from the collator -- confirm.
185 uint32_t variableTop
;
// Fixed buffers of INITIAL_ARRAY_SIZE_ UChars each.
// NOTE(review): presumably scratch space for accents found before / after a
// canonical match -- confirm.
187 UChar canonicalPrefixAccents
[INITIAL_ARRAY_SIZE_
];
188 UChar canonicalSuffixAccents
[INITIAL_ARRAY_SIZE_
];
192 * Exact matches without checking for the ends for extra accents.
193 * The match after the position within the collation element iterator is to be
195 * After a match is found the offset in the collation element iterator will be
196 * shifted to the start of the match.
197 * Implementation note:
198 * For tertiary we can't use the collator->tertiaryMask, that is a
199 * preprocessed mask that takes into account case options. since we are only
200 * concerned with exact matches, we don't need that.
201 * Alternate handling - since only the 16 most significant digits is only used,
202 * we can safely do a compare without masking if the ce is a variable, we mask
203 * and get only the primary values no shifting to quartenary is required since
204 * all primary values less than variabletop will need to be masked off anyway.
205 * If the end character is composite and the pattern ce does not match the text
206 * ce, we skip it until we find a match in the end composite character or when
207 * it has passed the character. This is so that we can match pattern "a" with
209 * @param strsrch string search data
210 * @param status error status if any
211 * @return TRUE if an exact match is found, FALSE otherwise
214 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
);
218 * According to the definition, matches found here will include the whole span
219 * of beginning and ending accents if it overlaps that region.
220 * @param strsrch string search data
221 * @param status error status if any
222 * @return TRUE if a canonical match is found, FALSE otherwise
225 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
);
228 * Gets the previous match.
229 * Comments follows from handleNextExact
230 * @param strsrch string search data
231 * @param status error status if any
232 * @return True if a exact math is found, FALSE otherwise.
235 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
);
239 * According to the definition, matches found here will include the whole span
240 * of beginning and ending accents if it overlaps that region.
241 * @param strsrch string search data
242 * @param status error status if any
243 * @return TRUE if a canonical match is found, FALSE otherwise
246 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
249 #endif /* #if !UCONFIG_NO_COLLATION */