]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/usrchimp.h
ICU-64243.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / usrchimp.h
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
2ca993e8 5* Copyright (C) 2001-2015 IBM and others. All rights reserved.
b75a7d8f
A
6**********************************************************************
7* Date Name Description
8* 08/13/2001 synwee Creation.
9**********************************************************************
10*/
11#ifndef USRCHIMP_H
12#define USRCHIMP_H
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
729e4ab9 18#include "unicode/normalizer2.h"
b75a7d8f
A
19#include "unicode/ucol.h"
20#include "unicode/ucoleitr.h"
21#include "unicode/ubrk.h"
22
57a6839d
A
/*
 * Layout of a 32-bit collation element (CE) as used by the macros below:
 *   bits 31..16  primary weight
 *   bits 15..8   secondary weight
 *   bits  7..0   tertiary weight (the 0xC0 bits double as the continuation flag)
 */

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

#define UCOL_IGNORABLE 0

/*
 * Get weights from a CE.
 * The primary weight is shifted first and masked afterwards so that a signed
 * (sign-extending) argument still yields a value in the range 0..0xffff.
 */
#define UCOL_PRIMARYORDER(order) (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

#define UCOL_CONTINUATION_MARKER 0xC0

/* true when both continuation-marker bits are set in the CE */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
44
/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 *
 * Only defined here when no earlier header has already provided it.
 */
#ifndef UCOL_PROCESSED_NULLORDER
#define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)
#endif
52
53U_NAMESPACE_BEGIN
54
// Forward declarations: this header only uses pointers and references to these types.
class CollationElementIterator;
class Collator;
57
/**
 * One entry of a PCEBuffer: a 64-bit processed collation element plus two
 * 32-bit indices.
 */
struct PCEI
{
    uint64_t ce;    // processed collation element value
    int32_t  low;   // presumably the ixLow passed to PCEBuffer::put() -- confirm in the implementation
    int32_t  high;  // presumably the ixHigh passed to PCEBuffer::put() -- confirm in the implementation
};
64
65struct PCEBuffer
66{
67 PCEI defaultBuffer[16];
68 PCEI *buffer;
69 int32_t bufferIndex;
70 int32_t bufferSize;
71
72 PCEBuffer();
73 ~PCEBuffer();
74
75 void reset();
2ca993e8
A
76 UBool isEmpty() const;
77 void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
57a6839d
A
78 const PCEI *get();
79};
80
81class UCollationPCE : public UMemory {
82private:
83 PCEBuffer pceBuffer;
84 CollationElementIterator *cei;
85 UCollationStrength strength;
86 UBool toShift;
87 UBool isShifted;
88 uint32_t variableTop;
89
90public:
91 UCollationPCE(UCollationElements *elems);
92 UCollationPCE(CollationElementIterator *iter);
93 ~UCollationPCE();
94
95 void init(UCollationElements *elems);
96 void init(CollationElementIterator *iter);
97
98 /**
99 * Get the processed ordering priority of the next collation element in the text.
100 * A single character may contain more than one collation element.
101 *
102 * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
103 * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
104 * @param status A pointer to an UErrorCode to receive any errors.
105 * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
106 * if an error has occured or if the end of string has been reached
107 */
108 int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
109 /**
110 * Get the processed ordering priority of the previous collation element in the text.
111 * A single character may contain more than one collation element.
112 *
113 * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
114 * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
115 * @param status A pointer to an UErrorCode to receive any errors. Noteably
116 * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
117 * buffer has been exhausted.
118 * @return The previous collation elements ordering, otherwise returns
119 * UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
120 * string has been reached.
121 */
122 int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
123
124private:
125 void init(const Collator &coll);
126 uint64_t processCE(uint32_t ce);
127};
128
129U_NAMESPACE_END
130
b75a7d8f
A
/* element count of the fixed buffers in UPattern (cesBuffer, pcesBuffer) and
 * UStringSearch (canonicalPrefixAccents, canonicalSuffixAccents) */
#define INITIAL_ARRAY_SIZE_       256
/* element count of the UPattern shift and backShift tables */
#define MAX_TABLE_SIZE_           257
133
134struct USearch {
135 // required since collation element iterator does not have a getText API
136 const UChar *text;
137 int32_t textLength; // exact length
138 UBool isOverlap;
139 UBool isCanonicalMatch;
729e4ab9
A
140 int16_t elementComparisonType;
141 UBreakIterator *internalBreakIter; //internal character breakiterator
b75a7d8f
A
142 UBreakIterator *breakIter;
143 // value USEARCH_DONE is the default value
144 // if we are not at the start of the text or the end of the text,
145 // depending on the iteration direction and matchedIndex is USEARCH_DONE
46f4442e
A
146 // it means that we can't find any more matches in that particular direction
147 int32_t matchedIndex;
b75a7d8f
A
148 int32_t matchedLength;
149 UBool isForwardSearching;
150 UBool reset;
151};
152
153struct UPattern {
154 const UChar *text;
155 int32_t textLength; // exact length
156 // length required for backwards ce comparison
b331163b
A
157 int32_t cesLength;
158 int32_t *ces;
159 int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
160 int32_t pcesLength;
161 int64_t *pces;
162 int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
b75a7d8f
A
163 UBool hasPrefixAccents;
164 UBool hasSuffixAccents;
165 int16_t defaultShiftSize;
166 int16_t shift[MAX_TABLE_SIZE_];
167 int16_t backShift[MAX_TABLE_SIZE_];
168};
169
170struct UStringSearch {
171 struct USearch *search;
172 struct UPattern pattern;
173 const UCollator *collator;
4388f060 174 const icu::Normalizer2 *nfd;
b75a7d8f
A
175 // positions within the collation element iterator is used to determine
176 // if we are at the start of the text.
177 UCollationElements *textIter;
57a6839d 178 icu::UCollationPCE *textProcessedIter;
b75a7d8f
A
179 // utility collation element, used throughout program for temporary
180 // iteration.
181 UCollationElements *utilIter;
182 UBool ownCollator;
183 UCollationStrength strength;
184 uint32_t ceMask;
185 uint32_t variableTop;
186 UBool toShift;
187 UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
188 UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
189};
190
191/**
192* Exact matches without checking for the ends for extra accents.
193* The match after the position within the collation element iterator is to be
194* found.
195* After a match is found the offset in the collation element iterator will be
196* shifted to the start of the match.
197* Implementation note:
198* For tertiary we can't use the collator->tertiaryMask, that is a
199* preprocessed mask that takes into account case options. since we are only
200* concerned with exact matches, we don't need that.
201* Alternate handling - since only the 16 most significant digits is only used,
202* we can safely do a compare without masking if the ce is a variable, we mask
203* and get only the primary values no shifting to quartenary is required since
204* all primary values less than variabletop will need to be masked off anyway.
205* If the end character is composite and the pattern ce does not match the text
206* ce, we skip it until we find a match in the end composite character or when
207* it has passed the character. This is so that we can match pattern "a" with
208* the text "\u00e6"
209* @param strsrch string search data
210* @param status error status if any
211* @return TRUE if an exact match is found, FALSE otherwise
212*/
213U_CFUNC
214UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
215
216/**
217* Canonical matches.
218* According to the definition, matches found here will include the whole span
219* of beginning and ending accents if it overlaps that region.
220* @param strsrch string search data
221* @param status error status if any
222* @return TRUE if a canonical match is found, FALSE otherwise
223*/
224U_CFUNC
225UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
226
227/**
228* Gets the previous match.
229* Comments follows from handleNextExact
230* @param strsrch string search data
231* @param status error status if any
232* @return True if a exact math is found, FALSE otherwise.
233*/
234U_CFUNC
235UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
236
237/**
238* Canonical matches.
239* According to the definition, matches found here will include the whole span
240* of beginning and ending accents if it overlaps that region.
241* @param strsrch string search data
242* @param status error status if any
243* @return TRUE if a canonical match is found, FALSE otherwise
244*/
245U_CFUNC
246UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
247 UErrorCode *status);
248
249#endif /* #if !UCONFIG_NO_COLLATION */
250
251#endif