icuSources/i18n/usrchimp.h

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
   6 **********************************************************************
   7 *   Date        Name        Description
   8 *  08/13/2001   synwee      Creation.
   9 **********************************************************************
  10 */
  11 #ifndef USRCHIMP_H
  12 #define USRCHIMP_H
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_COLLATION
  17
  18 #include "unicode/normalizer2.h"
  19 #include "unicode/ucol.h"
  20 #include "unicode/ucoleitr.h"
  21 #include "unicode/ubrk.h"
  22
  23 /* mask off anything but primary order (the upper 16 bits of a 32-bit CE) */
  24 #define UCOL_PRIMARYORDERMASK 0xffff0000
  25 /* mask off anything but secondary order (bits 8-15) */
  26 #define UCOL_SECONDARYORDERMASK 0x0000ff00
  27 /* mask off anything but tertiary order (the low 8 bits) */
  28 #define UCOL_TERTIARYORDERMASK 0x000000ff
  29 /* primary order shift: bit position where the primary weight starts */
  30 #define UCOL_PRIMARYORDERSHIFT 16
  31 /* secondary order shift: bit position where the secondary weight starts */
  32 #define UCOL_SECONDARYORDERSHIFT 8
  33
  34 #define UCOL_IGNORABLE 0 /* presumably the order value of an ignorable CE -- TODO confirm */
  35
  36 /* get weights from a CE; each result is moved down so that it starts at bit 0 */
  37 #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff) /* literal 16 matches UCOL_PRIMARYORDERSHIFT */
  38 #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
  39 #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
  40
  41 #define UCOL_CONTINUATION_MARKER 0xC0 /* bit pattern tested by isContinuation() */
  42
  43 #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) /* true only when both marker bits are set */
  44
  45 /**
  46  * This indicates an error has occurred during processing or there are no more CEs
  47  * to be returned.
  48  */
  49 #ifndef UCOL_PROCESSED_NULLORDER /* keep any definition that is already in effect */
  50 #define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)
  51 #endif
  52
  53 U_NAMESPACE_BEGIN
  54
  55 class CollationElementIterator;
  56 class Collator;
  57
  58 struct PCEI  // one buffered entry: a 64-bit CE together with two int32_t indices
  59 {
  60     uint64_t ce;    // the 64-bit collation element value
  61     int32_t  low;   // presumably the ixLow given to PCEBuffer::put() -- TODO confirm
  62     int32_t  high;  // presumably the ixHigh given to PCEBuffer::put() -- TODO confirm
  63 };
  64
  65 struct PCEBuffer  // container of PCEI entries that embeds a 16-entry array
  66 {
  67     PCEI    defaultBuffer[16];  // storage embedded in the object itself
  68     PCEI   *buffer;             // presumably defaultBuffer or a larger allocation -- TODO confirm in the implementation
  69     int32_t bufferIndex;        // NOTE(review): looks like the current position/count within buffer -- confirm
  70     int32_t bufferSize;         // NOTE(review): looks like the capacity of buffer in entries -- confirm
  71
  72     PCEBuffer();
  73     ~PCEBuffer();
  74
  75     void  reset();
  76     UBool isEmpty() const;
  77     void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);  // takes one CE with its index pair; errorCode is an in/out status
  78     const PCEI *get();  // yields a read-only entry; behavior on an empty buffer is not visible here
  79 };
  80
  81 class UCollationPCE : public UMemory {  // yields 64-bit "processed" CEs for a collation element iterator
  82 private:
  83     PCEBuffer          pceBuffer;
  84     CollationElementIterator *cei;  // presumably the iterator supplied to the constructor/init() -- TODO confirm
  85     UCollationStrength strength;
  86     UBool              toShift;
  87     UBool              isShifted;
  88     uint32_t           variableTop;
  89
  90 public:
  91     UCollationPCE(UCollationElements *elems);
  92     UCollationPCE(CollationElementIterator *iter);
  93     ~UCollationPCE();
  94
  95     void init(UCollationElements *elems);
  96     void init(CollationElementIterator *iter);
  97
  98     /**
  99      * Get the processed ordering priority of the next collation element in the text.
 100      * A single character may contain more than one collation element.
 101      *
 102      * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
 103      * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
 104      * @param status A pointer to a UErrorCode to receive any errors.
 105      * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
 106      *         if an error has occurred or if the end of string has been reached.
 107      */
 108     int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
 109     /**
 110      * Get the processed ordering priority of the previous collation element in the text.
 111      * A single character may contain more than one collation element.
 112      *
 113      * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE.
 114      * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE.
 115      * @param status A pointer to a UErrorCode to receive any errors. Notably
 116      *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
 117      *               buffer has been exhausted.
 118      * @return The previous collation element's ordering, otherwise returns
 119      *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
 120      *         string has been reached.
 121      */
 122     int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
 123
 124 private:
 125     void init(const Collator &coll);
 126     uint64_t processCE(uint32_t ce);  // maps a 32-bit CE to a 64-bit value; the mapping itself is defined in the implementation
 127 };
 128
 129 U_NAMESPACE_END
 130
 131 #define INITIAL_ARRAY_SIZE_       256 /* element count of the fixed arrays in UPattern and UStringSearch */
 132 #define MAX_TABLE_SIZE_           257 /* element count of UPattern's shift and backShift tables */
 133
 134 struct USearch {  // text being searched, flags, break iterators and the last match
 135     // required since collation element iterator does not have a getText API
 136     const UChar              *text;
 137           int32_t             textLength; // exact length
 138           UBool               isOverlap;  // presumably: overlapping matches allowed -- TODO confirm
 139           UBool               isCanonicalMatch;
 140           int16_t             elementComparisonType;
 141           UBreakIterator     *internalBreakIter;  // internal character break iterator
 142           UBreakIterator     *breakIter;
 143     // USEARCH_DONE is the default value of matchedIndex.
 144     // If we are not at the start of the text or the end of the text,
 145     // depending on the iteration direction, and matchedIndex is USEARCH_DONE,
 146     // it means that we can't find any more matches in that particular direction.
 147           int32_t             matchedIndex;
 148           int32_t             matchedLength;
 149           UBool               isForwardSearching;
 150           UBool               reset;
 151 };
 152
 153 struct UPattern {  // pattern text plus its CE arrays, accent flags and shift tables
 154     const UChar              *text;
 155           int32_t             textLength; // exact length
 156           // length required for backwards ce comparison
 157           int32_t             cesLength;
 158           int32_t            *ces;  // presumably cesBuffer or a larger allocation -- TODO confirm
 159           int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
 160           int32_t             pcesLength;
 161           int64_t            *pces;  // 64-bit counterpart of ces; presumably pcesBuffer or a larger allocation -- TODO confirm
 162           int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
 163           UBool               hasPrefixAccents;
 164           UBool               hasSuffixAccents;
 165           int16_t             defaultShiftSize;
 166           int16_t             shift[MAX_TABLE_SIZE_];
 167           int16_t             backShift[MAX_TABLE_SIZE_];
 168 };
 169
 170 struct UStringSearch {  // top-level search object: search state, pattern, collator and iterators
 171     struct USearch            *search;
 172     struct UPattern            pattern;
 173     const  UCollator          *collator;
 174     const  icu::Normalizer2   *nfd;
 175     // positions within the collation element iterator are used to determine
 176     // if we are at the start of the text.
 177            UCollationElements *textIter;
 178            icu::UCollationPCE *textProcessedIter;
 179     // utility collation element iterator, used throughout the program for
 180     // temporary iteration.
 181            UCollationElements *utilIter;
 182            UBool               ownCollator;  // presumably: TRUE when this object must release collator -- TODO confirm
 183            UCollationStrength  strength;
 184            uint32_t            ceMask;
 185            uint32_t            variableTop;
 186            UBool               toShift;
 187            UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
 188            UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
 189 };
 190
 191 /**
 192 * Exact matches without checking the ends for extra accents.
 193 * The match after the position within the collation element iterator is to be
 194 * found.
 195 * After a match is found the offset in the collation element iterator will be
 196 * shifted to the start of the match.
 197 * Implementation note:
 198 * For tertiary we can't use the collator->tertiaryMask, that is a
 199 * preprocessed mask that takes into account case options. Since we are only
 200 * concerned with exact matches, we don't need that.
 201 * Alternate handling - since only the 16 most significant bits are used,
 202 * we can safely do a compare without masking; if the ce is a variable, we mask
 203 * and get only the primary values. No shifting to quaternary is required since
 204 * all primary values less than variabletop will need to be masked off anyway.
 205 * If the end character is composite and the pattern ce does not match the text
 206 * ce, we skip it until we find a match in the end composite character or when
 207 * it has passed the character. This is so that we can match pattern "a" with
 208 * the text "\u00e6".
 209 * @param strsrch string search data
 210 * @param status error status if any
 211 * @return TRUE if an exact match is found, FALSE otherwise
 212 */
 213 U_CFUNC
 214 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
 215
 216 /**
 217 * Canonical matches (next-match variant).
 218 * According to the definition, matches found here will include the whole span
 219 * of beginning and ending accents if the match overlaps that region.
 220 * @param strsrch string search data
 221 * @param status error status if any
 222 * @return TRUE if a canonical match is found, FALSE otherwise
 223 */
 224 U_CFUNC
 225 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
 226
 227 /**
 228 * Gets the previous match.
 229 * Comments follow from usearch_handleNextExact.
 230 * @param strsrch string search data
 231 * @param status error status if any
 232 * @return TRUE if an exact match is found, FALSE otherwise.
 233 */
 234 U_CFUNC
 235 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
 236
 237 /**
 238 * Canonical matches (previous-match counterpart of usearch_handleNextCanonical).
 239 * According to the definition, matches found here will include the whole span
 240 * of beginning and ending accents if the match overlaps that region.
 241 * @param strsrch string search data
 242 * @param status error status if any
 243 * @return TRUE if a canonical match is found, FALSE otherwise
 244 */
 245 U_CFUNC
 246 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
 247                                       UErrorCode    *status);
 248
 249 #endif /* #if !UCONFIG_NO_COLLATION */
 250
 251 #endif