icuSources/i18n/utf8collationiterator.h

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 * Copyright (C) 2012-2016, International Business Machines
   6 * Corporation and others.  All Rights Reserved.
   7 *******************************************************************************
   8 * utf8collationiterator.h
   9 *
  10 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
  11 * created by: Markus W. Scherer
  12 */
  13
  14 #ifndef __UTF8COLLATIONITERATOR_H__
  15 #define __UTF8COLLATIONITERATOR_H__
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_COLLATION
  20
  21 #include "cmemory.h"
  22 #include "collation.h"
  23 #include "collationdata.h"
  24 #include "collationiterator.h"
  25 #include "normalizer2impl.h"
  26
  27 U_NAMESPACE_BEGIN
  28
  29 /**
  30  * UTF-8 collation element and character iterator.
  31  * Handles normalized UTF-8 text inline, with length or NUL-terminated.
  32  * Unnormalized text is handled by a subclass.
  33  */
  34 class U_I18N_API UTF8CollationIterator : public CollationIterator {
  35 public:
  36     UTF8CollationIterator(const CollationData *d, UBool numeric,
  37                           const uint8_t *s, int32_t p, int32_t len)
  38             : CollationIterator(d, numeric),
  39               u8(s), pos(p), length(len) {}
  40
  41     virtual ~UTF8CollationIterator();
  42
  43     virtual void resetToOffset(int32_t newOffset);
  44
  45     virtual int32_t getOffset() const;
  46
  47     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
  48
  49     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
  50
  51 protected:
  52     /**
  53      * For byte sequences that are illegal in UTF-8, an error value may be returned
  54      * together with a bogus code point. The caller will ignore that code point.
  55      *
  56      * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
  57      * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
  58      *
  59      * Valid lead surrogates are returned from inside a normalized text segment,
  60      * where handleGetTrailSurrogate() will return the matching trail surrogate.
  61      */
  62     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
  63
  64     virtual UBool foundNULTerminator();
  65
  66     virtual UBool forbidSurrogateCodePoints() const;
  67
  68     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
  69
  70     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
  71
  72     const uint8_t *u8;
  73     int32_t pos;
  74     int32_t length;  // <0 for NUL-terminated strings
  75 };
  76
  77 /**
  78  * Incrementally checks the input text for FCD and normalizes where necessary.
  79  */
  80 class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
  81 public:
  82     FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
  83                              const uint8_t *s, int32_t p, int32_t len)
  84             : UTF8CollationIterator(data, numeric, s, p, len),
  85               state(CHECK_FWD), start(p),
  86               nfcImpl(data->nfcImpl) {}
  87
  88     virtual ~FCDUTF8CollationIterator();
  89
  90     virtual void resetToOffset(int32_t newOffset);
  91
  92     virtual int32_t getOffset() const;
  93
  94     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
  95
  96     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
  97
  98 protected:
  99     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
 100
 101     virtual UChar handleGetTrailSurrogate();
 102
 103     virtual UBool foundNULTerminator();
 104
 105     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
 106
 107     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
 108
 109 private:
 110     UBool nextHasLccc() const;
 111     UBool previousHasTccc() const;
 112
 113     /**
 114      * Switches to forward checking if possible.
 115      */
 116     void switchToForward();
 117
 118     /**
 119      * Extends the FCD text segment forward or normalizes around pos.
 120      * @return TRUE if success
 121      */
 122     UBool nextSegment(UErrorCode &errorCode);
 123
 124     /**
 125      * Switches to backward checking.
 126      */
 127     void switchToBackward();
 128
 129     /**
 130      * Extends the FCD text segment backward or normalizes around pos.
 131      * @return TRUE if success
 132      */
 133     UBool previousSegment(UErrorCode &errorCode);
 134
 135     UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
 136
 137     enum State {
 138         /**
 139          * The input text [start..pos[ passes the FCD check.
 140          * Moving forward checks incrementally.
 141          * limit is undefined.
 142          */
 143         CHECK_FWD,
 144         /**
 145          * The input text [pos..limit[ passes the FCD check.
 146          * Moving backward checks incrementally.
 147          * start is undefined.
 148          */
 149         CHECK_BWD,
 150         /**
 151          * The input text [start..limit[ passes the FCD check.
 152          * pos tracks the current text index.
 153          */
 154         IN_FCD_SEGMENT,
 155         /**
 156          * The input text [start..limit[ failed the FCD check and was normalized.
 157          * pos tracks the current index in the normalized string.
 158          */
 159         IN_NORMALIZED
 160     };
 161
 162     State state;
 163
 164     int32_t start;
 165     int32_t limit;
 166
 167     const Normalizer2Impl &nfcImpl;
 168     UnicodeString normalized;
 169 };
 170
 171 U_NAMESPACE_END
 172
 173 #endif  // !UCONFIG_NO_COLLATION
 174 #endif  // __UTF8COLLATIONITERATOR_H__