icuSources/common/unicode/caniter.h

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 1996-2004, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7
   8 #ifndef CANITER_H
   9 #define CANITER_H
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_NORMALIZATION
  14
  15 #include "unicode/uobject.h"
  16 #include "unicode/unistr.h"
  17
  18 /** Should permutation skip characters with combining class zero?
  19  *  Should be either TRUE or FALSE. This is a compile-time option; it defaults to TRUE unless already defined.
  20  *  @stable ICU 2.4
  21  */
  22 #ifndef CANITER_SKIP_ZEROES
  23 #define CANITER_SKIP_ZEROES TRUE
  24 #endif
  25
  26 U_NAMESPACE_BEGIN
  27
  28 class Hashtable;
  29
  30 /**
  31  * This class allows one to iterate through all the strings that are canonically equivalent to a given
  32  * string. For example, here are some sample results:
  33 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
  34 1: \\u0041\\u030A\\u0064\\u0307\\u0327
  35  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
  36 2: \\u0041\\u030A\\u0064\\u0327\\u0307
  37  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
  38 3: \\u0041\\u030A\\u1E0B\\u0327
  39  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
  40 4: \\u0041\\u030A\\u1E11\\u0307
  41  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
  42 5: \\u00C5\\u0064\\u0307\\u0327
  43  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
  44 6: \\u00C5\\u0064\\u0327\\u0307
  45  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
  46 7: \\u00C5\\u1E0B\\u0327
  47  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
  48 8: \\u00C5\\u1E11\\u0307
  49  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
  50 9: \\u212B\\u0064\\u0307\\u0327
  51  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
  52 10: \\u212B\\u0064\\u0327\\u0307
  53  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
  54 11: \\u212B\\u1E0B\\u0327
  55  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
  56 12: \\u212B\\u1E11\\u0307
  57  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
  58  * <br>Note: this code is intended for use with small strings, and is not suitable for larger ones,
  59  * since it has not been optimized for that situation.
  60  * Note: CanonicalIterator is not intended to be subclassed.
  61  * @author M. Davis
  62  * @author C++ port by V. Weinstein
  63  * @stable ICU 2.4
  64  */
  65 class U_COMMON_API CanonicalIterator : public UObject {
  66 public:
  67     /**
  68      * Construct a CanonicalIterator object.
  69      * @param source    string to get results for
  70      * @param status    Fill-in parameter which receives the status of this operation.
  71      * @stable ICU 2.4
  72      */
  73     CanonicalIterator(const UnicodeString &source, UErrorCode &status);
  74
  75     /** Destructor.
  76      *  Cleans up the pieces.
  77      * @stable ICU 2.4
  78      */
  79     virtual ~CanonicalIterator();
  80
  81     /**
  82      * Gets the NFD form of the current source we are iterating over.
  83      * @return the source string. NOTE: this is the NFD form of the source.
  84      * @stable ICU 2.4
  85      */
  86     UnicodeString getSource();
  87
  88     /**
  89      * Resets the iterator so that iteration can start again from the beginning.
  90      * @stable ICU 2.4
  91      */
  92     void reset();
  93
  94     /**
  95      * Gets the next canonically equivalent string.
  96      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
  97      * @return the next string that is canonically equivalent. A bogus string is returned when
  98      * the iteration is done.
  99      * @stable ICU 2.4
 100      */
 101     UnicodeString next();
 102
 103     /**
 104      * Sets a new source for this iterator. Allows object reuse.
 105      * @param newSource     the source string to iterate against. This allows the same iterator to be used
 106      *                     while changing the source string, saving object creation.
 107      * @param status        Fill-in parameter which receives the status of this operation.
 108      * @stable ICU 2.4
 109      */
 110     void setSource(const UnicodeString &newSource, UErrorCode &status);
 111
 112     /**
 113      * Simple (unoptimized) recursive implementation of permutation.
 114      * TODO: optimize
 115      * @param source     the string to find permutations for
 116      * @param skipZeros  whether zeros are skipped (presumably characters with combining class zero, cf. CANITER_SKIP_ZEROES)
 117      * @param result     receives the results; used as a set.
 118      * @param status     Fill-in parameter which receives the status of this operation.
 119      * @internal
 120      */
 121     static void U_EXPORT2 permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status);
 122
 123     /**
 124      * ICU "poor man's RTTI", returns a UClassID for this class.
 125      *
 126      * @stable ICU 2.2
 127      */
 128     static UClassID U_EXPORT2 getStaticClassID();
 129
 130     /**
 131      * ICU "poor man's RTTI", returns a UClassID for the actual class.
 132      *
 133      * @stable ICU 2.2
 134      */
 135     virtual UClassID getDynamicClassID() const;
 136
 137 private:
 138     // ===================== PRIVATES ==============================
 139     // Private default constructor.
 140     CanonicalIterator();
 141
 142
 143     /**
 144      * Copy constructor. Private for now, so callers cannot copy an iterator.
 145      * @internal
 146      */
 147     CanonicalIterator(const CanonicalIterator& other);
 148
 149     /**
 150      * Assignment operator. Private for now, so callers cannot assign an iterator.
 151      * @internal
 152      */
 153     CanonicalIterator& operator=(const CanonicalIterator& other);
 154
 155     // fields
 156     UnicodeString source;
 157     UBool done;
 158
 159     // 2-dimensional array that holds the pieces of the string with
 160     // their different canonically equivalent representations
 161     UnicodeString **pieces;
 162     int32_t pieces_length;
 163     int32_t *pieces_lengths;
 164
 165     // current is used while iterating to combine pieces
 166     int32_t *current;
 167     int32_t current_length;
 168
 169     // transient fields
 170     UnicodeString buffer;
 171
 172     // Given a segment, in NFD, find all the strings that are canonically equivalent to it.
 173     UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status); //private String[] getEquivalents(String segment)
 174
 175     //Set getEquivalents2(String segment);
 176     Hashtable *getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status);
 177     //Hashtable *getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status);
 178
 179     /**
 180      * See if the decomposition of cp2 (presumably the comp parameter) is at segment starting at segmentPos
 181      * (with canonical rearrangement!)
 182      * If so, take the remainder, and return the equivalents.
 183      */
 184     //Set extract(int comp, String segment, int segmentPos, StringBuffer buffer);
 185     Hashtable *extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
 186     //Hashtable *extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
 187
 188     void cleanPieces(); // presumably releases the pieces arrays (the destructor is documented as cleaning pieces) - TODO confirm
 189
 190 };
 191
 192 U_NAMESPACE_END
 193
 194 #endif /* #if !UCONFIG_NO_NORMALIZATION */
 195
 196 #endif