git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationruleparser.h
ICU-531.31.tar.gz
[apple/icu.git] / icuSources / i18n / collationruleparser.h
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationruleparser.h
7 *
8 * created on: 2013apr10
9 * created by: Markus W. Scherer
10 */
11
12 #ifndef __COLLATIONRULEPARSER_H__
13 #define __COLLATIONRULEPARSER_H__
14
15 #include "unicode/utypes.h"
16
17 #if !UCONFIG_NO_COLLATION
18
19 #include "unicode/ucol.h"
20 #include "unicode/uniset.h"
21 #include "unicode/unistr.h"
22
/*
 * Forward declaration placed before U_NAMESPACE_BEGIN.
 * This header refers to UParseError only through pointers.
 */
23 struct UParseError;
24
25 U_NAMESPACE_BEGIN
26
/*
 * Forward declarations, so that the full definitions need not be included here.
 * CollationData, Normalizer2 and CollationSettings are used below only through
 * pointers or references.
 * CollationTailoring and Locale are declared but not referenced by any declaration
 * visible in this header.
 */
27 struct CollationData;
28 struct CollationTailoring;
29
30 class Locale;
31 class Normalizer2;
32
33 struct CollationSettings;
34
/**
 * Parser for collation rule strings.
 * Parsed resets and relations are handed to a caller-supplied Sink;
 * an optional Importer supplies the rules for [import locale] syntax.
 * Only the interface is declared here; the out-of-line members are defined elsewhere.
 */
35 class U_I18N_API CollationRuleParser : public UMemory {
36 public:
37 /**
 * Special reset positions.
 * A Position value is added to POS_BASE to form a contraction character,
 * so the numeric values (and therefore the enumerator order) are significant.
 */
38 enum Position {
39 FIRST_TERTIARY_IGNORABLE,
40 LAST_TERTIARY_IGNORABLE,
41 FIRST_SECONDARY_IGNORABLE,
42 LAST_SECONDARY_IGNORABLE,
43 FIRST_PRIMARY_IGNORABLE,
44 LAST_PRIMARY_IGNORABLE,
45 FIRST_VARIABLE,
46 LAST_VARIABLE,
47 FIRST_REGULAR,
48 LAST_REGULAR,
49 FIRST_IMPLICIT,
50 LAST_IMPLICIT,
51 FIRST_TRAILING,
52 LAST_TRAILING
53 };
54
55 /**
56 * First character of contractions that encode special reset positions.
57 * U+FFFE cannot be tailored via rule syntax.
58 *
59 * The second contraction character is POS_BASE + Position.
60 */
61 static const UChar POS_LEAD = 0xfffe;
62 /**
63 * Base for the second character of contractions that encode special reset positions.
64 * Braille characters U+28xx are printable and normalization-inert.
65 * @see POS_LEAD
66 */
67 static const UChar POS_BASE = 0x2800;
68
/**
 * Receiver interface for the elements of a parsed rule string.
 * addReset() and addRelation() are pure virtual and must be implemented.
 * suppressContractions() and optimize() are virtual but not pure,
 * so their default behavior is defined outside this header.
 * Every callback takes errorReason and errorCode by reference.
 * NOTE(review): presumably implementations set both on failure; confirm in the implementers.
 */
69 class U_I18N_API Sink : public UObject {
70 public:
71 virtual ~Sink();
72 /**
73 * Adds a reset.
74 * strength=UCOL_IDENTICAL for &str.
75 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
76 */
77 virtual void addReset(int32_t strength, const UnicodeString &str,
78 const char *&errorReason, UErrorCode &errorCode) = 0;
79 /**
80 * Adds a relation with strength and prefix | str / extension.
81 */
82 virtual void addRelation(int32_t strength, const UnicodeString &prefix,
83 const UnicodeString &str, const UnicodeString &extension,
84 const char *&errorReason, UErrorCode &errorCode) = 0;
85
/**
 * Callback that receives a UnicodeSet. Overriding is optional (not pure virtual).
 * NOTE(review): presumably invoked for a [suppressContractions ...] setting; confirm in the .cpp.
 */
86 virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
87 UErrorCode &errorCode);
88
/**
 * Callback that receives a UnicodeSet. Overriding is optional (not pure virtual).
 * NOTE(review): presumably invoked for an [optimize ...] setting; confirm in the .cpp.
 */
89 virtual void optimize(const UnicodeSet &set, const char *&errorReason,
90 UErrorCode &errorCode);
91 };
92
/**
 * Interface through which the parser obtains the rules of another collator.
 * Without an Importer, [import locale] syntax is not supported (see the constructor).
 */
93 class U_I18N_API Importer : public UObject {
94 public:
95 virtual ~Importer();
/**
 * Returns a pointer to the rule string for the given localeID and collationType.
 * NOTE(review): ownership of the returned string and the failure convention
 * (NULL result, errorReason, errorCode) are not visible in this header; confirm in the implementers.
 */
96 virtual const UnicodeString *getRules(
97 const char *localeID, const char *collationType,
98 const char *&errorReason, UErrorCode &errorCode) = 0;
99 };
100
101 /**
102 * Constructor.
103 * The Sink must be set before parsing.
104 * The Importer can be set, otherwise [import locale] syntax is not supported.
105 */
106 CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
107 ~CollationRuleParser();
108
109 /**
110 * Sets the pointer to a Sink object.
111 * The pointer is aliased: Pointer copy without cloning or taking ownership.
112 */
113 void setSink(Sink *sinkAlias) {
114 sink = sinkAlias;
115 }
116
117 /**
118 * Sets the pointer to an Importer object.
119 * The pointer is aliased: Pointer copy without cloning or taking ownership.
120 */
121 void setImporter(Importer *importerAlias) {
122 importer = importerAlias;
123 }
124
/**
 * Public entry point: parses ruleString.
 * A Sink must have been set beforehand (see the constructor).
 * NOTE(review): presumably outSettings receives the parsed settings and outParseError
 * the error location, and outParseError may be NULL; confirm in the .cpp.
 */
125 void parse(const UnicodeString &ruleString,
126 CollationSettings &outSettings,
127 UParseError *outParseError,
128 UErrorCode &errorCode);
129
/** Returns the stored errorReason pointer as is; no copy is made. */
130 const char *getErrorReason() const { return errorReason; }
131
132 /**
133 * Gets a script or reorder code from its string representation.
134 * @return the script/reorder code, or
135 * -1==UCOL_REORDER_CODE_DEFAULT, or
136 * -2 if not recognized
137 */
138 static int32_t getReorderCode(const char *word);
139
140 private:
141 /**
 * UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15
 * NOTE(review): STRENGTH_MASK, STARRED_FLAG and OFFSET_SHIFT look like the bit layout of a
 * packed int32_t (strength in the low 4 bits, a flag at bit 4, an offset from bit 8 up),
 * presumably the result of parseRelationOperator(); confirm in the .cpp.
 */
142 static const int32_t STRENGTH_MASK = 0xf;
143 static const int32_t STARRED_FLAG = 0x10;
144 static const int32_t OFFSET_SHIFT = 8;
145
/*
 * Private parsing helpers.
 * Several take a rule index i and return an int32_t.
 * NOTE(review): presumably the returned value is the index after the parsed element,
 * as documented for parseSpecialPosition(); confirm in the .cpp.
 */
146 void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
147 void parseRuleChain(UErrorCode &errorCode);
148 int32_t parseResetAndPosition(UErrorCode &errorCode);
149 int32_t parseRelationOperator(UErrorCode &errorCode);
150 void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
151 void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
152 int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
153 int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
154
155 /**
156 * Sets str to a contraction of U+FFFE and (U+2800 + Position).
157 * @return rule index after the special reset position
158 */
159 int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
160 void parseSetting(UErrorCode &errorCode);
161 void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
162 static UColAttributeValue getOnOffValue(const UnicodeString &s);
163
164 int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
165 int32_t readWords(int32_t i, UnicodeString &raw) const;
166 int32_t skipComment(int32_t i) const;
167
168 void setParseError(const char *reason, UErrorCode &errorCode);
169 void setErrorContext();
170
171 /**
172 * ASCII [:P:] and [:S:]:
173 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
174 */
175 static UBool isSyntaxChar(UChar32 c);
176 int32_t skipWhiteSpace(int32_t i) const;
177
/*
 * Both nfd and nfc are references: the & belongs to each declarator.
 * Reference members must be bound when the object is constructed and cannot be reseated.
 */
178 const Normalizer2 &nfd, &nfc;
179
/*
 * Parser state. All pointers below are plain, non-owning-looking raw pointers.
 * baseData is a const pointer to const data, so it cannot be changed after construction.
 * NOTE(review): presumably rules, settings and parseError alias the arguments of
 * the public parse(); confirm in the .cpp.
 */
180 const UnicodeString *rules;
181 const CollationData *const baseData;
182 CollationSettings *settings;
183 UParseError *parseError;
184 const char *errorReason;
185
/* Aliased, not owned: assigned by setSink() and setImporter(). */
186 Sink *sink;
187 Importer *importer;
188
/* NOTE(review): presumably the current parse position within *rules; confirm in the .cpp. */
189 int32_t ruleIndex;
190 };
191
192 U_NAMESPACE_END
193
194 #endif // !UCONFIG_NO_COLLATION
195 #endif // __COLLATIONRULEPARSER_H__