]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2013-2014, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ******************************************************************************* | |
6 | * collationruleparser.h | |
7 | * | |
8 | * created on: 2013apr10 | |
9 | * created by: Markus W. Scherer | |
10 | */ | |
11 | ||
12 | #ifndef __COLLATIONRULEPARSER_H__ | |
13 | #define __COLLATIONRULEPARSER_H__ | |
14 | ||
15 | #include "unicode/utypes.h" | |
16 | ||
17 | #if !UCONFIG_NO_COLLATION | |
18 | ||
19 | #include "unicode/ucol.h" | |
20 | #include "unicode/uniset.h" | |
21 | #include "unicode/unistr.h" | |
22 | ||
23 | struct UParseError; | |
24 | ||
25 | U_NAMESPACE_BEGIN | |
26 | ||
// Forward declarations. Where this header uses these types at all, it does so
// only through pointers or references, so the full definitions are not needed here.
// (CollationTailoring and Locale are not referenced by any declaration in this header.)
27 | struct CollationData; | |
28 | struct CollationTailoring; | |
29 | ||
30 | class Locale; | |
31 | class Normalizer2; | |
32 | ||
33 | struct CollationSettings; | |
34 | ||
/**
 * Parser for collation rule strings.
 *
 * Resets and relations found in the rules are handed to a Sink, which must be set
 * before parsing. An Importer may be set as well; without one, [import locale]
 * syntax is not supported. Both are stored as aliased (non-owned) pointers.
 */
35 | class U_I18N_API CollationRuleParser : public UMemory { | |
36 | public: | |
37 | /** Special reset positions. */ | |
38 | enum Position { | |
39 | FIRST_TERTIARY_IGNORABLE, | |
40 | LAST_TERTIARY_IGNORABLE, | |
41 | FIRST_SECONDARY_IGNORABLE, | |
42 | LAST_SECONDARY_IGNORABLE, | |
43 | FIRST_PRIMARY_IGNORABLE, | |
44 | LAST_PRIMARY_IGNORABLE, | |
45 | FIRST_VARIABLE, | |
46 | LAST_VARIABLE, | |
47 | FIRST_REGULAR, | |
48 | LAST_REGULAR, | |
49 | FIRST_IMPLICIT, | |
50 | LAST_IMPLICIT, | |
51 | FIRST_TRAILING, | |
52 | LAST_TRAILING | |
53 | }; | |
54 | ||
55 | /** | |
56 | * First character of contractions that encode special reset positions. | |
57 | * U+FFFE cannot be tailored via rule syntax. | |
58 | * | |
59 | * The second contraction character is POS_BASE + Position. | |
60 | */ | |
61 | static const UChar POS_LEAD = 0xfffe; | |
62 | /** | |
63 | * Base for the second character of contractions that encode special reset positions. | |
64 | * Braille characters U+28xx are printable and normalization-inert. | |
65 | * @see POS_LEAD | |
66 | */ | |
67 | static const UChar POS_BASE = 0x2800; | |
68 | ||
/**
 * Callback interface that receives the parsed rules.
 * addReset() and addRelation() are pure virtual and must be implemented.
 * suppressContractions() and optimize() are not pure, so overriding them is optional.
 */
69 | class U_I18N_API Sink : public UObject { | |
70 | public: | |
71 | virtual ~Sink(); | |
72 | /** | |
73 | * Adds a reset. | |
74 | * strength=UCOL_IDENTICAL for &str. | |
75 | * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. | |
76 | */ | |
77 | virtual void addReset(int32_t strength, const UnicodeString &str, | |
78 | const char *&errorReason, UErrorCode &errorCode) = 0; | |
79 | /** | |
80 | * Adds a relation with strength and prefix | str / extension. | |
81 | */ | |
82 | virtual void addRelation(int32_t strength, const UnicodeString &prefix, | |
83 | const UnicodeString &str, const UnicodeString &extension, | |
84 | const char *&errorReason, UErrorCode &errorCode) = 0; | |
85 | ||
/**
 * Receives a UnicodeSet plus the usual errorReason/errorCode in/out references.
 * Not pure virtual: a default implementation is defined outside this header.
 * NOTE(review): judging by the name, the set comes from a suppress-contractions
 * setting in the rules -- confirm against the implementation.
 */
86 | virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason, | |
87 | UErrorCode &errorCode); | |
88 | ||
/**
 * Receives a UnicodeSet plus the usual errorReason/errorCode in/out references.
 * Not pure virtual: a default implementation is defined outside this header.
 * NOTE(review): judging by the name, the set comes from an optimize setting
 * in the rules -- confirm against the implementation.
 */
89 | virtual void optimize(const UnicodeSet &set, const char *&errorReason, | |
90 | UErrorCode &errorCode); | |
91 | }; | |
92 | ||
/**
 * Callback interface for obtaining rule strings by locale ID and collation type.
 * Needed for [import locale] syntax (see the constructor documentation below).
 */
93 | class U_I18N_API Importer : public UObject { | |
94 | public: | |
95 | virtual ~Importer(); | |
/**
 * Pure virtual. Takes a locale ID and a collation type and writes into the
 * caller-provided rules string; errors are reported via errorReason/errorCode.
 * NOTE(review): the exact failure contract is not visible in this header.
 */
b331163b | 96 | virtual void getRules( |
57a6839d | 97 | const char *localeID, const char *collationType, |
b331163b | 98 | UnicodeString &rules, |
57a6839d A |
99 | const char *&errorReason, UErrorCode &errorCode) = 0; |
100 | }; | |
101 | ||
102 | /** | |
103 | * Constructor. | |
104 | * The Sink must be set before parsing. | |
105 | * The Importer can be set, otherwise [import locale] syntax is not supported. | |
106 | */ | |
107 | CollationRuleParser(const CollationData *base, UErrorCode &errorCode); | |
108 | ~CollationRuleParser(); | |
109 | ||
110 | /** | |
111 | * Sets the pointer to a Sink object. | |
112 | * The pointer is aliased: Pointer copy without cloning or taking ownership. | |
113 | */ | |
114 | void setSink(Sink *sinkAlias) { | |
115 | sink = sinkAlias; | |
116 | } | |
117 | ||
118 | /** | |
119 | * Sets the pointer to an Importer object. | |
120 | * The pointer is aliased: Pointer copy without cloning or taking ownership. | |
121 | */ | |
122 | void setImporter(Importer *importerAlias) { | |
123 | importer = importerAlias; | |
124 | } | |
125 | ||
/**
 * Public entry point: parses ruleString.
 * Takes the caller's CollationSettings by reference and a UParseError pointer.
 * NOTE(review): presumably these back the settings/parseError members during
 * the parse, and whether outParseError may be NULL is not visible here -- confirm.
 */
126 | void parse(const UnicodeString &ruleString, | |
127 | CollationSettings &outSettings, | |
128 | UParseError *outParseError, | |
129 | UErrorCode &errorCode); | |
130 | ||
/**
 * Returns the stored errorReason pointer.
 * NOTE(review): presumably set via setParseError(); lifetime of the string is not visible here.
 */
131 | const char *getErrorReason() const { return errorReason; } | |
132 | ||
133 | /** | |
134 | * Gets a script or reorder code from its string representation. | |
135 | * @return the script/reorder code, or | |
b331163b | 136 | * -1 if not recognized |
57a6839d A |
137 | */ |
138 | static int32_t getReorderCode(const char *word); | |
139 | ||
140 | private: | |
// NOTE(review): the three constants below look like the bit layout of a packed
// int32_t (strength in the low 4 bits, a "starred" flag at bit 4, an offset from
// bit 8 upward), presumably the value returned by parseRelationOperator() -- confirm.
141 | /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */ | |
142 | static const int32_t STRENGTH_MASK = 0xf; | |
143 | static const int32_t STARRED_FLAG = 0x10; | |
144 | static const int32_t OFFSET_SHIFT = 8; | |
145 | ||
// Internal parsing steps. The int32_t parameter i is a rule index, and the
// int32_t-returning parse*/skip*/read* helpers presumably return the index after
// the consumed text (documented explicitly only for parseSpecialPosition()).
146 | void parse(const UnicodeString &ruleString, UErrorCode &errorCode); | |
147 | void parseRuleChain(UErrorCode &errorCode); | |
148 | int32_t parseResetAndPosition(UErrorCode &errorCode); | |
149 | int32_t parseRelationOperator(UErrorCode &errorCode); | |
150 | void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode); | |
151 | void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode); | |
152 | int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); | |
153 | int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); | |
154 | ||
155 | /** | |
156 | * Sets str to a contraction of U+FFFE and (U+2800 + Position). | |
157 | * @return rule index after the special reset position | |
158 | */ | |
159 | int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode); | |
160 | void parseSetting(UErrorCode &errorCode); | |
161 | void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); | |
162 | static UColAttributeValue getOnOffValue(const UnicodeString &s); | |
163 | ||
164 | int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); | |
165 | int32_t readWords(int32_t i, UnicodeString &raw) const; | |
166 | int32_t skipComment(int32_t i) const; | |
167 | ||
168 | void setParseError(const char *reason, UErrorCode &errorCode); | |
169 | void setErrorContext(); | |
170 | ||
171 | /** | |
172 | * ASCII [:P:] and [:S:]: | |
173 | * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] | |
174 | */ | |
175 | static UBool isSyntaxChar(UChar32 c); | |
176 | int32_t skipWhiteSpace(int32_t i) const; | |
177 | ||
// Normalizer instances held by reference; as reference members they must be
// bound in the constructor's initializer list.
178 | const Normalizer2 &nfd, &nfc; | |
179 | ||
// Parse state. baseData is a const pointer and therefore fixed at construction.
// NOTE(review): rules/settings/parseError presumably point at the arguments of
// the current parse() call and are not owned -- confirm in the implementation.
180 | const UnicodeString *rules; | |
181 | const CollationData *const baseData; | |
182 | CollationSettings *settings; | |
183 | UParseError *parseError; | |
184 | const char *errorReason; | |
185 | ||
// Aliased, not owned: plain pointer copies made by setSink()/setImporter().
186 | Sink *sink; | |
187 | Importer *importer; | |
188 | ||
// NOTE(review): presumably the current parse position within *rules -- confirm.
189 | int32_t ruleIndex; | |
190 | }; | |
191 | ||
192 | U_NAMESPACE_END | |
193 | ||
194 | #endif // !UCONFIG_NO_COLLATION | |
195 | #endif // __COLLATIONRULEPARSER_H__ |