]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ******************************************************************************* | |
5 | * Copyright (C) 2013-2014, International Business Machines | |
6 | * Corporation and others. All Rights Reserved. | |
7 | ******************************************************************************* | |
8 | * collationruleparser.h | |
9 | * | |
10 | * created on: 2013apr10 | |
11 | * created by: Markus W. Scherer | |
12 | */ | |
13 | ||
14 | #ifndef __COLLATIONRULEPARSER_H__ | |
15 | #define __COLLATIONRULEPARSER_H__ | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | #if !UCONFIG_NO_COLLATION | |
20 | ||
21 | #include "unicode/ucol.h" | |
22 | #include "unicode/uniset.h" | |
23 | #include "unicode/unistr.h" | |
24 | ||
25 | struct UParseError; /* forward declaration at global scope; this header uses it only through pointers */ | |
26 | ||
27 | U_NAMESPACE_BEGIN | |
28 | ||
29 | struct CollationData; /* used below only as const CollationData pointers */ | |
30 | struct CollationTailoring; /* NOTE(review): not referenced by any declaration visible in this header */ | |
31 | ||
32 | class Locale; /* NOTE(review): not referenced by any declaration visible in this header */ | |
33 | class Normalizer2; /* used below only for the reference members nfd and nfc */ | |
34 | ||
35 | struct CollationSettings; /* used below only by reference (parse) and by pointer (settings member) */ | |
36 | ||
37 | class U_I18N_API CollationRuleParser : public UMemory { /* Parser for collation rule strings; see parse(). A Sink must be set before parsing, an Importer is optional. */ | |
38 | public: | |
39 | /** Special reset positions. Enumerator order matters: the numeric value is added to POS_BASE (see POS_LEAD). */ | |
40 | enum Position { | |
41 | FIRST_TERTIARY_IGNORABLE, | |
42 | LAST_TERTIARY_IGNORABLE, | |
43 | FIRST_SECONDARY_IGNORABLE, | |
44 | LAST_SECONDARY_IGNORABLE, | |
45 | FIRST_PRIMARY_IGNORABLE, | |
46 | LAST_PRIMARY_IGNORABLE, | |
47 | FIRST_VARIABLE, | |
48 | LAST_VARIABLE, | |
49 | FIRST_REGULAR, | |
50 | LAST_REGULAR, | |
51 | FIRST_IMPLICIT, | |
52 | LAST_IMPLICIT, | |
53 | FIRST_TRAILING, | |
54 | LAST_TRAILING | |
55 | }; | |
56 | ||
57 | /** | |
58 | * First character of contractions that encode special reset positions. | |
59 | * U+FFFE cannot be tailored via rule syntax. | |
60 | * | |
61 | * The second contraction character is POS_BASE + Position. | |
62 | */ | |
63 | static const UChar POS_LEAD = 0xfffe; | |
64 | /** | |
65 | * Base for the second character of contractions that encode special reset positions. | |
66 | * Braille characters U+28xx are printable and normalization-inert. | |
67 | * @see POS_LEAD | |
68 | */ | |
69 | static const UChar POS_BASE = 0x2800; | |
70 | ||
71 | class U_I18N_API Sink : public UObject { /* Callback interface installed with setSink(); addReset() and addRelation() are pure virtual, the other two methods are not. */ | |
72 | public: | |
73 | virtual ~Sink(); | |
74 | /** | |
75 | * Adds a reset. | |
76 | * strength=UCOL_IDENTICAL for &str. | |
77 | * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. | |
78 | */ | |
79 | virtual void addReset(int32_t strength, const UnicodeString &str, | |
80 | const char *&errorReason, UErrorCode &errorCode) = 0; | |
81 | /** | |
82 | * Adds a relation with strength and prefix | str / extension. | |
83 | */ | |
84 | virtual void addRelation(int32_t strength, const UnicodeString &prefix, | |
85 | const UnicodeString &str, const UnicodeString &extension, | |
86 | const char *&errorReason, UErrorCode &errorCode) = 0; | |
87 | ||
88 | virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason, | |
89 | UErrorCode &errorCode); /* Not pure virtual: a default implementation is defined outside this header, so subclasses may leave it alone. */ | |
90 | ||
91 | virtual void optimize(const UnicodeSet &set, const char *&errorReason, | |
92 | UErrorCode &errorCode); /* Not pure virtual: a default implementation is defined outside this header, so subclasses may leave it alone. */ | |
93 | }; | |
94 | ||
95 | class U_I18N_API Importer : public UObject { /* Callback interface installed with setImporter(); without one, [import locale] syntax is not supported. */ | |
96 | public: | |
97 | virtual ~Importer(); | |
98 | virtual void getRules( | |
99 | const char *localeID, const char *collationType, | |
100 | UnicodeString &rules, | |
101 | const char *&errorReason, UErrorCode &errorCode) = 0; /* NOTE(review): rules is a non-const reference, presumably the output for the imported rule text -- confirm against implementations. */ | |
102 | }; | |
103 | ||
104 | /** | |
105 | * Constructor. | |
106 | * The Sink must be set before parsing. | |
107 | * The Importer can be set, otherwise [import locale] syntax is not supported. | |
108 | */ | |
109 | CollationRuleParser(const CollationData *base, UErrorCode &errorCode); | |
110 | ~CollationRuleParser(); | |
111 | ||
112 | /** | |
113 | * Sets the pointer to a Sink object. | |
114 | * The pointer is aliased: Pointer copy without cloning or taking ownership. | |
115 | */ | |
116 | void setSink(Sink *sinkAlias) { | |
117 | sink = sinkAlias; | |
118 | } | |
119 | ||
120 | /** | |
121 | * Sets the pointer to an Importer object. | |
122 | * The pointer is aliased: Pointer copy without cloning or taking ownership. | |
123 | */ | |
124 | void setImporter(Importer *importerAlias) { | |
125 | importer = importerAlias; | |
126 | } | |
127 | ||
128 | void parse(const UnicodeString &ruleString, | |
129 | CollationSettings &outSettings, | |
130 | UParseError *outParseError, | |
131 | UErrorCode &errorCode); /* Public parsing entry point; a private overload taking only the rule string and error code is declared further down. NOTE(review): whether outParseError may be NULL is not visible in this header. */ | |
132 | ||
133 | const char *getErrorReason() const { return errorReason; } /* Returns the errorReason member pointer as-is; nothing is copied. */ | |
134 | ||
135 | /** | |
136 | * Gets a script or reorder code from its string representation. | |
137 | * @return the script/reorder code, or | |
138 | * -1 if not recognized | |
139 | */ | |
140 | static int32_t getReorderCode(const char *word); | |
141 | ||
142 | private: | |
143 | /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */ | |
144 | static const int32_t STRENGTH_MASK = 0xf; | |
145 | static const int32_t STARRED_FLAG = 0x10; /* the first bit above STRENGTH_MASK */ | |
146 | static const int32_t OFFSET_SHIFT = 8; /* NOTE(review): presumably the shift for an offset packed above the strength and flag bits -- confirm in the implementation file */ | |
147 | ||
148 | void parse(const UnicodeString &ruleString, UErrorCode &errorCode); | |
149 | void parseRuleChain(UErrorCode &errorCode); | |
150 | int32_t parseResetAndPosition(UErrorCode &errorCode); | |
151 | int32_t parseRelationOperator(UErrorCode &errorCode); | |
152 | void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode); | |
153 | void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode); | |
154 | int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); | |
155 | int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); | |
156 | ||
157 | /** | |
158 | * Sets str to a contraction of U+FFFE and (U+2800 + Position). | |
159 | * @return rule index after the special reset position | |
160 | */ | |
161 | int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode); | |
162 | void parseSetting(UErrorCode &errorCode); | |
163 | void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); | |
164 | static UColAttributeValue getOnOffValue(const UnicodeString &s); | |
165 | ||
166 | int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); | |
167 | int32_t readWords(int32_t i, UnicodeString &raw) const; | |
168 | int32_t skipComment(int32_t i) const; | |
169 | ||
170 | void setParseError(const char *reason, UErrorCode &errorCode); | |
171 | void setErrorContext(); | |
172 | ||
173 | /** | |
174 | * ASCII [:P:] and [:S:]: | |
175 | * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] | |
176 | */ | |
177 | static UBool isSyntaxChar(UChar32 c); | |
178 | int32_t skipWhiteSpace(int32_t i) const; | |
179 | ||
180 | const Normalizer2 &nfd, &nfc; /* reference members: they must be bound in the constructor's initializer list */ | |
181 | ||
182 | const UnicodeString *rules; | |
183 | const CollationData *const baseData; /* const pointer: can only be set in the constructor's initializer list */ | |
184 | CollationSettings *settings; | |
185 | UParseError *parseError; | |
186 | const char *errorReason; /* exposed read-only through getErrorReason() */ | |
187 | ||
188 | Sink *sink; /* aliased, not owned; see setSink() */ | |
189 | Importer *importer; /* aliased, not owned; see setImporter() */ | |
190 | ||
191 | int32_t ruleIndex; /* NOTE(review): presumably the current parse position within *rules -- confirm in the implementation file */ | |
192 | }; | |
193 | ||
194 | U_NAMESPACE_END | |
195 | ||
196 | #endif // !UCONFIG_NO_COLLATION | |
197 | #endif // __COLLATIONRULEPARSER_H__ |