]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/collationruleparser.h
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / collationruleparser.h
CommitLineData
57a6839d
A
1/*
2*******************************************************************************
3* Copyright (C) 2013-2014, International Business Machines
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* collationruleparser.h
7*
8* created on: 2013apr10
9* created by: Markus W. Scherer
10*/
11
12#ifndef __COLLATIONRULEPARSER_H__
13#define __COLLATIONRULEPARSER_H__
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_COLLATION
18
19#include "unicode/ucol.h"
20#include "unicode/uniset.h"
21#include "unicode/unistr.h"
22
23struct UParseError;
24
25U_NAMESPACE_BEGIN
26
27struct CollationData;
28struct CollationTailoring;
29
30class Locale;
31class Normalizer2;
32
33struct CollationSettings;
34
/**
 * Parser interface for collation rule strings.
 *
 * Usage order, per the constructor documentation below: construct with the base
 * CollationData, call setSink() (required before parsing), optionally call
 * setImporter() (needed only for [import locale] syntax), then call parse().
 * Only declarations are visible in this header; the parsing logic lives elsewhere.
 */
35class U_I18N_API CollationRuleParser : public UMemory {
36public:
37 /** Special reset positions. */
 // No explicit values are given, so the enumerators number 0..13 in declaration order.
 // That ordinal is what gets added to POS_BASE (see POS_LEAD), giving U+2800..U+280D.
38 enum Position {
39 FIRST_TERTIARY_IGNORABLE,
40 LAST_TERTIARY_IGNORABLE,
41 FIRST_SECONDARY_IGNORABLE,
42 LAST_SECONDARY_IGNORABLE,
43 FIRST_PRIMARY_IGNORABLE,
44 LAST_PRIMARY_IGNORABLE,
45 FIRST_VARIABLE,
46 LAST_VARIABLE,
47 FIRST_REGULAR,
48 LAST_REGULAR,
49 FIRST_IMPLICIT,
50 LAST_IMPLICIT,
51 FIRST_TRAILING,
52 LAST_TRAILING
53 };
54
55 /**
56 * First character of contractions that encode special reset positions.
57 * U+FFFE cannot be tailored via rule syntax.
58 *
59 * The second contraction character is POS_BASE + Position.
60 */
61 static const UChar POS_LEAD = 0xfffe;
62 /**
63 * Base for the second character of contractions that encode special reset positions.
64 * Braille characters U+28xx are printable and normalization-inert.
65 * @see POS_LEAD
66 */
67 static const UChar POS_BASE = 0x2800;
68
 /**
  * Callback interface handed to the parser via setSink().
  * addReset() and addRelation() are pure virtual and must be implemented;
  * suppressContractions() and optimize() are not pure, so a base-class
  * implementation has to exist outside this header and overriding is optional.
  * All four take errorReason as a reference to a pointer plus a UErrorCode reference.
  * NOTE(review): presumably the implementation sets both on failure -- confirm in the .cpp.
  */
69 class U_I18N_API Sink : public UObject {
70 public:
71 virtual ~Sink();
72 /**
73 * Adds a reset.
74 * strength=UCOL_IDENTICAL for &str.
75 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
76 */
77 virtual void addReset(int32_t strength, const UnicodeString &str,
78 const char *&errorReason, UErrorCode &errorCode) = 0;
79 /**
80 * Adds a relation with strength and prefix | str / extension.
81 */
82 virtual void addRelation(int32_t strength, const UnicodeString &prefix,
83 const UnicodeString &str, const UnicodeString &extension,
84 const char *&errorReason, UErrorCode &errorCode) = 0;
85
 // Optional hook taking a UnicodeSet.
 // NOTE(review): presumably called for a [suppressContractions ...] rule setting -- confirm.
86 virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
87 UErrorCode &errorCode);
88
 // Optional hook taking a UnicodeSet.
 // NOTE(review): presumably called for an [optimize ...] rule setting -- confirm.
89 virtual void optimize(const UnicodeSet &set, const char *&errorReason,
90 UErrorCode &errorCode);
91 };
92
 /**
  * Callback interface handed to the parser via setImporter().
  * Its single operation, getRules(), is pure virtual. It takes a locale ID and a
  * collation type as C strings and has a non-const UnicodeString output parameter.
  * NOTE(review): presumably fills `rules` with the rule string for that
  * locale/type pair, for [import locale] syntax -- confirm against an implementation.
  */
93 class U_I18N_API Importer : public UObject {
94 public:
95 virtual ~Importer();
b331163b 96 virtual void getRules(
57a6839d 97 const char *localeID, const char *collationType,
b331163b 98 UnicodeString &rules,
57a6839d
A
99 const char *&errorReason, UErrorCode &errorCode) = 0;
100 };
101
102 /**
103 * Constructor.
104 * The Sink must be set before parsing.
105 * The Importer can be set, otherwise [import locale] syntax is not supported.
106 */
107 CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
108 ~CollationRuleParser();
109
110 /**
111 * Sets the pointer to a Sink object.
112 * The pointer is aliased: Pointer copy without cloning or taking ownership.
113 */
114 void setSink(Sink *sinkAlias) {
115 sink = sinkAlias;
116 }
117
118 /**
119 * Sets the pointer to an Importer object.
120 * The pointer is aliased: Pointer copy without cloning or taking ownership.
121 */
122 void setImporter(Importer *importerAlias) {
123 importer = importerAlias;
124 }
125
 /**
  * Public parsing entry point.
  * @param ruleString    the rule text to parse (read-only)
  * @param outSettings   non-const settings object; NOTE(review): presumably
  *                      updated by rule settings -- confirm
  * @param outParseError pointer to a UParseError; NOTE(review): whether NULL is
  *                      accepted is not visible in this header -- confirm
  * @param errorCode     ICU error code, in/out
  */
126 void parse(const UnicodeString &ruleString,
127 CollationSettings &outSettings,
128 UParseError *outParseError,
129 UErrorCode &errorCode);
130
 /** Returns the current value of the errorReason member; does not modify the parser. */
131 const char *getErrorReason() const { return errorReason; }
132
133 /**
134 * Gets a script or reorder code from its string representation.
135 * @return the script/reorder code, or
b331163b 136 * -1 if not recognized
57a6839d
A
137 */
138 static int32_t getReorderCode(const char *word);
139
140private:
 // STRENGTH_MASK covers the low 4 bits, STARRED_FLAG is the next bit up (bit 4),
 // and OFFSET_SHIFT is 8.
 // NOTE(review): presumably these describe one packed int32_t
 // (strength | starred flag | offset << 8), e.g. the value returned by
 // parseRelationOperator() -- confirm in the .cpp.
141 /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
142 static const int32_t STRENGTH_MASK = 0xf;
143 static const int32_t STARRED_FLAG = 0x10;
144 static const int32_t OFFSET_SHIFT = 8;
145
 // Private overload of parse() without the settings and parse-error parameters.
 // The int32_t `i` parameters and int32_t returns of the helpers below follow the
 // "rule index" convention documented on parseSpecialPosition().
 // NOTE(review): assumed to hold for all of them -- confirm.
146 void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
147 void parseRuleChain(UErrorCode &errorCode);
148 int32_t parseResetAndPosition(UErrorCode &errorCode);
149 int32_t parseRelationOperator(UErrorCode &errorCode);
150 void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
151 void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
152 int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
153 int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
154
155 /**
156 * Sets str to a contraction of U+FFFE and (U+2800 + Position).
157 * @return rule index after the special reset position
158 */
159 int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
160 void parseSetting(UErrorCode &errorCode);
161 void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
162 static UColAttributeValue getOnOffValue(const UnicodeString &s);
163
 // readWords() and skipComment() are const: they cannot modify parser members.
164 int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
165 int32_t readWords(int32_t i, UnicodeString &raw) const;
166 int32_t skipComment(int32_t i) const;
167
168 void setParseError(const char *reason, UErrorCode &errorCode);
169 void setErrorContext();
170
171 /**
172 * ASCII [:P:] and [:S:]:
173 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
174 */
175 static UBool isSyntaxChar(UChar32 c);
176 int32_t skipWhiteSpace(int32_t i) const;
177
 // Reference members: they must be bound in the constructor's initializer list
 // and cannot be reseated afterwards.
178 const Normalizer2 &nfd, &nfc;
179
 // `rules` is a pointer to a const string, not a copy.
 // NOTE(review): presumably points at the ruleString passed to parse(), so that
 // string must outlive the parse -- confirm.
180 const UnicodeString *rules;
 // Const pointer to const data: fixed at construction.
181 const CollationData *const baseData;
182 CollationSettings *settings;
183 UParseError *parseError;
 // Exposed read-only through getErrorReason().
184 const char *errorReason;
185
 // Aliased, not owned: see setSink() and setImporter().
186 Sink *sink;
187 Importer *importer;
188
 // NOTE(review): presumably the current parse position within *rules -- confirm.
189 int32_t ruleIndex;
190};
191
192U_NAMESPACE_END
193
194#endif // !UCONFIG_NO_COLLATION
195#endif // __COLLATIONRULEPARSER_H__