2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationruleparser.h
8 * created on: 2013apr10
9 * created by: Markus W. Scherer
12 #ifndef __COLLATIONRULEPARSER_H__
13 #define __COLLATIONRULEPARSER_H__
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_COLLATION
19 #include "unicode/ucol.h"
20 #include "unicode/uniset.h"
21 #include "unicode/unistr.h"
// Forward declarations of types that are defined elsewhere.
// CollationSettings is used further down only by reference and by pointer.
// CollationTailoring is not referenced in the visible part of this header.
28 struct CollationTailoring
;
33 struct CollationSettings
;
// Parser class for collation rule strings (see the parse() declarations).
// It is exported with U_I18N_API and derives publicly from UMemory.
// A Sink receives the parsed resets and relations, and an Importer may be
// attached to supply rules for the [import locale] syntax.
// NOTE(review): access specifiers and the closing brace of this class are not
// visible in this excerpt; confirm member visibility against the full header.
35 class U_I18N_API CollationRuleParser
: public UMemory
{
37 /** Special reset positions. */
// Enumerators come in FIRST_/LAST_ pairs for the tertiary, secondary and
// primary ignorable groups. The POS_LEAD documentation states that the second
// contraction character is POS_BASE + Position, so these values are used as
// offsets from POS_BASE.
// NOTE(review): the enum head that introduces these enumerators is not visible
// in this excerpt, and the list ends on a trailing comma, so more enumerators
// presumably follow; confirm against the complete header.
39 FIRST_TERTIARY_IGNORABLE
,
40 LAST_TERTIARY_IGNORABLE
,
41 FIRST_SECONDARY_IGNORABLE
,
42 LAST_SECONDARY_IGNORABLE
,
43 FIRST_PRIMARY_IGNORABLE
,
44 LAST_PRIMARY_IGNORABLE
,
// Class-scope UChar constant with the value 0xfffe (U+FFFE).
// It is the lead unit of the two-unit contraction built for a special reset
// position (see parseSpecialPosition).
56 * First character of contractions that encode special reset positions.
57 * U+FFFE cannot be tailored via rule syntax.
59 * The second contraction character is POS_BASE + Position.
61 static const UChar POS_LEAD
= 0xfffe;
// Class-scope UChar constant with the value 0x2800.
// A special reset position is encoded as POS_LEAD followed by
// (POS_BASE + Position), as stated in the POS_LEAD documentation.
63 * Base for the second character of contractions that encode special reset positions.
64 * Braille characters U+28xx are printable and normalization-inert.
67 static const UChar POS_BASE
= 0x2800;
// Callback interface attached to the parser with setSink().
// It derives publicly from UObject and is exported with U_I18N_API.
// addReset and addRelation are pure virtual and must be implemented by a
// subclass. suppressContractions and optimize are virtual but declared without
// a pure specifier, so a definition is expected elsewhere.
// Every callback takes errorReason as a reference to a const char pointer and
// errorCode as a UErrorCode reference, so an implementation can write to both.
// NOTE(review): the access specifier, any destructor declaration and the
// closing brace of this class are not visible in this excerpt.
69 class U_I18N_API Sink
: public UObject
{
// addReset: strength and str describe the reset; the two lines below are the
// surviving part of its original documentation.
74 * strength=UCOL_IDENTICAL for &str.
75 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
77 virtual void addReset(int32_t strength
, const UnicodeString
&str
,
78 const char *&errorReason
, UErrorCode
&errorCode
) = 0;
// addRelation: takes the strength plus three strings (prefix, str, extension),
// matching the prefix | str / extension form named in the line below.
80 * Adds a relation with strength and prefix | str / extension.
82 virtual void addRelation(int32_t strength
, const UnicodeString
&prefix
,
83 const UnicodeString
&str
, const UnicodeString
&extension
,
84 const char *&errorReason
, UErrorCode
&errorCode
) = 0;
// suppressContractions: receives a UnicodeSet by const reference.
// NOTE(review): presumably backs the [suppressContractions] rule setting and
// has a default implementation; neither is visible here.
86 virtual void suppressContractions(const UnicodeSet
&set
, const char *&errorReason
,
87 UErrorCode
&errorCode
);
// optimize: same parameter list as suppressContractions.
// NOTE(review): presumably backs the [optimize] rule setting; confirm.
89 virtual void optimize(const UnicodeSet
&set
, const char *&errorReason
,
90 UErrorCode
&errorCode
);
// Callback interface attached to the parser with setImporter().
// It derives publicly from UObject and is exported with U_I18N_API.
// The constructor documentation states that without an Importer the
// [import locale] syntax is not supported.
// NOTE(review): the access specifier, any destructor declaration and the
// closing brace of this class are not visible in this excerpt.
93 class U_I18N_API Importer
: public UObject
{
// getRules is pure virtual. It takes a locale ID and a collation type as C
// strings and returns a pointer to a const UnicodeString.
// errorReason and errorCode are passed by reference so they can be written.
// NOTE(review): ownership of the returned string and the meaning of a null
// return are not shown here; confirm with an implementation.
96 virtual const UnicodeString
*getRules(
97 const char *localeID
, const char *collationType
,
98 const char *&errorReason
, UErrorCode
&errorCode
) = 0;
// Constructor and destructor declarations; the definitions are not in this
// header excerpt.
// The constructor takes a pointer to const CollationData named base and a
// UErrorCode reference.
// NOTE(review): base presumably initializes the baseData member; confirm in
// the implementation file.
103 * The Sink must be set before parsing.
104 * The Importer can be set, otherwise [import locale] syntax is not supported.
106 CollationRuleParser(const CollationData
*base
, UErrorCode
&errorCode
);
107 ~CollationRuleParser();
// Inline setter taking a Sink pointer named sinkAlias.
// NOTE(review): the statement inside the body and the closing brace are not
// visible in this excerpt. By analogy with setImporter it presumably assigns
// sinkAlias to a sink member; confirm against the complete header.
110 * Sets the pointer to a Sink object.
111 * The pointer is aliased: Pointer copy without cloning or taking ownership.
113 void setSink(Sink
*sinkAlias
) {
// Inline setter: assigns importerAlias to importer.
// The parser does not take ownership, as the documentation lines below state.
// NOTE(review): the closing brace of this body and the declaration of the
// importer member are not visible in this excerpt.
118 * Sets the pointer to an Importer object.
119 * The pointer is aliased: Pointer copy without cloning or taking ownership.
121 void setImporter(Importer
*importerAlias
) {
122 importer
= importerAlias
;
// Four-argument parse overload (a two-argument overload is declared later).
// ruleString   - the rule text, passed by const reference.
// outSettings  - CollationSettings passed by non-const reference.
// outParseError - pointer to a UParseError.
// errorCode    - UErrorCode passed by reference.
// The constructor documentation requires a Sink to be set before parsing.
// NOTE(review): whether outParseError may be null is not shown here, and
// outSettings and outParseError presumably feed the settings and parseError
// members; confirm in the implementation file.
125 void parse(const UnicodeString
&ruleString
,
126 CollationSettings
&outSettings
,
127 UParseError
*outParseError
,
128 UErrorCode
&errorCode
);
// Const accessor that returns the errorReason member (a const char pointer).
// NOTE(review): the value when no error has been recorded is not shown here.
130 const char *getErrorReason() const { return errorReason
; }
// Static function: maps a C string to an int32_t code. The possible results
// are listed in the surviving documentation lines below.
// It needs no parser instance because it is declared static.
133 * Gets a script or reorder code from its string representation.
134 * @return the script/reorder code, or
135 * -1==UCOL_REORDER_CODE_DEFAULT, or
136 * -2 if not recognized
138 static int32_t getReorderCode(const char *word
);
// Three int32_t class constants that describe a bit layout:
// STRENGTH_MASK (0xf) selects the low four bits, STARRED_FLAG (0x10) is the
// next bit up, and OFFSET_SHIFT (8) is a shift count.
// NOTE(review): presumably these pack the int32_t results of
// parseResetAndPosition and parseRelationOperator; confirm in the
// implementation file.
141 /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
142 static const int32_t STRENGTH_MASK
= 0xf;
143 static const int32_t STARRED_FLAG
= 0x10;
144 static const int32_t OFFSET_SHIFT
= 8;
// Declarations of parsing helpers; the definitions are not in this header
// excerpt. All of them report failure through the UErrorCode reference.
// parse (two-argument overload): takes only the rule string and errorCode.
// NOTE(review): presumably called by the four-argument overload; confirm.
146 void parse(const UnicodeString
&ruleString
, UErrorCode
&errorCode
);
// parseRuleChain: no input other than errorCode, so it must work from member
// state.
147 void parseRuleChain(UErrorCode
&errorCode
);
// parseResetAndPosition and parseRelationOperator both return an int32_t.
// NOTE(review): the encoding of the returned value is not shown here; see
// STRENGTH_MASK, STARRED_FLAG and OFFSET_SHIFT for the likely layout.
148 int32_t parseResetAndPosition(UErrorCode
&errorCode
);
149 int32_t parseRelationOperator(UErrorCode
&errorCode
);
// Two helpers with identical parameter lists: a strength, an int32_t named i
// and the UErrorCode reference. Neither returns a value.
// NOTE(review): i is presumably an index into the rule string (compare the
// return documentation of parseSpecialPosition); confirm in the
// implementation file.
150 void parseRelationStrings(int32_t strength
, int32_t i
, UErrorCode
&errorCode
);
151 void parseStarredCharacters(int32_t strength
, int32_t i
, UErrorCode
&errorCode
);
// Two helpers with identical parameter lists: an int32_t named i, a
// UnicodeString passed by non-const reference (an output named raw) and the
// UErrorCode reference. Both return an int32_t.
// NOTE(review): the return value is presumably the index after the parsed
// text, as documented for parseSpecialPosition; confirm in the implementation
// file.
152 int32_t parseTailoringString(int32_t i
, UnicodeString
&raw
, UErrorCode
&errorCode
);
153 int32_t parseString(int32_t i
, UnicodeString
&raw
, UErrorCode
&errorCode
);
// Takes an int32_t named i, writes its result into str (non-const reference)
// and returns an int32_t rule index, as the surviving documentation lines
// below describe. U+FFFE and U+2800 are the values of POS_LEAD and POS_BASE.
156 * Sets str to a contraction of U+FFFE and (U+2800 + Position).
157 * @return rule index after the special reset position
159 int32_t parseSpecialPosition(int32_t i
, UnicodeString
&str
, UErrorCode
&errorCode
);
// Helpers related to rule settings; definitions are not in this excerpt.
// parseSetting: takes only errorCode, so it must work from member state.
160 void parseSetting(UErrorCode
&errorCode
);
// parseReordering: takes a const UnicodeString reference named raw.
// NOTE(review): presumably uses getReorderCode for each word; confirm.
161 void parseReordering(const UnicodeString
&raw
, UErrorCode
&errorCode
);
// getOnOffValue: static, maps a string to a UColAttributeValue.
// NOTE(review): the result for an unrecognized string is not shown here.
162 static UColAttributeValue
getOnOffValue(const UnicodeString
&s
);
// Scanning helpers that take an int32_t named i and return an int32_t.
// NOTE(review): i and the return value are presumably indexes into the rule
// string; confirm in the implementation file.
// parseUnicodeSet: fills a UnicodeSet passed by non-const reference and can
// report failure through errorCode.
164 int32_t parseUnicodeSet(int32_t i
, UnicodeSet
&set
, UErrorCode
&errorCode
);
// readWords: const member function that writes into raw; it has no errorCode
// parameter.
165 int32_t readWords(int32_t i
, UnicodeString
&raw
) const;
// skipComment: const member function with no output parameter.
166 int32_t skipComment(int32_t i
) const;
// Error-reporting helpers; definitions are not in this excerpt.
// setParseError takes a reason string and the UErrorCode reference.
// NOTE(review): presumably stores reason in the errorReason member and calls
// setErrorContext to fill in the parseError member; confirm.
168 void setParseError(const char *reason
, UErrorCode
&errorCode
);
169 void setErrorContext();
// Static predicate on a single code point (UChar32), returning a UBool.
// The surviving documentation lines below give the set of characters it
// describes: ASCII punctuation and symbols in four ranges.
172 * ASCII [:P:] and [:S:]:
173 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
175 static UBool
isSyntaxChar(UChar32 c
);
// Const member function that takes an int32_t named i and returns an int32_t.
// NOTE(review): presumably returns the index of the first non-white-space
// character at or after i in the rule string; confirm in the implementation.
176 int32_t skipWhiteSpace(int32_t i
) const;
// Data members of the parser.
// nfd and nfc are references to const Normalizer2 objects, so they must be
// bound in the constructor initializer list.
178 const Normalizer2
&nfd
, &nfc
;
// rules points to a const UnicodeString.
// NOTE(review): presumably the rule string currently being parsed; confirm.
180 const UnicodeString
*rules
;
// baseData is a const pointer to const CollationData, so it can only be set
// at construction.
181 const CollationData
*const baseData
;
// settings and parseError are plain pointers.
// NOTE(review): presumably aliases of the outSettings and outParseError
// arguments of parse(); confirm in the implementation file.
182 CollationSettings
*settings
;
183 UParseError
*parseError
;
// errorReason is the value returned by getErrorReason().
184 const char *errorReason
;
194 #endif // !UCONFIG_NO_COLLATION
195 #endif // __COLLATIONRULEPARSER_H__