1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2013-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.h
10 * created on: 2013apr10
11 * created by: Markus W. Scherer
14 #ifndef __COLLATIONRULEPARSER_H__
15 #define __COLLATIONRULEPARSER_H__
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_COLLATION
21 #include "unicode/ucol.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
/** Forward declaration only; this header does not need the complete type. */
struct CollationTailoring;
/**
 * Forward declaration only. This header refers to CollationSettings
 * solely through references and pointers (see parse() and the settings member).
 */
struct CollationSettings;
37 class U_I18N_API CollationRuleParser
: public UMemory
{
39 /** Special reset positions. */
41 FIRST_TERTIARY_IGNORABLE
,
42 LAST_TERTIARY_IGNORABLE
,
43 FIRST_SECONDARY_IGNORABLE
,
44 LAST_SECONDARY_IGNORABLE
,
45 FIRST_PRIMARY_IGNORABLE
,
46 LAST_PRIMARY_IGNORABLE
,
58 * First character of contractions that encode special reset positions.
59 * U+FFFE cannot be tailored via rule syntax.
61 * The second contraction character is POS_BASE + Position.
63 static const UChar POS_LEAD
= 0xfffe;
65 * Base for the second character of contractions that encode special reset positions.
66 * Braille characters U+28xx are printable and normalization-inert.
69 static const UChar POS_BASE
= 0x2800;
71 class U_I18N_API Sink
: public UObject
{
76 * strength=UCOL_IDENTICAL for &str.
77 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
79 virtual void addReset(int32_t strength
, const UnicodeString
&str
,
80 const char *&errorReason
, UErrorCode
&errorCode
) = 0;
82 * Adds a relation with strength and prefix | str / extension.
84 virtual void addRelation(int32_t strength
, const UnicodeString
&prefix
,
85 const UnicodeString
&str
, const UnicodeString
&extension
,
86 const char *&errorReason
, UErrorCode
&errorCode
) = 0;
88 virtual void suppressContractions(const UnicodeSet
&set
, const char *&errorReason
,
89 UErrorCode
&errorCode
);
91 virtual void optimize(const UnicodeSet
&set
, const char *&errorReason
,
92 UErrorCode
&errorCode
);
95 class U_I18N_API Importer
: public UObject
{
98 virtual void getRules(
99 const char *localeID
, const char *collationType
,
100 UnicodeString
&rules
,
101 const char *&errorReason
, UErrorCode
&errorCode
) = 0;
106 * The Sink must be set before parsing.
107 * The Importer can be set, otherwise [import locale] syntax is not supported.
109 CollationRuleParser(const CollationData
*base
, UErrorCode
&errorCode
);
110 ~CollationRuleParser();
/**
 * Sets the pointer to a Sink object.
 * The pointer is aliased: Pointer copy without cloning or taking ownership.
 */
116 void setSink(Sink
*sinkAlias
) {
/**
 * Sets the pointer to an Importer object.
 * The pointer is aliased: Pointer copy without cloning or taking ownership.
 */
124 void setImporter(Importer
*importerAlias
) {
125 importer
= importerAlias
;
128 void parse(const UnicodeString
&ruleString
,
129 CollationSettings
&outSettings
,
130 UParseError
*outParseError
,
131 UErrorCode
&errorCode
);
133 const char *getErrorReason() const { return errorReason
; }
/**
 * Gets a script or reorder code from its string representation.
 *
 * @param word the string representation to look up
 * @return the script/reorder code, or
 *         -1 if not recognized
 */
static int32_t getReorderCode(const char *word);
/** Mask for the low four bits. UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
static const int32_t STRENGTH_MASK = 0xf;
/**
 * Single flag bit just above STRENGTH_MASK.
 * NOTE(review): presumably marks a starred relation operator; confirm at the use sites.
 */
static const int32_t STARRED_FLAG = 0x10;
/**
 * Shift distance of 8 bits.
 * NOTE(review): presumably positions an offset above the strength/flag bits; confirm at the use sites.
 */
static const int32_t OFFSET_SHIFT = 8;
148 void parse(const UnicodeString
&ruleString
, UErrorCode
&errorCode
);
149 void parseRuleChain(UErrorCode
&errorCode
);
150 int32_t parseResetAndPosition(UErrorCode
&errorCode
);
151 int32_t parseRelationOperator(UErrorCode
&errorCode
);
152 void parseRelationStrings(int32_t strength
, int32_t i
, UErrorCode
&errorCode
);
153 void parseStarredCharacters(int32_t strength
, int32_t i
, UErrorCode
&errorCode
);
154 int32_t parseTailoringString(int32_t i
, UnicodeString
&raw
, UErrorCode
&errorCode
);
155 int32_t parseString(int32_t i
, UnicodeString
&raw
, UErrorCode
&errorCode
);
158 * Sets str to a contraction of U+FFFE and (U+2800 + Position).
159 * @return rule index after the special reset position
161 int32_t parseSpecialPosition(int32_t i
, UnicodeString
&str
, UErrorCode
&errorCode
);
162 void parseSetting(UErrorCode
&errorCode
);
163 void parseReordering(const UnicodeString
&raw
, UErrorCode
&errorCode
);
164 static UColAttributeValue
getOnOffValue(const UnicodeString
&s
);
166 int32_t parseUnicodeSet(int32_t i
, UnicodeSet
&set
, UErrorCode
&errorCode
);
167 int32_t readWords(int32_t i
, UnicodeString
&raw
) const;
168 int32_t skipComment(int32_t i
) const;
170 void setParseError(const char *reason
, UErrorCode
&errorCode
);
/**
 * Declared here, defined outside this header.
 * NOTE(review): judging by the name, fills in error context
 * (presumably in the UParseError held by parseError); confirm.
 */
void setErrorContext();
174 * ASCII [:P:] and [:S:]:
175 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
177 static UBool
isSyntaxChar(UChar32 c
);
178 int32_t skipWhiteSpace(int32_t i
) const;
180 const Normalizer2
&nfd
, &nfc
;
182 const UnicodeString
*rules
;
183 const CollationData
*const baseData
;
184 CollationSettings
*settings
;
185 UParseError
*parseError
;
186 const char *errorReason
;
196 #endif // !UCONFIG_NO_COLLATION
197 #endif // __COLLATIONRULEPARSER_H__