]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/toolutil/ppucd.h
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / ppucd.h
1 /*
2 *******************************************************************************
3 * Copyright (C) 2011-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ppucd.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2011dec11
12 * created by: Markus W. Scherer
13 */
14
15 #ifndef __PPUCD_H__
16 #define __PPUCD_H__
17
18 #include "unicode/utypes.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21
22 #include <stdio.h>
23
24 /** Additions to the uchar.h enum UProperty. */
25 enum {
26 /** Name_Alias */
27 PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
28 PPUCD_CONDITIONAL_CASE_MAPPINGS,
29 PPUCD_TURKIC_CASE_FOLDING
30 };
31
32 U_NAMESPACE_BEGIN
33
34 class U_TOOLUTIL_API PropertyNames {
35 public:
36 virtual ~PropertyNames();
37 virtual int32_t getPropertyEnum(const char *name) const;
38 virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
39 };
40
41 struct U_TOOLUTIL_API UniProps {
42 UniProps();
43 ~UniProps();
44
45 int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
46
47 UChar32 start, end;
48 UBool binProps[UCHAR_BINARY_LIMIT];
49 int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
50 UVersionInfo age;
51 UChar32 bmg, bpb;
52 UChar32 scf, slc, stc, suc;
53 int32_t digitValue;
54 const char *numericValue;
55 const char *name;
56 const char *nameAlias;
57 UnicodeString cf, lc, tc, uc;
58 UnicodeSet scx;
59 };
60
61 class U_TOOLUTIL_API PreparsedUCD {
62 public:
63 enum LineType {
64 /** No line, end of file. */
65 NO_LINE,
66 /** Empty line. (Might contain a comment.) */
67 EMPTY_LINE,
68
69 /** ucd;6.1.0 */
70 UNICODE_VERSION_LINE,
71
72 /** property;Binary;Alpha;Alphabetic */
73 PROPERTY_LINE,
74 /** binary;N;No;F;False */
75 BINARY_LINE,
76 /** value;gc;Zs;Space_Separator */
77 VALUE_LINE,
78
79 /** defaults;0000..10FFFF;age=NA;bc=L;... */
80 DEFAULTS_LINE,
81 /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
82 BLOCK_LINE,
83 /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
84 CP_LINE,
85
86 /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
87 ALG_NAMES_RANGE_LINE,
88
89 LINE_TYPE_COUNT
90 };
91
92 /**
93 * Constructor.
94 * Prepare this object for a new, empty package.
95 */
96 PreparsedUCD(const char *filename, UErrorCode &errorCode);
97
98 /** Destructor. */
99 ~PreparsedUCD();
100
101 /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
102 void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
103
104 /**
105 * Reads a line from the preparsed UCD file.
106 * Splits the line by replacing each ';' with a NUL.
107 */
108 LineType readLine(UErrorCode &errorCode);
109
110 /** Returns the number of the line read by readLine(). */
111 int32_t getLineNumber() const { return lineNumber; }
112
113 /** Returns the line's next field, or NULL. */
114 const char *nextField();
115
116 /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
117 const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
118
119 /** Returns TRUE if the current line has property values. */
120 UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; }
121
122 /**
123 * Parses properties from the current line.
124 * Clears newValues and sets UProperty codes for property values mentioned
125 * on the current line (as opposed to being inherited).
126 * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
127 * The returned UniProps are usable until the next line of the same type is read.
128 */
129 const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
130
131 /**
132 * Returns the code point range for the current algnamesrange line.
133 * Calls & parses nextField().
134 * Further nextField() calls will yield the range's type & prefix string.
135 * Returns U_SUCCESS(errorCode).
136 */
137 UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
138
139 private:
140 UBool isLineBufferAvailable(int32_t i) {
141 return defaultLineIndex!=i && blockLineIndex!=i;
142 }
143
144 /** Resets the field iterator and returns the line's first field (the line type field). */
145 const char *firstField();
146
147 UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
148 UErrorCode &errorCode);
149 UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
150 UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
151 void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
152 void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
153
154 static const int32_t kNumLineBuffers=3;
155
156 PropertyNames *icuPnames; // owned
157 const PropertyNames *pnames; // aliased
158 FILE *file;
159 int32_t defaultLineIndex, blockLineIndex, lineIndex;
160 int32_t lineNumber;
161 LineType lineType;
162 char *fieldLimit;
163 char *lineLimit;
164
165 UVersionInfo ucdVersion;
166 UniProps defaultProps, blockProps, cpProps;
167 // Multiple lines so that default and block properties can maintain pointers
168 // into their line buffers.
169 char lines[kNumLineBuffers][4096];
170 };
171
172 U_NAMESPACE_END
173
174 #endif // __PPUCD_H__