]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/toolutil/ppucd.h
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / ppucd.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2011-2013, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ppucd.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2011dec11
14 * created by: Markus W. Scherer
15 */
16
17 #ifndef __PPUCD_H__
18 #define __PPUCD_H__
19
20 #include "unicode/utypes.h"
21 #include "unicode/uniset.h"
22 #include "unicode/unistr.h"
23
24 #include <stdio.h>
25
26 /** Additions to the uchar.h enum UProperty. */
27 enum {
28 /** Name_Alias */
29 PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
30 PPUCD_CONDITIONAL_CASE_MAPPINGS,
31 PPUCD_TURKIC_CASE_FOLDING
32 };
33
34 U_NAMESPACE_BEGIN
35
36 class U_TOOLUTIL_API PropertyNames {
37 public:
38 virtual ~PropertyNames();
39 virtual int32_t getPropertyEnum(const char *name) const;
40 virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
41 };
42
43 struct U_TOOLUTIL_API UniProps {
44 UniProps();
45 ~UniProps();
46
47 int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
48
49 UChar32 start, end;
50 UBool binProps[UCHAR_BINARY_LIMIT];
51 int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
52 UVersionInfo age;
53 UChar32 bmg, bpb;
54 UChar32 scf, slc, stc, suc;
55 int32_t digitValue;
56 const char *numericValue;
57 const char *name;
58 const char *nameAlias;
59 UnicodeString cf, lc, tc, uc;
60 UnicodeSet scx;
61 };
62
63 class U_TOOLUTIL_API PreparsedUCD {
64 public:
65 enum LineType {
66 /** No line, end of file. */
67 NO_LINE,
68 /** Empty line. (Might contain a comment.) */
69 EMPTY_LINE,
70
71 /** ucd;6.1.0 */
72 UNICODE_VERSION_LINE,
73
74 /** property;Binary;Alpha;Alphabetic */
75 PROPERTY_LINE,
76 /** binary;N;No;F;False */
77 BINARY_LINE,
78 /** value;gc;Zs;Space_Separator */
79 VALUE_LINE,
80
81 /** defaults;0000..10FFFF;age=NA;bc=L;... */
82 DEFAULTS_LINE,
83 /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
84 BLOCK_LINE,
85 /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
86 CP_LINE,
87 /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */
88 UNASSIGNED_LINE,
89
90 /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
91 ALG_NAMES_RANGE_LINE,
92
93 LINE_TYPE_COUNT
94 };
95
96 /**
97 * Constructor.
98 * Prepare this object for a new, empty package.
99 */
100 PreparsedUCD(const char *filename, UErrorCode &errorCode);
101
102 /** Destructor. */
103 ~PreparsedUCD();
104
105 /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
106 void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
107
108 /**
109 * Reads a line from the preparsed UCD file.
110 * Splits the line by replacing each ';' with a NUL.
111 */
112 LineType readLine(UErrorCode &errorCode);
113
114 /** Returns the number of the line read by readLine(). */
115 int32_t getLineNumber() const { return lineNumber; }
116
117 /** Returns the line's next field, or NULL. */
118 const char *nextField();
119
120 /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
121 const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
122
123 /** Returns TRUE if the current line has property values. */
124 UBool lineHasPropertyValues() const {
125 return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE;
126 }
127
128 /**
129 * Parses properties from the current line.
130 * Clears newValues and sets UProperty codes for property values mentioned
131 * on the current line (as opposed to being inherited).
132 * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
133 * The returned UniProps are usable until the next line of the same type is read.
134 */
135 const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
136
137 /**
138 * Returns the code point range for the current algnamesrange line.
139 * Calls & parses nextField().
140 * Further nextField() calls will yield the range's type & prefix string.
141 * Returns U_SUCCESS(errorCode).
142 */
143 UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
144
145 private:
146 UBool isLineBufferAvailable(int32_t i) {
147 return defaultLineIndex!=i && blockLineIndex!=i;
148 }
149
150 /** Resets the field iterator and returns the line's first field (the line type field). */
151 const char *firstField();
152
153 UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
154 UErrorCode &errorCode);
155 UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
156 UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
157 void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
158 void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
159
160 static const int32_t kNumLineBuffers=3;
161
162 PropertyNames *icuPnames; // owned
163 const PropertyNames *pnames; // aliased
164 FILE *file;
165 int32_t defaultLineIndex, blockLineIndex, lineIndex;
166 int32_t lineNumber;
167 LineType lineType;
168 char *fieldLimit;
169 char *lineLimit;
170
171 UVersionInfo ucdVersion;
172 UniProps defaultProps, blockProps, cpProps;
173 UnicodeSet blockValues;
174 // Multiple lines so that default and block properties can maintain pointers
175 // into their line buffers.
176 char lines[kNumLineBuffers][4096];
177 };
178
179 U_NAMESPACE_END
180
181 #endif // __PPUCD_H__