]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ******************************************************************************* | |
5 | * | |
6 | * Copyright (C) 2004-2005, International Business Machines | |
7 | * Corporation and others. All Rights Reserved. | |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: xmlparser.h | |
11 | * encoding: UTF-8 | |
12 | * tab size: 8 (not used) | |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2004jul21 | |
16 | * created by: Andy Heninger | |
17 | * | |
18 | * Tiny XML parser using ICU and intended for use in ICU tests and in build tools. | |
19 | * Not suitable for production use. Not supported. | |
20 | * Not conformant. Not efficient. | |
21 | * But very small. | |
22 | */ | |
23 | ||
24 | #ifndef __XMLPARSER_H__ | |
25 | #define __XMLPARSER_H__ | |
26 | ||
27 | #include "unicode/uobject.h" | |
28 | #include "unicode/unistr.h" | |
29 | #include "unicode/regex.h" | |
30 | #include "uvector.h" | |
31 | #include "hash.h" | |
32 | ||
33 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION | |
34 | ||
35 | /** Kinds of child node; UXMLElement::getChild() takes a reference to one of these. */ enum UXMLNodeType { | |
36 | /** Node type string (text contents), stored as a UnicodeString. */ | |
37 | UXML_NODE_TYPE_STRING, | |
38 | /** Node type element, stored as a UXMLElement. */ | |
39 | UXML_NODE_TYPE_ELEMENT, | |
40 | /** Last enumerator; its value equals the number of node types listed above. */ UXML_NODE_TYPE_COUNT | |
41 | }; | |
42 | ||
43 | U_NAMESPACE_BEGIN | |
44 | ||
45 | class UXMLParser; | |
46 | ||
47 | /** | |
48 | * This class represents an element node in a parsed XML tree. | |
      * Instances cannot be created or copied by client code: every constructor and
      * the assignment operator are private, and UXMLParser is declared a friend.
49 | */ | |
50 | class U_TOOLUTIL_API UXMLElement : public UObject { | |
51 | public: | |
52 | /** | |
53 | * Destructor. | |
54 | */ | |
55 | virtual ~UXMLElement(); | |
56 | ||
57 | /** | |
58 | * Get the tag name of this element. | |
59 | */ | |
60 | const UnicodeString &getTagName() const; | |
61 | /** | |
62 | * Get the text contents of the element. | |
63 | * The result is built by appending the contents of all text child nodes. | |
64 | * @param recurse If TRUE, also recursively appends the contents of all | |
65 | * text child nodes of element children. | |
66 | * @return The text contents, returned by value. | |
67 | */ | |
68 | UnicodeString getText(UBool recurse) const; | |
69 | /** | |
70 | * Get the number of attributes. | |
71 | */ | |
72 | int32_t countAttributes() const; | |
73 | /** | |
74 | * Get the i-th attribute. | |
75 | * @param i Index of the attribute. | |
76 | * @param name Output parameter, receives the attribute name. | |
77 | * @param value Output parameter, receives the attribute value. | |
78 | * @return A pointer to the attribute value (may be &value or a pointer to an | |
79 | * internal string object), or NULL if i is out of bounds. | |
80 | */ | |
81 | const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; | |
82 | /** | |
83 | * Get the value of the attribute with the given name. | |
84 | * @param name Attribute name to be looked up. | |
85 | * @return A pointer to the attribute value, or NULL if this element | |
86 | * does not have this attribute. | |
87 | */ | |
88 | const UnicodeString *getAttribute(const UnicodeString &name) const; | |
89 | /** | |
90 | * Get the number of child nodes. | |
91 | */ | |
92 | int32_t countChildren() const; | |
93 | /** | |
94 | * Get the i-th child node. | |
95 | * @param i Index of the child node. | |
96 | * @param type Output parameter (non-const reference), receives the child node type. | |
97 | * @return A pointer to the child node object, or NULL if i is out of bounds. | |
98 | */ | |
99 | const UObject *getChild(int32_t i, UXMLNodeType &type) const; | |
100 | /** | |
101 | * Get the next child element node, skipping non-element child nodes. | |
102 | * @param i Enumeration index; initialize to 0 before getting the first child element. | |
       * It is passed by non-const reference; presumably each call advances it
       * past the returned element (confirm in xmlparser.cpp). | |
103 | * @return A pointer to the next child element, or NULL if there is none. | |
104 | */ | |
105 | const UXMLElement *nextChildElement(int32_t &i) const; | |
106 | /** | |
107 | * Get the immediate child element with the given name. | |
108 | * If there are multiple child elements with this name, then return | |
109 | * the first one. | |
110 | * @param name Element name to be looked up. | |
111 | * @return A pointer to the element node, or NULL if this element | |
112 | * does not have this immediate child element. | |
113 | */ | |
114 | const UXMLElement *getChildElement(const UnicodeString &name) const; | |
115 | ||
116 | /** | |
117 | * ICU "poor man's RTTI", returns a UClassID for the actual class. | |
118 | */ | |
119 | virtual UClassID getDynamicClassID() const; | |
120 | ||
121 | /** | |
122 | * ICU "poor man's RTTI", returns a UClassID for this class. | |
123 | */ | |
124 | static UClassID U_EXPORT2 getStaticClassID(); | |
125 | ||
126 | private: | |
127 | // Declared private to prevent default construction, copying and assignment by client code. | |
128 | UXMLElement(); | |
129 | UXMLElement(const UXMLElement &other); | |
130 | UXMLElement &operator=(const UXMLElement &other); | |
131 | ||
132 | void appendText(UnicodeString &text, UBool recurse) const; /* private helper writing into 'text'; presumably the worker behind getText() - confirm in xmlparser.cpp */ | |
133 | ||
134 | friend class UXMLParser; /* gives UXMLParser access to the private constructor and members below */ | |
135 | ||
136 | UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); /* private: callable only from this class and its friend UXMLParser */ | |
137 | ||
138 | const UXMLParser *fParser; /* presumably the parser passed to the constructor - confirm in xmlparser.cpp */ | |
139 | const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) | |
140 | UnicodeString fContent; // The text content of this node. All element content is | |
141 | // concatenated even when there are intervening nested elements | |
142 | // (which doesn't happen with most xml files we care about) | |
143 | // Sections of content containing only white space are dropped, | |
144 | // which gets rid of the bogus white space content from | |
145 | // elements which are primarily containers for nested elements. | |
146 | UVector fAttNames; // A vector containing the names of this element's attributes | |
147 | // The names are UnicodeString objects, owned by the UXMLParser. | |
148 | UVector fAttValues; // A vector containing the attribute values for | |
149 | // this element's attributes. The order is the same | |
150 | // as that of the attribute name vector. | |
151 | ||
152 | UVector fChildren; // The child nodes of this element (a Vector) | |
153 | ||
154 | UXMLElement *fParent; // A pointer to the parent element of this element. | |
155 | }; | |
156 | ||
157 | /** | |
158 | * A simple XML parser; it is neither efficient nor conformant and only useful for | |
159 | * restricted types of XML documents. | |
160 | * | |
161 | * The parse methods parse whole documents and return the parse trees via their | |
162 | * root elements. | |
       *
       * All constructors are private, so client code obtains an instance through
       * the static createParser() function.
163 | */ | |
164 | class U_TOOLUTIL_API UXMLParser : public UObject { | |
165 | public: | |
166 | /** | |
167 | * Create an XML parser. | |
       * @param errorCode ICU error code, passed by reference.
       * @return A pointer to the new parser.
168 | */ | |
169 | static UXMLParser *createParser(UErrorCode &errorCode); | |
170 | /** | |
171 | * Destructor. | |
172 | */ | |
173 | virtual ~UXMLParser(); | |
174 | ||
175 | /** | |
176 | * Parse an XML document, create the entire document tree, and | |
177 | * return a pointer to the root element of the parsed tree. | |
178 | * The caller must delete the element. | |
       * @param src The XML document text.
       * @param errorCode ICU error code, passed by reference.
179 | */ | |
180 | UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); | |
181 | /** | |
182 | * Parse an XML file, create the entire document tree, and | |
183 | * return a pointer to the root element of the parsed tree. | |
184 | * The caller must delete the element. | |
       * @param filename Name of the XML file to parse.
       * @param errorCode ICU error code, passed by reference.
185 | */ | |
186 | UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); | |
187 | ||
188 | /** | |
189 | * ICU "poor man's RTTI", returns a UClassID for the actual class. | |
190 | */ | |
191 | virtual UClassID getDynamicClassID() const; | |
192 | ||
193 | /** | |
194 | * ICU "poor man's RTTI", returns a UClassID for this class. | |
195 | */ | |
196 | static UClassID U_EXPORT2 getStaticClassID(); | |
197 | ||
198 | private: | |
199 | // Declared private to prevent default construction, copying and assignment by client code. | |
200 | UXMLParser(); | |
201 | UXMLParser(const UXMLParser &other); | |
202 | UXMLParser &operator=(const UXMLParser &other); | |
203 | ||
204 | // Constructor taking an error code; private, so outside code must go through createParser(). | |
205 | UXMLParser(UErrorCode &status); | |
206 | ||
207 | void parseMisc(UErrorCode &status); | |
208 | UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); | |
209 | void error(const char *message, UErrorCode &status); | |
210 | UnicodeString scanContent(UErrorCode &status); | |
211 | void replaceCharRefs(UnicodeString &s, UErrorCode &status); | |
212 | ||
213 | const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); | |
214 | public: | |
215 | // public for UXMLElement only (it is a const member, unlike the private intern() above) | |
216 | const UnicodeString *findName(const UnicodeString &s) const; | |
217 | private: | |
218 | ||
219 | // There is one ICU regex matcher for each of the major XML syntax items | |
220 | // that are recognized. | |
221 | RegexMatcher mXMLDecl; | |
222 | RegexMatcher mXMLComment; | |
223 | RegexMatcher mXMLSP; | |
224 | RegexMatcher mXMLDoctype; | |
225 | RegexMatcher mXMLPI; | |
226 | RegexMatcher mXMLElemStart; | |
227 | RegexMatcher mXMLElemEnd; | |
228 | RegexMatcher mXMLElemEmpty; | |
229 | RegexMatcher mXMLCharData; | |
230 | RegexMatcher mAttrValue; | |
231 | RegexMatcher mAttrNormalizer; | |
232 | RegexMatcher mNewLineNormalizer; | |
233 | RegexMatcher mAmps; | |
234 | ||
235 | Hashtable fNames; // interned element/attribute name strings | |
236 | UStack fElementStack; // Stack holds the parent elements when nested | |
237 | // elements are being parsed. All items on this | |
238 | // stack are of type UXMLElement. | |
239 | int32_t fPos; // String index of the current scan position in | |
240 | // the xml source. NOTE(review): this comment names "fSrc", but no fSrc member is declared in this class - confirm where the source text is held. | |
241 | UnicodeString fOneLF; /* presumably a string holding a single line feed, used when normalizing newlines - confirm in xmlparser.cpp */ | |
242 | }; | |
243 | ||
244 | U_NAMESPACE_END | |
245 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */ | |
246 | ||
247 | #endif |