]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/toolutil/xmlparser.h
ICU-66108.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / xmlparser.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2004-2005, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: xmlparser.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2004jul21
16 * created by: Andy Heninger
17 *
18 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
19 * Not suitable for production use. Not supported.
20 * Not conformant. Not efficient.
21 * But very small.
22 */
23
24 #ifndef __XMLPARSER_H__
25 #define __XMLPARSER_H__
26
27 #include "unicode/uobject.h"
28 #include "unicode/unistr.h"
29 #include "unicode/regex.h"
30 #include "uvector.h"
31 #include "hash.h"
32
33 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
34
35 enum UXMLNodeType {
36 /** Node type string (text contents), stored as a UnicodeString. */
37 UXML_NODE_TYPE_STRING,
38 /** Node type element, stored as a UXMLElement. */
39 UXML_NODE_TYPE_ELEMENT,
40 UXML_NODE_TYPE_COUNT // Not a node type itself: the number of node types listed above.
41 };
42
43 U_NAMESPACE_BEGIN
44
45 class UXMLParser;
46
47 /**
48 * This class represents an element node in a parsed XML tree.
49 */
50 class U_TOOLUTIL_API UXMLElement : public UObject {
51 public:
52 /**
53 * Destructor.
54 */
55 virtual ~UXMLElement();
56
57 /**
58 * Get the tag name of this element.
59 */
60 const UnicodeString &getTagName() const;
61 /**
62 * Get the text contents of the element.
63 * Append the contents of all text child nodes.
64 * @param recurse If TRUE, also recursively appends the contents of all
65 * text child nodes of element children.
66 * @return The text contents.
67 */
68 UnicodeString getText(UBool recurse) const;
69 /**
70 * Get the number of attributes.
71 */
72 int32_t countAttributes() const;
73 /**
74 * Get the i-th attribute.
75 * @param i Index of the attribute.
76 * @param name Output parameter, receives the attribute name.
77 * @param value Output parameter, receives the attribute value.
78 * @return A pointer to the attribute value (may be &value or a pointer to an
79 * internal string object), or NULL if i is out of bounds.
80 */
81 const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
82 /**
83 * Get the value of the attribute with the given name.
84 * @param name Attribute name to be looked up.
85 * @return A pointer to the attribute value, or NULL if this element
86 * does not have this attribute.
87 */
88 const UnicodeString *getAttribute(const UnicodeString &name) const;
89 /**
90 * Get the number of child nodes.
91 */
92 int32_t countChildren() const;
93 /**
94 * Get the i-th child node.
95 * @param i Index of the child node.
96 * @param type Output parameter, receives the child node type.
97 * @return A pointer to the child node object, or NULL if i is out of bounds.
98 */
99 const UObject *getChild(int32_t i, UXMLNodeType &type) const;
100 /**
101 * Get the next child element node, skipping non-element child nodes.
102 * @param i Enumeration index; initialize to 0 before getting the first child element.
103 * @return A pointer to the next child element, or NULL if there is none.
104 */
105 const UXMLElement *nextChildElement(int32_t &i) const;
106 /**
107 * Get the immediate child element with the given name.
108 * If there are multiple child elements with this name, then return
109 * the first one.
110 * @param name Element name to be looked up.
111 * @return A pointer to the element node, or NULL if this element
112 * does not have this immediate child element.
113 */
114 const UXMLElement *getChildElement(const UnicodeString &name) const;
115
116 /**
117 * ICU "poor man's RTTI", returns a UClassID for the actual class.
118 */
119 virtual UClassID getDynamicClassID() const;
120
121 /**
122 * ICU "poor man's RTTI", returns a UClassID for this class.
123 */
124 static UClassID U_EXPORT2 getStaticClassID();
125
126 private:
127 // Declared private to prevent default construction, copy construction and assignment by outside code.
128 UXMLElement();
129 UXMLElement(const UXMLElement &other);
130 UXMLElement &operator=(const UXMLElement &other);
131
132 void appendText(UnicodeString &text, UBool recurse) const; // NOTE(review): presumably the worker behind getText() that appends into text - confirm in the implementation
133
134 friend class UXMLParser; // grants UXMLParser access to the private constructor and fields below
135
136 UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); // private: reachable only from this class and its friend UXMLParser
137
138 const UXMLParser *fParser;
139 const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
140 UnicodeString fContent; // The text content of this node. All element content is
141 // concatenated even when there are intervening nested elements
142 // (which doesn't happen with most xml files we care about)
143 // Sections of content containing only white space are dropped,
144 // which gets rid of the bogus white space content from
145 // elements which are primarily containers for nested elements.
146 UVector fAttNames; // A vector containing the names of this element's attributes
147 // The names are UnicodeString objects, owned by the UXMLParser.
148 UVector fAttValues; // A vector containing the attribute values for
149 // this element's attributes. The order is the same
150 // as that of the attribute name vector.
151
152 UVector fChildren; // The child nodes of this element (a UVector); see getChild() for the node types
153
154 UXMLElement *fParent; // A pointer to the parent element of this element.
155 };
156
157 /**
158 * A simple XML parser; it is neither efficient nor conformant and only useful for
159 * restricted types of XML documents.
160 *
161 * The parse methods parse whole documents and return the parse trees via their
162 * root elements.
163 */
164 class U_TOOLUTIL_API UXMLParser : public UObject {
165 public:
166 /**
167 * Create an XML parser.
168 */
169 static UXMLParser *createParser(UErrorCode &errorCode);
170 /**
171 * Destructor.
172 */
173 virtual ~UXMLParser();
174
175 /**
176 * Parse an XML document, create the entire document tree, and
177 * return a pointer to the root element of the parsed tree.
178 * The caller must delete the element.
179 */
180 UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
181 /**
182 * Parse an XML file, create the entire document tree, and
183 * return a pointer to the root element of the parsed tree.
184 * The caller must delete the element.
185 */
186 UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
187
188 /**
189 * ICU "poor man's RTTI", returns a UClassID for the actual class.
190 */
191 virtual UClassID getDynamicClassID() const;
192
193 /**
194 * ICU "poor man's RTTI", returns a UClassID for this class.
195 */
196 static UClassID U_EXPORT2 getStaticClassID();
197
198 private:
199 // Declared private to prevent default construction, copy construction and assignment by outside code.
200 UXMLParser();
201 UXMLParser(const UXMLParser &other);
202 UXMLParser &operator=(const UXMLParser &other);
203
204 // Private constructor taking an error code; NOTE(review): presumably the one used by createParser() - confirm in the implementation.
205 UXMLParser(UErrorCode &status);
206
207 void parseMisc(UErrorCode &status);
208 UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status);
209 void error(const char *message, UErrorCode &status);
210 UnicodeString scanContent(UErrorCode &status);
211 void replaceCharRefs(UnicodeString &s, UErrorCode &status);
212
213 const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
214 public:
215 // Public only so that UXMLElement can call it; not intended as general API.
216 const UnicodeString *findName(const UnicodeString &s) const;
217 private:
218
219 // There is one ICU regex matcher for each of the major XML syntax items
220 // that are recognized.
221 RegexMatcher mXMLDecl;
222 RegexMatcher mXMLComment;
223 RegexMatcher mXMLSP;
224 RegexMatcher mXMLDoctype;
225 RegexMatcher mXMLPI;
226 RegexMatcher mXMLElemStart;
227 RegexMatcher mXMLElemEnd;
228 RegexMatcher mXMLElemEmpty;
229 RegexMatcher mXMLCharData;
230 RegexMatcher mAttrValue;
231 RegexMatcher mAttrNormalizer;
232 RegexMatcher mNewLineNormalizer;
233 RegexMatcher mAmps;
234
235 Hashtable fNames; // interned element/attribute name strings
236 UStack fElementStack; // Stack holds the parent elements when nested
237 // elements are being parsed. All items on this
238 // stack are of type UXMLElement.
239 int32_t fPos; // String index of the current scan position in
240 // the xml source. NOTE(review): the original comment referred to "fSrc", but no such member is declared in this class.
241 UnicodeString fOneLF; // NOTE(review): presumably a string holding a single LF for newline normalization - confirm in the implementation
242 };
243
244 U_NAMESPACE_END
245 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
246
247 #endif