]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/toolutil/xmlparser.h
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / xmlparser.h
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: xmlparser.h
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004jul21
14 * created by: Andy Heninger
15 *
16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
17 * Not suitable for production use. Not supported.
18 * Not conformant. Not efficient.
19 * But very small.
20 */
21
22 #ifndef __XMLPARSER_H__
23 #define __XMLPARSER_H__
24
25 #include "unicode/uobject.h"
26 #include "unicode/unistr.h"
27 #include "unicode/regex.h"
28 #include "uvector.h"
29 #include "hash.h"
30
31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
32
33 enum UXMLNodeType { // Kind of a child node, as reported through UXMLElement::getChild().
34 /** Node type string (text contents), stored as a UnicodeString. */
35 UXML_NODE_TYPE_STRING,
36 /** Node type element, stored as a UXMLElement. */
37 UXML_NODE_TYPE_ELEMENT,
38 UXML_NODE_TYPE_COUNT // Not a node type itself: its value equals the number of node types listed above.
39 };
40
41 U_NAMESPACE_BEGIN
42
43 class UXMLParser;
44
45 /**
46 * This class represents an element node in a parsed XML tree. All constructors are private; only the friend class UXMLParser can create instances.
47 */
48 class U_TOOLUTIL_API UXMLElement : public UObject {
49 public:
50 /**
51 * Destructor.
52 */
53 virtual ~UXMLElement();
54
55 /**
56 * Get the tag name of this element.
57 */
58 const UnicodeString &getTagName() const;
59 /**
60 * Get the text contents of the element.
61 * Append the contents of all text child nodes.
62 * @param recurse If TRUE, also recursively appends the contents of all
63 * text child nodes of element children.
64 * @return The text contents, returned by value as a new UnicodeString.
65 */
66 UnicodeString getText(UBool recurse) const;
67 /**
68 * Get the number of attributes.
69 */
70 int32_t countAttributes() const;
71 /**
72 * Get the i-th attribute.
73 * @param i Index of the attribute.
74 * @param name Output parameter, receives the attribute name.
75 * @param value Output parameter, receives the attribute value.
76 * @return A pointer to the attribute value (may be &value or a pointer to an
77 * internal string object), or NULL if i is out of bounds.
78 */
79 const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
80 /**
81 * Get the value of the attribute with the given name.
82 * @param name Attribute name to be looked up.
83 * @return A pointer to the attribute value, or NULL if this element
84 * does not have this attribute.
85 */
86 const UnicodeString *getAttribute(const UnicodeString &name) const;
87 /**
88 * Get the number of child nodes.
89 */
90 int32_t countChildren() const;
91 /**
92 * Get the i-th child node.
93 * @param i Index of the child node.
94 * @param type Output parameter (non-const reference) for the child node type; see UXMLNodeType.
95 * @return A pointer to the child node object, or NULL if i is out of bounds.
96 */
97 const UObject *getChild(int32_t i, UXMLNodeType &type) const;
98 /**
99 * Get the next child element node, skipping non-element child nodes.
100 * @param i Enumeration index (in/out reference); initialize to 0 before getting the first child element.
101 * @return A pointer to the next child element, or NULL if there is none.
102 */
103 const UXMLElement *nextChildElement(int32_t &i) const;
104 /**
105 * Get the immediate child element with the given name.
106 * If there are multiple child elements with this name, then return
107 * the first one.
108 * @param name Element name to be looked up.
109 * @return A pointer to the element node, or NULL if this element
110 * does not have this immediate child element.
111 */
112 const UXMLElement *getChildElement(const UnicodeString &name) const;
113
114 /**
115 * ICU "poor man's RTTI", returns a UClassID for the actual class.
116 */
117 virtual UClassID getDynamicClassID() const;
118
119 /**
120 * ICU "poor man's RTTI", returns a UClassID for this class.
121 */
122 static UClassID U_EXPORT2 getStaticClassID();
123
124 private:
125 // Declared private to prevent default construction, copying and assignment by outside code.
126 UXMLElement();
127 UXMLElement(const UXMLElement &other);
128 UXMLElement &operator=(const UXMLElement &other);
129
130 void appendText(UnicodeString &text, UBool recurse) const; // presumably the worker behind getText(); confirm in xmlparser.cpp
131
132 friend class UXMLParser;
133
134 UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); // private: reachable only from the friend UXMLParser
135
136 const UXMLParser *fParser; // presumably the parser handed to the constructor above; confirm in xmlparser.cpp
137 const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
138 UnicodeString fContent; // The text content of this node. All element content is
139 // concatenated even when there are intervening nested elements
140 // (which doesn't happen with most xml files we care about)
141 // Sections of content containing only white space are dropped,
142 // which gets rid of the bogus white space content from
143 // elements which are primarily containers for nested elements.
144 UVector fAttNames; // A vector containing the names of this element's attributes
145 // The names are UnicodeString objects, owned by the UXMLParser.
146 UVector fAttValues; // A vector containing the attribute values for
147 // this element's attributes. The order is the same
148 // as that of the attribute name vector.
149
150 UVector fChildren; // The child nodes of this element (a Vector); see UXMLNodeType for the stored object types
151
152 UXMLElement *fParent; // A pointer to the parent element of this element.
153 };
154
155 /**
156 * A simple XML parser; it is neither efficient nor conformant and only useful for
157 * restricted types of XML documents.
158 *
159 * The parse methods parse whole documents and return the parse trees via their
160 * root elements. Instances are obtained through createParser(); the constructors are private.
161 */
162 class U_TOOLUTIL_API UXMLParser : public UObject {
163 public:
164 /**
165 * Create an XML parser. Takes an ICU UErrorCode in/out argument and returns a pointer to the parser.
166 */
167 static UXMLParser *createParser(UErrorCode &errorCode);
168 /**
169 * Destructor.
170 */
171 virtual ~UXMLParser();
172
173 /**
174 * Parse an XML document given as a UnicodeString, create the entire document tree, and
175 * return a pointer to the root element of the parsed tree.
176 * The caller must delete the element.
177 */
178 UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
179 /**
180 * Parse an XML file given by its file name, create the entire document tree, and
181 * return a pointer to the root element of the parsed tree.
182 * The caller must delete the element.
183 */
184 UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
185
186 /**
187 * ICU "poor man's RTTI", returns a UClassID for the actual class.
188 */
189 virtual UClassID getDynamicClassID() const;
190
191 /**
192 * ICU "poor man's RTTI", returns a UClassID for this class.
193 */
194 static UClassID U_EXPORT2 getStaticClassID();
195
196 private:
197 // Declared private to prevent default construction, copying and assignment by outside code.
198 UXMLParser();
199 UXMLParser(const UXMLParser &other);
200 UXMLParser &operator=(const UXMLParser &other);
201
202 // The constructor actually used; private, so presumably reached via createParser() (confirm in xmlparser.cpp).
203 UXMLParser(UErrorCode &status);
204
205 void parseMisc(UErrorCode &status); // NOTE(review): undocumented helper; presumably skips comments/PIs/white space -- confirm
206 UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); // mEl: presumably a matcher positioned on an element tag -- confirm
207 void error(const char *message, UErrorCode &status); // NOTE(review): presumably reports message and sets status -- confirm
208 UnicodeString scanContent(UErrorCode &status); // NOTE(review): presumably scans character data at fPos -- confirm
209 void replaceCharRefs(UnicodeString &s, UErrorCode &status); // s is a non-const reference, presumably rewritten in place -- confirm
210
211 const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); // presumably returns the shared copy of s kept in fNames -- confirm
212 public:
213 // public for UXMLElement only (UXMLElement holds only a const UXMLParser pointer and is not a friend of this class)
214 const UnicodeString *findName(const UnicodeString &s) const;
215 private:
216
217 // There is one ICU regex matcher for each of the major XML syntax items
218 // that are recognized.
219 RegexMatcher mXMLDecl;
220 RegexMatcher mXMLComment;
221 RegexMatcher mXMLSP;
222 RegexMatcher mXMLDoctype;
223 RegexMatcher mXMLPI;
224 RegexMatcher mXMLElemStart;
225 RegexMatcher mXMLElemEnd;
226 RegexMatcher mXMLElemEmpty;
227 RegexMatcher mXMLCharData;
228 RegexMatcher mAttrValue;
229 RegexMatcher mAttrNormalizer;
230 RegexMatcher mNewLineNormalizer;
231 RegexMatcher mAmps;
232
233 Hashtable fNames; // interned element/attribute name strings
234 UStack fElementStack; // Stack holds the parent elements when nested
235 // elements are being parsed. All items on this
236 // stack are of type UXMLElement.
237 int32_t fPos; // String index of the current scan position in the xml source.
238 // NOTE(review): this comment used to say "(in fSrc)", but no fSrc member is declared in this class.
239 UnicodeString fOneLF; // NOTE(review): presumably a string holding a single line feed -- confirm in xmlparser.cpp
240 };
241
242 U_NAMESPACE_END
243 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
244
245 #endif