2 ******************************************************************************* 
   4 *   Copyright (C) 2004-2005, International Business Machines 
   5 *   Corporation and others.  All Rights Reserved. 
   7 ******************************************************************************* 
   8 *   file name:  xmlparser.h 
  10 *   tab size:   8 (not used) 
  13 *   created on: 2004jul21 
  14 *   created by: Andy Heninger 
  16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools. 
  17 * Not suitable for production use. Not supported. 
  18 * Not conformant. Not efficient. 
  22 #ifndef __XMLPARSER_H__ 
  23 #define __XMLPARSER_H__ 
  25 #include "unicode/uobject.h" 
  26 #include "unicode/unistr.h" 
  27 #include "unicode/regex.h" 
  31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION 
  34     /** Node type string (text contents), stored as a UnicodeString. */ 
  35     UXML_NODE_TYPE_STRING
, 
  36     /** Node type element, stored as a UXMLElement. */ 
  37     UXML_NODE_TYPE_ELEMENT
, 
  46  * This class represents an element node in a parsed XML tree. 
  48 class U_TOOLUTIL_API UXMLElement 
: public UObject 
{ 
  53     virtual ~UXMLElement(); 
  56      * Get the tag name of this element. 
  58     const UnicodeString 
&getTagName() const; 
  60      * Get the text contents of the element. 
  61      * Append the contents of all text child nodes. 
  62      * @param recurse If TRUE, also recursively appends the contents of all 
  63      *        text child nodes of element children. 
  64      * @return The text contents. 
  66     UnicodeString 
getText(UBool recurse
) const; 
  68      * Get the number of attributes. 
  70     int32_t countAttributes() const; 
  72      * Get the i-th attribute. 
  73      * @param i Index of the attribute. 
  74      * @param name Output parameter, receives the attribute name. 
  75      * @param value Output parameter, receives the attribute value. 
  76      * @return A pointer to the attribute value (may be &value or a pointer to an 
  77      *         internal string object), or NULL if i is out of bounds. 
  79     const UnicodeString 
*getAttribute(int32_t i
, UnicodeString 
&name
, UnicodeString 
&value
) const; 
  81      * Get the value of the attribute with the given name. 
  82      * @param name Attribute name to be looked up. 
  83      * @return A pointer to the attribute value, or NULL if this element 
  84      * does not have this attribute. 
  86     const UnicodeString 
*getAttribute(const UnicodeString 
&name
) const; 
  88      * Get the number of child nodes. 
  90     int32_t countChildren() const; 
  92      * Get the i-th child node. 
  93      * @param i Index of the child node. 
  94      * @param type The child node type. 
  95      * @return A pointer to the child node object, or NULL if i is out of bounds. 
  97     const UObject 
*getChild(int32_t i
, UXMLNodeType 
&type
) const; 
  99      * Get the next child element node, skipping non-element child nodes. 
 100      * @param i Enumeration index; initialize to 0 before getting the first child element. 
 101      * @return A pointer to the next child element, or NULL if there is none. 
 103     const UXMLElement 
*nextChildElement(int32_t &i
) const; 
 105      * Get the immediate child element with the given name. 
 106      * If there are multiple child elements with this name, then return 
         // NOTE(review): the sentence above is cut off in this listing (original line 107 is
         //   missing); presumably it ends with: the first one. Confirm against the upstream header.
 108      * @param name Element name to be looked up. 
 109      * @return A pointer to the element node, or NULL if this element 
 110      * does not have this immediate child element. 
 112     const UXMLElement 
*getChildElement(const UnicodeString 
&name
) const; 
 115      * ICU "poor man's RTTI", returns a UClassID for the actual class. 
 117     virtual UClassID 
getDynamicClassID() const; 
 120      * ICU "poor man's RTTI", returns a UClassID for this class. 
 122     static UClassID U_EXPORT2 
getStaticClassID(); 
 125     // Declared to prevent default copy construction and copy assignment. 
 127     UXMLElement(const UXMLElement 
&other
); 
 128     UXMLElement 
&operator=(const UXMLElement 
&other
); 
         // Helper taking an output string and a recurse flag.
         // NOTE(review): presumably the worker behind getText() - confirm in the implementation.
 130     void appendText(UnicodeString 
&text
, UBool recurse
) const; 
         // Grants UXMLParser access to the non-public members of this class.
 132     friend class UXMLParser
; 
         // Constructor taking the parser, a pointer to the tag name, and an error code.
         // NOTE(review): the access level is not visible in this listing; presumably only
         //   UXMLParser creates elements - confirm.
 134     UXMLElement(const UXMLParser 
*parser
, const UnicodeString 
*name
, UErrorCode 
&errorCode
); 
         // NOTE(review): presumably the parser passed to the constructor above - confirm.
 136     const UXMLParser 
*fParser
; 
 137     const UnicodeString 
*fName
;          // The tag name of this element (owned by the UXMLParser) 
 138     UnicodeString       fContent
;        // The text content of this node.  All element content is  
 139                                          //   concatenated even when there are intervening nested elements 
 140                                          //   (which doesn't happen with most xml files we care about) 
 141                                          //   Sections of content containing only white space are dropped, 
 142                                          //   which gets rid of the bogus white space content from 
 143                                          //   elements which are primarily containers for nested elements. 
 144     UVector             fAttNames
;       // A vector containing the names of this element's attributes 
 145                                          //    The names are UnicodeString objects, owned by the UXMLParser. 
 146     UVector             fAttValues
;      // A vector containing the attribute values for 
 147                                          //    this element's attributes.  The order is the same 
 148                                          //    as that of the attribute name vector. 
 150     UVector             fChildren
;       // The child nodes of this element (a Vector) 
                                          //    NOTE(review): per getChild(), a child is reported with a
                                          //    UXMLNodeType, so entries are presumably either text
                                          //    (UnicodeString) or element (UXMLElement) nodes - confirm.
 152     UXMLElement        
*fParent
;         // A pointer to the parent element of this element. 
 156  * A simple XML parser; it is neither efficient nor conformant and only useful for 
 157  * restricted types of XML documents. 
 159  * The parse methods parse whole documents and return the parse trees via their 
 160  * root elements. 
 162 class U_TOOLUTIL_API UXMLParser 
: public UObject 
{ 
 165      * Create an XML parser. 
 167     static UXMLParser 
*createParser(UErrorCode 
&errorCode
); 
 171     virtual ~UXMLParser(); 
 174      * Parse an XML document, create the entire document tree, and 
 175      * return a pointer to the root element of the parsed tree. 
 176      * The caller must delete the element. 
 178     UXMLElement 
*parse(const UnicodeString 
&src
, UErrorCode 
&errorCode
); 
 180      * Parse an XML file, create the entire document tree, and 
 181      * return a pointer to the root element of the parsed tree. 
 182      * The caller must delete the element. 
 184     UXMLElement 
*parseFile(const char *filename
, UErrorCode 
&errorCode
); 
 187      * ICU "poor man's RTTI", returns a UClassID for the actual class. 
 189     virtual UClassID 
getDynamicClassID() const; 
 192      * ICU "poor man's RTTI", returns a UClassID for this class. 
 194     static UClassID U_EXPORT2 
getStaticClassID(); 
 197     // Declared to prevent default copy construction and copy assignment. 
 199     UXMLParser(const UXMLParser 
&other
); 
 200     UXMLParser 
&operator=(const UXMLParser 
&other
); 
         // Constructor taking only an error code.
         // NOTE(review): presumably reached through the static createParser() factory - confirm.
 203     UXMLParser(UErrorCode 
&status
); 
         // Internal parsing helpers. Each takes a UErrorCode reference as its last parameter.
         // NOTE(review): their access level is not visible in this listing, and their exact
         //   behavior is defined in the implementation file, not in this header.
 205     void           parseMisc(UErrorCode 
&status
); 
 206     UXMLElement   
*createElement(RegexMatcher 
&mEl
, UErrorCode 
&status
); 
 207     void           error(const char *message
, UErrorCode 
&status
); 
 208     UnicodeString  
scanContent(UErrorCode 
&status
); 
 209     void           replaceCharRefs(UnicodeString 
&s
, UErrorCode 
&status
); 
         // NOTE(review): intern() and findName() presumably store and look up names in
         //   fNames (documented below as the interned name strings) - confirm.
 211     const UnicodeString 
*intern(const UnicodeString 
&s
, UErrorCode 
&errorCode
); 
 213     // public for UXMLElement only 
 214     const UnicodeString 
*findName(const UnicodeString 
&s
) const; 
 217     // There is one ICU regex matcher for each of the major XML syntax items 
 218     //  that are recognized. 
 219     RegexMatcher mXMLDecl
; 
 220     RegexMatcher mXMLComment
; 
 222     RegexMatcher mXMLDoctype
; 
 224     RegexMatcher mXMLElemStart
; 
 225     RegexMatcher mXMLElemEnd
; 
 226     RegexMatcher mXMLElemEmpty
; 
 227     RegexMatcher mXMLCharData
; 
 228     RegexMatcher mAttrValue
; 
 229     RegexMatcher mAttrNormalizer
; 
 230     RegexMatcher mNewLineNormalizer
; 
 233     Hashtable             fNames
;           // interned element/attribute name strings 
 234     UStack                fElementStack
;    // Stack holds the parent elements when nested 
 235                                             //    elements are being parsed.  All items on this 
 236                                             //    stack are of type UXMLElement. 
 237     int32_t               fPos
;             // String index of the current scan position in 
 238                                             //    xml source (in fSrc). 
                                             //    NOTE(review): no fSrc member is declared in this
                                             //    listing; the reference above may be stale - confirm.
 239     UnicodeString         fOneLF
;                                           // NOTE(review): presumably a string holding a single
                                             //    line feed, used with mNewLineNormalizer - confirm.
 243 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */