/////////////////////////////////////////////////////////////////////////////
// Name:        htmlparser.h
// Purpose:     Simple HTML parser
// Author:      Julian Smart
// Copyright:   (c) Julian Smart
// Licence:     wxWindows license
/////////////////////////////////////////////////////////////////////////////
12 #ifndef _HTMLPARSER_H_
13 #define _HTMLPARSER_H_
15 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
16 #pragma interface "htmlparser.cpp"
19 //#include "wx/module.h"
20 #include "wx/stream.h"
/*
 * wxSimpleHtmlAttribute
 * Representation of an attribute
 */
27 class wxSimpleHtmlAttribute
29 friend class wxSimpleHtmlTag
;
31 wxSimpleHtmlAttribute(const wxString
& name
, const wxString
& value
)
33 m_name
= name
; m_value
= value
; m_next
= NULL
;
37 // Write this attribute
38 void Write(wxOutputStream
& stream
);
41 const wxString
& GetName() const { return m_name
; }
42 const wxString
& GetValue() const { return m_value
; }
43 void SetName(const wxString
& name
) { m_name
= name
; }
44 void SetValue(const wxString
& value
) { m_value
= value
; }
46 wxSimpleHtmlAttribute
* GetNextAttribute() { return m_next
; }
47 void SetNextAttribute(wxSimpleHtmlAttribute
* attr
) { m_next
= attr
; }
49 bool HasName(const wxString
& name
) const { return (0 == m_name
.CmpNoCase(name
)); }
50 bool HasValue(const wxString
& val
) const { return (0 == m_value
.CmpNoCase(val
)); }
55 wxSimpleHtmlAttribute
* m_next
;
/*
 * wxSimpleHtmlTag
 * Representation of a tag or chunk of text
 */
// Node kinds, passed around as plain ints (see the int "type" parameters and
// accessors of wxSimpleHtmlTag). The numeric values are spelled out but are
// the same ones the compiler would assign implicitly.
enum
{
    wxSimpleHtmlTag_Text           = 0, // chunk of text
    wxSimpleHtmlTag_TopLevel       = 1,
    wxSimpleHtmlTag_Open           = 2,
    wxSimpleHtmlTag_Close          = 3,
    wxSimpleHtmlTag_Directive      = 4, // e.g. <!DOCTYPE ....>
    wxSimpleHtmlTag_XMLDeclaration = 5  // e.g. <?xml .... ?>
};
69 wxSimpleHtmlTag(const wxString
& tagName
, int tagType
);
73 void ClearAttributes();
74 wxSimpleHtmlAttribute
* FindAttribute(const wxString
& name
) const ;
75 void AppendAttribute(const wxString
& name
, const wxString
& value
);
77 // Remove 1 tag from the child list.
78 void RemoveChild(wxSimpleHtmlTag
*remove
);
79 // Appaned tag to the end of the child list.
80 void AppendTag(wxSimpleHtmlTag
* tag
);
81 // Insert tag after ourself in the parents child list.
82 void AppendTagAfterUs(wxSimpleHtmlTag
* tag
);
84 void Write(wxOutputStream
& stream
);
86 // Gets the text from this tag and its descendants
87 wxString
GetTagText();
90 const wxString
& GetName() const { return m_name
; }
91 void SetName(const wxString
& name
) { m_name
= name
; }
93 int GetType() const { return m_type
; }
94 void SetType(int t
) { m_type
= t
; }
96 // If type is wxSimpleHtmlTag_Text, m_text will contain some text.
97 const wxString
& GetText() const { return m_text
; }
98 void SetText(const wxString
& text
) { m_text
= text
; }
100 wxSimpleHtmlAttribute
* GetFirstAttribute() { return m_attributes
; }
101 void SetFirstAttribute(wxSimpleHtmlAttribute
* attr
) { m_attributes
= attr
; }
103 int GetAttributeCount() const ;
104 wxSimpleHtmlAttribute
* GetAttribute(int i
) const ;
106 wxSimpleHtmlTag
* GetChildren() const { return m_children
; }
107 void SetChildren(wxSimpleHtmlTag
* children
) { m_children
= children
; }
109 wxSimpleHtmlTag
* GetParent() const { return m_parent
; }
110 void SetParent(wxSimpleHtmlTag
* parent
) { m_parent
= parent
; }
111 int GetChildCount() const;
112 wxSimpleHtmlTag
* GetChild(int i
) const;
113 wxSimpleHtmlTag
* GetNext() const { return m_next
; }
115 //// Convenience accessors & search functions
116 bool NameIs(const wxString
& name
) { return (m_name
.CmpNoCase(name
) == 0); }
117 bool HasAttribute(const wxString
& name
, const wxString
& value
) const;
118 bool HasAttribute(const wxString
& name
) const;
119 bool GetAttributeValue(wxString
& value
, const wxString
& attrName
);
121 // Search forward from this tag until we find a tag with this name & optionally attribute
122 wxSimpleHtmlTag
* FindTag(const wxString
& tagName
, const wxString
& attrName
= wxEmptyString
);
124 // Gather the text until we hit the given close tag
125 bool FindTextUntilTagClose(wxString
& text
, const wxString
& tagName
);
131 wxSimpleHtmlAttribute
* m_attributes
;
134 wxSimpleHtmlTag
* m_children
;
135 wxSimpleHtmlTag
* m_next
; // Next sibling
136 wxSimpleHtmlTag
* m_parent
;
/*
 * wxSimpleHtmlParser
 * Simple HTML parser, for such tasks as scanning HTML for keywords, contents, etc.
 */
144 class wxSimpleHtmlParser
: public wxObject
148 wxSimpleHtmlParser();
149 ~wxSimpleHtmlParser();
152 bool ParseFile(const wxString
& filename
);
153 bool ParseString(const wxString
& str
);
156 void Write(wxOutputStream
& stream
);
157 bool WriteFile(wxString
& filename
);
161 // Main recursive parsing function
162 bool ParseHtml(wxSimpleHtmlTag
* parent
);
164 wxSimpleHtmlTag
* ParseTagHeader();
165 wxSimpleHtmlTag
* ParseTagClose();
166 bool ParseAttributes(wxSimpleHtmlTag
* tag
);
167 wxSimpleHtmlTag
* ParseDirective(); // e.g. <!DOCTYPE ....>
168 wxSimpleHtmlTag
* ParseXMLDeclaration(); // e.g. <?xml .... ?>
169 bool ParseComment(); // Throw away comments
170 // Plain text, up until an angled bracket
171 bool ParseText(wxString
& text
);
173 bool EatWhitespace(); // Throw away whitespace
174 bool EatWhitespace(int& pos
); // Throw away whitespace: using 'pos'
175 bool ReadString(wxString
& str
, bool eatIt
= FALSE
);
176 bool ReadWord(wxString
& str
, bool eatIt
= FALSE
);
177 bool ReadNumber(wxString
& str
, bool eatIt
= FALSE
);
178 // Could be number, string, whatever, but read up until whitespace.
179 bool ReadLiteral(wxString
& str
, bool eatIt
= FALSE
);
// Predicate taking no argument (defined out of line).
bool IsXMLDeclaration();

// Predicates on a single character code (defined out of line).
bool IsTagStartBracket(int ch);
bool IsTagEndBracket(int ch);
bool IsWhitespace(int ch);
bool IsAlpha(int ch);
bool IsWordChar(int ch);
bool IsNumeric(int ch);
193 // Check if a specific tag needs a close tag. If not this function should return FALSE.
194 // If no close tag is needed the result will be that the tag will be insert in a none
195 // hierarchical way. i.e. if the function would return FALSE all the time we would get
196 // a flat list of all tags (like it used to be previously).
197 virtual bool IsCloseTagNeeded(const wxString
&name
);
199 // Encode/Decode Special Characters like:
200 // > Begins a tag. >
201 // < Ends a tag. <
202 // " Quotation mark. "
203 // ' Apostrophe. '
204 // & Ampersand. &
205 static void DecodeSpecialChars(wxString
&value
);
206 static wxString
EncodeSpecialChars(const wxString
&value
);
208 // Matches this string (case insensitive)
209 bool Matches(const wxString
& tok
, bool eatIt
= FALSE
) ;
210 bool Eof() const { return (m_pos
>= m_length
); }
211 bool Eof(int pos
) const { return (pos
>= m_length
); }
213 void SetPosition(int pos
) { m_pos
= pos
; }
217 wxSimpleHtmlTag
* GetTopLevelTag() const { return m_topLevel
; }
219 // Safe way of getting a character
220 int GetChar(size_t i
) const;
224 wxSimpleHtmlTag
* m_topLevel
;
225 int m_pos
; // Position in string
226 int m_length
; // Length of string
227 wxString m_text
; // The actual text
/*
 * wxSimpleHtmlTagSpec
 * Describes a tag, and what type it is.
 * wxSimpleHtmlModule will initialise/clean up a list of these, one per tag type.
 */
238 class wxSimpleHtmlTagSpec
: public wxObject
242 wxSimpleHtmlTagSpec(const wxString
& name
, int type
);
245 static void AddTagSpec(wxSimpleHtmlTagSpec
* spec
);
249 const wxString
& GetName() const { return m_name
; }
250 int GetType() const { return m_type
; }
257 static wxList
* sm_tagSpecs
;
/*
 * wxSimpleHtmlModule
 * Responsible for init/cleanup of appropriate data structures
 */
265 class wxSimpleHtmlModule
: public wxModule
267 DECLARE_DYNAMIC_CLASS(wxSimpleHtmlModule
)
270 wxSimpleHtmlModule() {};