/////////////////////////////////////////////////////////////////////////////
// Name:        htmlparser.h
// Purpose:     Simple HTML parser
// Author:      Julian Smart
// Copyright:   (c) Julian Smart
// Licence:     wxWindows license
/////////////////////////////////////////////////////////////////////////////
12 #ifndef _HTMLPARSER_H_
13 #define _HTMLPARSER_H_
15 //#include "wx/module.h"
16 #include "wx/stream.h"
/*
 * wxSimpleHtmlAttribute
 * Representation of an attribute
 */
23 class wxSimpleHtmlAttribute
25 friend class wxSimpleHtmlTag
;
27 wxSimpleHtmlAttribute(const wxString
& name
, const wxString
& value
)
29 m_name
= name
; m_value
= value
; m_next
= NULL
;
33 // Write this attribute
34 void Write(wxOutputStream
& stream
);
37 const wxString
& GetName() const { return m_name
; }
38 const wxString
& GetValue() const { return m_value
; }
39 void SetName(const wxString
& name
) { m_name
= name
; }
40 void SetValue(const wxString
& value
) { m_value
= value
; }
42 wxSimpleHtmlAttribute
* GetNextAttribute() { return m_next
; }
43 void SetNextAttribute(wxSimpleHtmlAttribute
* attr
) { m_next
= attr
; }
45 bool HasName(const wxString
& name
) const { return (0 == m_name
.CmpNoCase(name
)); }
46 bool HasValue(const wxString
& val
) const { return (0 == m_value
.CmpNoCase(val
)); }
51 wxSimpleHtmlAttribute
* m_next
;
/*
 * Representation of a tag or chunk of text
 */
// Tag type constants. The enumerators take consecutive values starting
// at 0, in the order listed.
enum
{
    wxSimpleHtmlTag_Text,            // 0: a chunk of text
    wxSimpleHtmlTag_TopLevel,        // 1
    wxSimpleHtmlTag_Open,            // 2
    wxSimpleHtmlTag_Close,           // 3
    wxSimpleHtmlTag_Directive,       // 4
    wxSimpleHtmlTag_XMLDeclaration   // 5
};
65 wxSimpleHtmlTag(const wxString
& tagName
, int tagType
);
69 void ClearAttributes();
70 wxSimpleHtmlAttribute
* FindAttribute(const wxString
& name
) const ;
71 void AppendAttribute(const wxString
& name
, const wxString
& value
);
73 // Remove 1 tag from the child list.
74 void RemoveChild(wxSimpleHtmlTag
*remove
);
75 // Appaned tag to the end of the child list.
76 void AppendTag(wxSimpleHtmlTag
* tag
);
77 // Insert tag after ourself in the parents child list.
78 void AppendTagAfterUs(wxSimpleHtmlTag
* tag
);
80 void Write(wxOutputStream
& stream
);
82 // Gets the text from this tag and its descendants
83 wxString
GetTagText();
86 const wxString
& GetName() const { return m_name
; }
87 void SetName(const wxString
& name
) { m_name
= name
; }
89 int GetType() const { return m_type
; }
90 void SetType(int t
) { m_type
= t
; }
92 // If type is wxSimpleHtmlTag_Text, m_text will contain some text.
93 const wxString
& GetText() const { return m_text
; }
94 void SetText(const wxString
& text
) { m_text
= text
; }
96 wxSimpleHtmlAttribute
* GetFirstAttribute() { return m_attributes
; }
97 void SetFirstAttribute(wxSimpleHtmlAttribute
* attr
) { m_attributes
= attr
; }
99 int GetAttributeCount() const ;
100 wxSimpleHtmlAttribute
* GetAttribute(int i
) const ;
102 wxSimpleHtmlTag
* GetChildren() const { return m_children
; }
103 void SetChildren(wxSimpleHtmlTag
* children
) { m_children
= children
; }
105 wxSimpleHtmlTag
* GetParent() const { return m_parent
; }
106 void SetParent(wxSimpleHtmlTag
* parent
) { m_parent
= parent
; }
107 int GetChildCount() const;
108 wxSimpleHtmlTag
* GetChild(int i
) const;
109 wxSimpleHtmlTag
* GetNext() const { return m_next
; }
111 //// Convenience accessors & search functions
112 bool NameIs(const wxString
& name
) { return (m_name
.CmpNoCase(name
) == 0); }
113 bool HasAttribute(const wxString
& name
, const wxString
& value
) const;
114 bool HasAttribute(const wxString
& name
) const;
115 bool GetAttributeValue(wxString
& value
, const wxString
& attrName
);
117 // Search forward from this tag until we find a tag with this name & optionally attribute
118 wxSimpleHtmlTag
* FindTag(const wxString
& tagName
, const wxString
& attrName
= wxEmptyString
);
120 // Gather the text until we hit the given close tag
121 bool FindTextUntilTagClose(wxString
& text
, const wxString
& tagName
);
127 wxSimpleHtmlAttribute
* m_attributes
;
130 wxSimpleHtmlTag
* m_children
;
131 wxSimpleHtmlTag
* m_next
; // Next sibling
132 wxSimpleHtmlTag
* m_parent
;
/*
 * Simple HTML parser, for such tasks as scanning HTML for keywords, contents, etc.
 */
140 class wxSimpleHtmlParser
: public wxObject
144 wxSimpleHtmlParser();
145 virtual ~wxSimpleHtmlParser();
148 bool ParseFile(const wxString
& filename
);
149 bool ParseString(const wxString
& str
);
152 void Write(wxOutputStream
& stream
);
153 bool WriteFile(wxString
& filename
);
157 // Main recursive parsing function
158 bool ParseHtml(wxSimpleHtmlTag
* parent
);
160 wxSimpleHtmlTag
* ParseTagHeader();
161 wxSimpleHtmlTag
* ParseTagClose();
162 bool ParseAttributes(wxSimpleHtmlTag
* tag
);
163 wxSimpleHtmlTag
* ParseDirective(); // e.g. <!DOCTYPE ....>
164 wxSimpleHtmlTag
* ParseXMLDeclaration(); // e.g. <?xml .... ?>
165 bool ParseComment(); // Throw away comments
166 // Plain text, up until an angled bracket
167 bool ParseText(wxString
& text
);
169 bool EatWhitespace(); // Throw away whitespace
170 bool EatWhitespace(int& pos
); // Throw away whitespace: using 'pos'
171 bool ReadString(wxString
& str
, bool eatIt
= false);
172 bool ReadWord(wxString
& str
, bool eatIt
= false);
173 bool ReadNumber(wxString
& str
, bool eatIt
= false);
174 // Could be number, string, whatever, but read up until whitespace.
175 bool ReadLiteral(wxString
& str
, bool eatIt
= false);
// Takes no argument (declaration only).
// NOTE(review): presumably tests the input at the current position -- confirm.
bool IsXMLDeclaration();

// Character classification helpers; each takes a character code as int
// (declarations only).
bool IsTagStartBracket(int ch);
bool IsTagEndBracket(int ch);
bool IsWhitespace(int ch);
bool IsAlpha(int ch);
bool IsWordChar(int ch);
bool IsNumeric(int ch);
189 // Check if a specific tag needs a close tag. If not this function should return false.
190 // If no close tag is needed the result will be that the tag will be insert in a none
191 // hierarchical way. i.e. if the function would return false all the time we would get
192 // a flat list of all tags (like it used to be previously).
193 virtual bool IsCloseTagNeeded(const wxString
&name
);
195 // Encode/Decode Special Characters like:
196 // > Begins a tag. >
197 // < Ends a tag. <
198 // " Quotation mark. "
199 // ' Apostrophe. '
200 // & Ampersand. &
201 static void DecodeSpecialChars(wxString
&value
);
202 static wxString
EncodeSpecialChars(const wxString
&value
);
204 // Matches this string (case insensitive)
205 bool Matches(const wxString
& tok
, bool eatIt
= false) ;
206 bool Eof() const { return (m_pos
>= m_length
); }
207 bool Eof(int pos
) const { return (pos
>= m_length
); }
209 void SetPosition(int pos
) { m_pos
= pos
; }
213 wxSimpleHtmlTag
* GetTopLevelTag() const { return m_topLevel
; }
215 // Safe way of getting a character
216 int GetChar(size_t i
) const;
220 wxSimpleHtmlTag
* m_topLevel
;
221 int m_pos
; // Position in string
222 int m_length
; // Length of string
223 wxString m_text
; // The actual text
/*
 * wxSimpleHtmlTagSpec
 * Describes a tag, and what type it is.
 * wxSimpleHtmlModule will initialise/cleanup a list of these, one per tag type
 */
234 class wxSimpleHtmlTagSpec
: public wxObject
238 wxSimpleHtmlTagSpec(const wxString
& name
, int type
);
241 static void AddTagSpec(wxSimpleHtmlTagSpec
* spec
);
245 const wxString
& GetName() const { return m_name
; }
246 int GetType() const { return m_type
; }
253 static wxList
* sm_tagSpecs
;
/*
 * Responsible for init/cleanup of appropriate data structures
 */
261 class wxSimpleHtmlModule
: public wxModule
263 DECLARE_DYNAMIC_CLASS(wxSimpleHtmlModule
)
266 wxSimpleHtmlModule() {};