1 /////////////////////////////////////////////////////////////////////////////
2 // Name: htmlparser.cpp
3 // Purpose: Simple HTML parser
4 // Author: Julian Smart
8 // Copyright: (c) Julian Smart
9 // Licence: wxWindows license
10 /////////////////////////////////////////////////////////////////////////////
12 // ----------------------------------------------------------------------------
14 // ----------------------------------------------------------------------------
16 // For compilers that support precompilation, includes "wx/wx.h".
17 #include "wx/wxprec.h"
23 #include "wx/wfstream.h"
24 #include "wx/textfile.h"
25 #include "wx/txtstrm.h"
26 #include "htmlparser.h"
28 /// Useful insertion operators for wxOutputStream.
29 static wxOutputStream
& operator <<(wxOutputStream
& stream
, const wxString
& s
)
31 wxTextOutputStream
txt(stream
); // This is to make sure the line-ending is native!
37 #if 0 // Gives warning because not used...
38 static wxOutputStream
& operator <<(wxOutputStream
& stream
, long l
)
45 static wxOutputStream
& operator <<(wxOutputStream
& stream
, const char c
)
54 * wxSimpleHtmlParser
55 * Simple HTML parser
58 wxSimpleHtmlParser::wxSimpleHtmlParser()
65 wxSimpleHtmlParser::~wxSimpleHtmlParser()
70 bool wxSimpleHtmlParser::ParseFile(const wxString
& filename
)
74 if (textFile
.Open(filename
))
79 int count
= textFile
.GetLineCount();
80 for (i
= 0; i
< count
; i
++)
83 line
= textFile
.GetFirstLine();
85 line
= textFile
.GetNextLine();
93 for ( line
= textFile
.GetFirstLine(); !textFile
.Eof(); line
= textFile
.GetNextLine() )
101 return ParseString(text
);
107 bool wxSimpleHtmlParser::ParseString(const wxString
& str
)
113 m_length
= str
.Length();
115 m_topLevel
= new wxSimpleHtmlTag(wxT("TOPLEVEL"), wxSimpleHtmlTag_TopLevel
);
117 bool bResult
= ParseHtml(m_topLevel
);
119 wxASSERT(bResult
); // Failed to parse the TAGs.
120 // Hint: Check if every open tag has a close tag!
125 // Main recursive parsing function
126 bool wxSimpleHtmlParser::ParseHtml(wxSimpleHtmlTag
* parent
)
138 else if (IsDirective())
140 wxSimpleHtmlTag
* tag
= ParseDirective();
142 parent
->AppendTag(tag
);
144 else if (IsXMLDeclaration())
146 wxSimpleHtmlTag
* tag
= ParseXMLDeclaration();
148 parent
->AppendTag(tag
);
150 else if (IsTagClose())
152 wxSimpleHtmlTag
* tag
= ParseTagClose();
155 if (IsCloseTagNeeded(tag
->GetName()))
157 if (!parent
->GetParent())
159 parent
->GetParent()->AppendTag(tag
);
163 parent
->AppendTag(tag
);
166 else if (IsTagStartBracket(GetChar(m_pos
)))
168 wxSimpleHtmlTag
* tag
= ParseTagHeader();
170 parent
->AppendTag(tag
);
172 if (IsCloseTagNeeded(tag
->GetName()))
175 return false; // Something didn't go ok, so don't continue.
180 // Just a text string
184 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(wxT("TEXT"), wxSimpleHtmlTag_Text
);
186 if(parent
->GetParent())
187 parent
->GetParent()->AppendTag(tag
);
189 parent
->AppendTag(tag
); // When this occurs it is probably the
190 // empty lines at the end of the file...
196 // Plain text, up until an angled bracket
197 bool wxSimpleHtmlParser::ParseText(wxString
& text
)
199 while (!Eof() && GetChar(m_pos
) != wxT('<'))
201 text
+= (wxChar
)GetChar(m_pos
);
204 DecodeSpecialChars(text
);
208 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseTagHeader()
210 if (IsTagStartBracket(GetChar(m_pos
)))
216 ReadWord(word
, true);
220 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Open
);
222 ParseAttributes(tag
);
226 if (IsTagEndBracket(GetChar(m_pos
)))
235 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseTagClose()
237 Matches(wxT("</"), true);
242 ReadWord(word
, true);
247 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Close
);
251 bool wxSimpleHtmlParser::ParseAttributes(wxSimpleHtmlTag
* tag
)
253 // Parse attributes of a tag header until we reach >
254 while (!IsTagEndBracket(GetChar(m_pos
)) && !Eof())
258 wxString attrName
, attrValue
;
262 ReadString(attrName
, true);
263 tag
->AppendAttribute(attrName
, wxEmptyString
);
265 else if (IsNumeric(GetChar(m_pos
)))
267 ReadNumber(attrName
, true);
268 tag
->AppendAttribute(attrName
, wxEmptyString
);
272 // Try to read an attribute name/value pair, or at least a name
274 ReadLiteral(attrName
, true);
277 if (GetChar(m_pos
) == wxT('='))
283 ReadString(attrValue
, true);
284 else if (!Eof() && !IsTagEndBracket(GetChar(m_pos
)))
285 ReadLiteral(attrValue
, true);
287 if (!attrName
.IsEmpty())
288 tag
->AppendAttribute(attrName
, attrValue
);
294 // e.g. <!DOCTYPE ....>
295 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseDirective()
297 Matches(wxT("<!"), true);
302 ReadWord(word
, true);
306 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Directive
);
308 ParseAttributes(tag
);
312 if (IsTagEndBracket(GetChar(m_pos
)))
318 // e.g. <?xml .... ?>
319 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseXMLDeclaration()
321 Matches(wxT("<?"), true);
326 ReadWord(word
, true);
330 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_XMLDeclaration
);
332 ParseAttributes(tag
);
336 if (IsTagEndBracket(GetChar(m_pos
)))
342 bool wxSimpleHtmlParser::ParseComment()
344 // Eat the comment tag start
345 Matches(wxT("<!--"), true);
347 while (!Eof() && !Matches(wxT("-->"), true))
355 bool wxSimpleHtmlParser::EatWhitespace()
357 while (!Eof() && IsWhitespace(GetChar(m_pos
)))
362 bool wxSimpleHtmlParser::EatWhitespace(int& pos
)
364 while (!Eof(pos
) && IsWhitespace(GetChar(pos
)))
369 bool wxSimpleHtmlParser::ReadString(wxString
& str
, bool eatIt
)
372 if (GetChar(pos
) == (int) '"')
375 while (!Eof(pos
) && GetChar(pos
) != (int) '"')
377 // TODO: how are quotes escaped in HTML?
378 str
+= (wxChar
) GetChar(pos
);
381 if (GetChar(pos
) == (int) '"')
385 DecodeSpecialChars(str
);
392 bool wxSimpleHtmlParser::ReadWord(wxString
& str
, bool eatIt
)
396 if (!IsAlpha(GetChar(pos
)))
399 str
+= (wxChar
) GetChar(pos
) ;
402 while (!Eof(pos
) && IsWordChar(GetChar(pos
)))
404 str
+= (wxChar
) GetChar(pos
);
409 DecodeSpecialChars(str
);
413 bool wxSimpleHtmlParser::ReadNumber(wxString
& str
, bool eatIt
)
417 if (!IsNumeric(GetChar(pos
)))
420 str
+= (wxChar
) GetChar(pos
) ;
423 while (!Eof(pos
) && IsNumeric(GetChar(pos
)))
425 str
+= (wxChar
) GetChar(pos
);
430 DecodeSpecialChars(str
);
434 // Could be number, string, whatever, but read up until whitespace or end of tag (but not a quoted string)
435 bool wxSimpleHtmlParser::ReadLiteral(wxString
& str
, bool eatIt
)
439 while (!Eof(pos
) && !IsWhitespace(GetChar(pos
)) && !IsTagEndBracket(GetChar(pos
)) && GetChar(pos
) != wxT('='))
441 str
+= (wxChar
)GetChar(pos
);
446 DecodeSpecialChars(str
);
450 bool wxSimpleHtmlParser::IsComment()
452 return Matches(wxT("<!--"));
455 bool wxSimpleHtmlParser::IsDirective()
457 return Matches(wxT("<!"));
460 bool wxSimpleHtmlParser::IsXMLDeclaration()
462 return Matches(wxT("<?xml"));
465 bool wxSimpleHtmlParser::IsString()
467 return (GetChar(m_pos
) == (int) '"') ;
470 bool wxSimpleHtmlParser::IsWord()
472 return (IsAlpha(GetChar(m_pos
)));
475 bool wxSimpleHtmlParser::IsTagClose()
477 return Matches(wxT("</"));
480 bool wxSimpleHtmlParser::IsTagStartBracket(int ch
)
482 return (ch
== wxT('<'));
485 bool wxSimpleHtmlParser::IsTagEndBracket(int ch
)
487 return (ch
== wxT('>'));
490 bool wxSimpleHtmlParser::IsWhitespace(int ch
)
492 return ((ch
== 13) || (ch
== 10) || (ch
== 32) || (ch
== (int) '\t')) ;
495 bool wxSimpleHtmlParser::IsAlpha(int ch
)
497 return (wxIsalpha((wxChar
) ch
) != 0);
500 bool wxSimpleHtmlParser::IsWordChar(int ch
)
502 return (wxIsalpha((wxChar
) ch
) != 0 || ch
== wxT('-') || ch
== wxT('_') || IsNumeric(ch
));
505 bool wxSimpleHtmlParser::IsNumeric(int ch
)
507 return (wxIsdigit((wxChar
) ch
) != 0 || ch
== wxT('-') || ch
== wxT('.')) ;
510 bool wxSimpleHtmlParser::IsCloseTagNeeded(const wxString
&name
)
512 if (name
.IsSameAs(wxT("P"), false)) // e.g <P>
515 // TODO: add more items here.
520 // Encode/Decode Special Characters.
521 // See here for the used table: http://msdn.microsoft.com/library/default.asp?url=/library/en-us/xmlsql/ac_xml1_1nqk.asp
522 /* static */ void wxSimpleHtmlParser::DecodeSpecialChars(wxString
&value
)
525 value
.Replace(wxT(">"), wxT(">"), true);
526 value
.Replace(wxT("<"), wxT("<"), true);
527 value
.Replace(wxT("""), wxT("\""), true);
528 value
.Replace(wxT("'"), wxT("'"), true);
529 value
.Replace(wxT("&"), wxT("&"), true); // Note: do this as last to prevent replace problems.
532 /* static */ wxString
wxSimpleHtmlParser::EncodeSpecialChars(const wxString
&value
)
534 wxString newvalue
= value
;
537 newvalue
.Replace(wxT("&"), wxT("&"), true); // Note: do this as first to prevent replace problems.
538 newvalue
.Replace(wxT(">"), wxT(">"), true);
539 newvalue
.Replace(wxT("<"), wxT("<"), true);
540 newvalue
.Replace(wxT("\""),wxT("""), true);
541 newvalue
.Replace(wxT("'"), wxT("'"), true);
546 // Matches this string (case insensitive)
547 bool wxSimpleHtmlParser::Matches(const wxString
& tok
, bool eatIt
)
549 wxString
text(m_text
.Mid(m_pos
, tok
.Length()));
550 bool success
= (text
.CmpNoCase(tok
) == 0) ;
551 if (success
&& eatIt
)
553 m_pos
+= tok
.Length();
558 // Safe way of getting a character
559 int wxSimpleHtmlParser::GetChar(size_t i
) const
561 if (i
>= (size_t) m_length
)
566 void wxSimpleHtmlParser::Clear()
571 m_text
= wxEmptyString
;
577 void wxSimpleHtmlParser::Write(wxOutputStream
& stream
)
580 m_topLevel
->Write(stream
);
583 bool wxSimpleHtmlParser::WriteFile(wxString
& filename
)
585 wxFileOutputStream
fstream(filename
);
597 * Representation of a tag or chunk of text
600 wxSimpleHtmlTag::wxSimpleHtmlTag(const wxString
& tagName
, int tagType
)
610 wxSimpleHtmlTag::~wxSimpleHtmlTag()
617 void wxSimpleHtmlTag::ClearAttributes()
621 wxSimpleHtmlAttribute
* attr
= m_attributes
;
624 wxSimpleHtmlAttribute
* next
= attr
->m_next
;
634 wxSimpleHtmlAttribute
* wxSimpleHtmlTag::FindAttribute(const wxString
& name
) const
636 wxSimpleHtmlAttribute
* attr
= m_attributes
;
639 if (attr
->GetName().CmpNoCase(name
) == 0)
648 void wxSimpleHtmlTag::AppendAttribute(const wxString
& name
, const wxString
& value
)
650 wxSimpleHtmlAttribute
* attr
= new wxSimpleHtmlAttribute(name
, value
);
654 wxSimpleHtmlAttribute
* last
= m_attributes
;
664 void wxSimpleHtmlTag::ClearChildren()
668 wxSimpleHtmlTag
* child
= m_children
;
671 wxSimpleHtmlTag
* next
= child
->m_next
;
673 child
->m_next
= NULL
;
681 void wxSimpleHtmlTag::RemoveChild(wxSimpleHtmlTag
*remove
)
685 wxSimpleHtmlTag
* child
= m_children
;
686 wxSimpleHtmlTag
* prev
= NULL
;
689 wxSimpleHtmlTag
* next
= child
->m_next
;
693 child
->m_next
= NULL
;
709 void wxSimpleHtmlTag::AppendTag(wxSimpleHtmlTag
* tag
)
717 wxSimpleHtmlTag
* last
= m_children
;
728 tag
->m_parent
= this;
731 void wxSimpleHtmlTag::AppendTagAfterUs(wxSimpleHtmlTag
* tag
)
736 tag
->m_parent
= m_parent
;
737 tag
->m_next
= m_next
;
741 // Gets the text from this tag and its descendants
742 wxString
wxSimpleHtmlTag::GetTagText()
747 wxSimpleHtmlTag
* tag
= m_children
;
750 text
+= tag
->GetTagText();
755 else if (GetType() == wxSimpleHtmlTag_Text
)
758 return wxEmptyString
;
761 int wxSimpleHtmlTag::GetAttributeCount() const
764 wxSimpleHtmlAttribute
* attr
= m_attributes
;
773 wxSimpleHtmlAttribute
* wxSimpleHtmlTag::GetAttribute(int i
) const
776 wxSimpleHtmlAttribute
* attr
= m_attributes
;
787 int wxSimpleHtmlTag::GetChildCount() const
790 wxSimpleHtmlTag
* tag
= m_children
;
799 bool wxSimpleHtmlTag::HasAttribute(const wxString
& name
, const wxString
& value
) const
801 wxSimpleHtmlAttribute
* attr
= FindAttribute(name
);
803 return (attr
&& (attr
->GetValue().CmpNoCase(value
) == 0)) ;
806 bool wxSimpleHtmlTag::HasAttribute(const wxString
& name
) const
808 return FindAttribute(name
) != NULL
;
811 bool wxSimpleHtmlTag::GetAttributeValue(wxString
& value
, const wxString
& attrName
)
813 wxSimpleHtmlAttribute
* attr
= FindAttribute(attrName
);
816 value
= attr
->GetValue();
823 // Search forward from this tag until we find a tag with this name & attribute
824 wxSimpleHtmlTag
* wxSimpleHtmlTag::FindTag(const wxString
& tagName
, const wxString
& attrName
)
826 wxSimpleHtmlTag
* tag
= m_next
;
829 if (tag
->NameIs(tagName
) && (attrName
.IsEmpty() || tag
->FindAttribute(attrName
)))
837 bool wxSimpleHtmlTag::FindTextUntilTagClose(wxString
& text
, const wxString
& tagName
)
839 wxSimpleHtmlTag
* tag
= this;
842 if (tag
->GetType() == wxSimpleHtmlTag_Close
&& tag
->NameIs(tagName
))
845 if (tag
->GetType() == wxSimpleHtmlTag_Text
)
846 text
+= tag
->GetText();
854 wxSimpleHtmlTag
* wxSimpleHtmlTag::GetChild(int i
) const
857 wxSimpleHtmlTag
* tag
= m_children
;
869 void wxSimpleHtmlTag::Write(wxOutputStream
& stream
)
871 // Some helpers to layout the open and close tags.
872 static bool sbUseTab
= true;
873 static size_t snTabLevel
= 0;
875 #if 0 // Enable if no tabs should be used to align the tags.
879 // Handle the different types of tags we can write.
882 case wxSimpleHtmlTag_Text
:
884 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_text
);
887 case wxSimpleHtmlTag_Open
:
890 for(tab
= 0; tab
< snTabLevel
; tab
++)
892 stream
<< wxT("<") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
893 if (GetAttributeCount() > 0)
896 for (i
= 0; i
< GetAttributeCount(); i
++)
898 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
900 if (i
< GetAttributeCount() - 1)
905 sbUseTab
= false; // We're putting the open and close tags on the same line,
906 // so we don't want any tabs
912 stream
<< wxT(">\n");
917 case wxSimpleHtmlTag_Directive
:
919 stream
<< wxT("<!") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(" ");
921 for (i
= 0; i
< GetAttributeCount(); i
++)
923 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
925 if (i
< GetAttributeCount() - 1)
928 stream
<< wxT(">\n");
931 case wxSimpleHtmlTag_XMLDeclaration
:
933 stream
<< wxT("<?") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(" ");
935 for (i
= 0; i
< GetAttributeCount(); i
++)
937 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
939 if (i
< GetAttributeCount() - 1)
942 stream
<< wxT(">\n\n");
945 case wxSimpleHtmlTag_Close
:
947 if (snTabLevel
) // Safety to prevent going around...
948 snTabLevel
--; // Reduce the tab level
949 if (sbUseTab
) // Do we write the open tag and close tag on another line?
952 for(tab
= 0; tab
< snTabLevel
; tab
++)
955 stream
<< wxT("</") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(">\n");
964 wxSimpleHtmlTag
* tag
= m_children
;
973 void wxSimpleHtmlAttribute::Write(wxOutputStream
& stream
)
975 if (m_value
.IsEmpty())
976 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
979 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
980 stream
<< wxT("=\"");
981 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_value
);