1 /////////////////////////////////////////////////////////////////////////////
2 // Name: htmlparser.cpp
3 // Purpose: Simple HTML parser
4 // Author: Julian Smart
8 // Copyright: (c) Julian Smart
9 // Licence: wxWindows license
10 /////////////////////////////////////////////////////////////////////////////
12 // ----------------------------------------------------------------------------
14 // ----------------------------------------------------------------------------
15 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
16 #pragma implementation "htmlparser.h"
19 // For compilers that support precompilation, includes "wx/wx.h".
20 #include "wx/wxprec.h"
26 #include "wx/wfstream.h"
27 #include "wx/textfile.h"
28 #include "wx/txtstrm.h"
29 #include "htmlparser.h"
31 /// Useful insertion operators for wxOutputStream.
// Stream-insertion helper for wxString values.
// The visible code wraps `stream` in a wxTextOutputStream (`txt`) so that line
// endings are emitted in the platform's native form.
// NOTE(review): the statements that write `s` and return the stream are not
// visible in this extract - confirm against the full file.
32 static wxOutputStream
& operator <<(wxOutputStream
& stream
, const wxString
& s
)
34 wxTextOutputStream
txt(stream
); // This is to make sure the line-ending is native!
// Compiled-out (#if 0) insertion operators taking a long and a char. The
// original note says they were disabled because the unused definitions
// produced a compiler warning.
// NOTE(review): their bodies and the closing #endif are not visible in this extract.
40 #if 0 // Gives warning because not used...
41 static wxOutputStream
& operator <<(wxOutputStream
& stream
, long l
)
48 static wxOutputStream
& operator <<(wxOutputStream
& stream
, const char c
)
57 * wxSimpleHtmlParser
58 * Simple HTML parser
// Default constructor (takes no arguments).
// NOTE(review): the body is not visible in this extract; confirm which members
// it initialises in the full file.
61 wxSimpleHtmlParser::wxSimpleHtmlParser()
// Destructor.
// NOTE(review): the body is not visible in this extract; presumably it releases
// the parsed tag tree - confirm in the full file.
68 wxSimpleHtmlParser::~wxSimpleHtmlParser()
// Loads the file named by `filename` and parses its contents.
// From the visible lines: the file is opened through `textFile`, its lines are
// walked, and the result of ParseString(text) is returned.
// NOTE(review): two line loops are visible (an index-based one driven by
// GetLineCount() and one driven by Eof()); presumably they are alternative
// branches chosen at compile time. The selecting directive, the code that
// accumulates `text`, and the failure path when Open() fails are not visible
// in this extract - confirm in the full file.
73 bool wxSimpleHtmlParser::ParseFile(const wxString
& filename
)
77 if (textFile
.Open(filename
))
82 int count
= textFile
.GetLineCount();
83 for (i
= 0; i
< count
; i
++)
86 line
= textFile
.GetFirstLine();
88 line
= textFile
.GetNextLine();
96 for ( line
= textFile
.GetFirstLine(); !textFile
.Eof(); line
= textFile
.GetNextLine() )
104 return ParseString(text
);
// Parses the HTML text held in `str`.
// Visible steps: store the input length in m_length, create the root tag
// (named "TOPLEVEL", type wxSimpleHtmlTag_TopLevel) in m_topLevel, run
// ParseHtml() on that root, and wxASSERT that the result is true.
// NOTE(review): any reset of previous state, the copy of `str` into the parser
// and the return statement are not visible in this extract.
110 bool wxSimpleHtmlParser::ParseString(const wxString
& str
)
116 m_length
= str
.Length();
118 m_topLevel
= new wxSimpleHtmlTag(wxT("TOPLEVEL"), wxSimpleHtmlTag_TopLevel
);
120 bool bResult
= ParseHtml(m_topLevel
);
122 wxASSERT(bResult
); // Failed to parse the TAGs.
123 // Hint: Check if every open tag has a close tag!
128 // Main recursive parsing function
// Dispatches on what is found at the current position and attaches the
// resulting tag under `parent`. Visible branches:
//   - directive ("<!...")      -> ParseDirective(), appended to parent
//   - XML declaration          -> ParseXMLDeclaration(), appended to parent
//   - closing tag ("</...")    -> ParseTagClose(); when IsCloseTagNeeded() holds
//                                 for its name, parent->GetParent() is consulted
//                                 and the tag is appended to the grandparent
//   - tag start bracket        -> ParseTagHeader(), appended to parent
//   - otherwise                -> a "TEXT" tag, appended to the grandparent if
//                                 there is one, else to parent
// NOTE(review): the first branch of the if-chain, the enclosing loop, the
// recursive calls and most return statements are not visible in this extract;
// confirm the exact control flow in the full file.
129 bool wxSimpleHtmlParser::ParseHtml(wxSimpleHtmlTag
* parent
)
141 else if (IsDirective())
143 wxSimpleHtmlTag
* tag
= ParseDirective();
145 parent
->AppendTag(tag
);
147 else if (IsXMLDeclaration())
149 wxSimpleHtmlTag
* tag
= ParseXMLDeclaration();
151 parent
->AppendTag(tag
);
153 else if (IsTagClose())
155 wxSimpleHtmlTag
* tag
= ParseTagClose();
158 if (IsCloseTagNeeded(tag
->GetName()))
160 if (!parent
->GetParent())
162 parent
->GetParent()->AppendTag(tag
);
166 parent
->AppendTag(tag
);
169 else if (IsTagStartBracket(GetChar(m_pos
)))
171 wxSimpleHtmlTag
* tag
= ParseTagHeader();
173 parent
->AppendTag(tag
);
175 if (IsCloseTagNeeded(tag
->GetName()))
178 return false; // Something didn't go ok, so don't continue.
183 // Just a text string
187 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(wxT("TEXT"), wxSimpleHtmlTag_Text
);
189 if(parent
->GetParent())
190 parent
->GetParent()->AppendTag(tag
);
192 parent
->AppendTag(tag
); // When this occurs it is probably the
193 // empty lines at the end of the file...
199 // Plain text, up until an angled bracket
// Appends characters to `text` while input remains and the character at m_pos
// is not '<', then runs DecodeSpecialChars() on the collected text.
// NOTE(review): the statement that advances m_pos and the return value are not
// visible in this extract.
200 bool wxSimpleHtmlParser::ParseText(wxString
& text
)
202 while (!Eof() && GetChar(m_pos
) != wxT('<'))
204 text
+= (wxChar
)GetChar(m_pos
);
207 DecodeSpecialChars(text
);
// Parses an opening tag. Visible steps: test for the tag start bracket at
// m_pos, read the tag name with ReadWord(), create a wxSimpleHtmlTag_Open tag
// carrying that name, parse its attributes, then test for the tag end bracket.
// NOTE(review): how the brackets are consumed, what happens on failure and the
// returned pointer are not visible in this extract.
211 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseTagHeader()
213 if (IsTagStartBracket(GetChar(m_pos
)))
219 ReadWord(word
, true);
223 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Open
);
225 ParseAttributes(tag
);
229 if (IsTagEndBracket(GetChar(m_pos
)))
// Parses a closing tag: consumes "</", reads the tag name with ReadWord() and
// creates a wxSimpleHtmlTag_Close tag for it.
// NOTE(review): consumption of the trailing '>' and the return statement are
// not visible in this extract.
238 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseTagClose()
240 Matches(wxT("</"), true);
245 ReadWord(word
, true);
250 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Close
);
// Reads attributes into `tag` until the tag end bracket or the end of input.
// Three forms are visible:
//   - a value read with ReadString(), stored as a name with an empty value
//   - a value read with ReadNumber(), stored as a name with an empty value
//   - a literal name, optionally followed by '=' and a value read with
//     ReadString() or ReadLiteral(); stored only when the name is non-empty
// NOTE(review): the condition guarding the first form, whitespace skipping and
// the return value are not visible in this extract.
254 bool wxSimpleHtmlParser::ParseAttributes(wxSimpleHtmlTag
* tag
)
256 // Parse attributes of a tag header until we reach >
257 while (!IsTagEndBracket(GetChar(m_pos
)) && !Eof())
261 wxString attrName
, attrValue
;
265 ReadString(attrName
, true);
266 tag
->AppendAttribute(attrName
, wxEmptyString
);
268 else if (IsNumeric(GetChar(m_pos
)))
270 ReadNumber(attrName
, true);
271 tag
->AppendAttribute(attrName
, wxEmptyString
);
275 // Try to read an attribute name/value pair, or at least a name
277 ReadLiteral(attrName
, true);
280 if (GetChar(m_pos
) == wxT('='))
286 ReadString(attrValue
, true);
287 else if (!Eof() && !IsTagEndBracket(GetChar(m_pos
)))
288 ReadLiteral(attrValue
, true);
290 if (!attrName
.IsEmpty())
291 tag
->AppendAttribute(attrName
, attrValue
);
297 // e.g. <!DOCTYPE ....>
// Parses a directive: consumes "<!", reads the directive name with ReadWord(),
// creates a wxSimpleHtmlTag_Directive tag, parses its attributes and then
// tests for the tag end bracket.
// NOTE(review): consumption of '>' and the return statement are not visible in
// this extract.
298 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseDirective()
300 Matches(wxT("<!"), true);
305 ReadWord(word
, true);
309 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Directive
);
311 ParseAttributes(tag
);
315 if (IsTagEndBracket(GetChar(m_pos
)))
321 // e.g. <?xml .... ?>
// Parses an XML declaration: consumes "<?", reads the name with ReadWord(),
// creates a wxSimpleHtmlTag_XMLDeclaration tag, parses its attributes and then
// tests for the tag end bracket.
// NOTE(review): consumption of '>' and the return statement are not visible in
// this extract.
322 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseXMLDeclaration()
324 Matches(wxT("<?"), true);
329 ReadWord(word
, true);
333 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_XMLDeclaration
);
335 ParseAttributes(tag
);
339 if (IsTagEndBracket(GetChar(m_pos
)))
// Skips an HTML comment: consumes "<!--", then loops until "-->" is matched
// (and consumed) or the input ends.
// NOTE(review): the loop body and the return value are not visible in this
// extract.
345 bool wxSimpleHtmlParser::ParseComment()
347 // Eat the comment tag start
348 Matches(wxT("<!--"), true);
350 while (!Eof() && !Matches(wxT("-->"), true))
// Loops while input remains and the character at m_pos is whitespace.
// NOTE(review): the loop body (presumably advancing m_pos) and the return
// value are not visible in this extract.
358 bool wxSimpleHtmlParser::EatWhitespace()
360 while (!Eof() && IsWhitespace(GetChar(m_pos
)))
// Overload working on a caller-supplied position: loops while `pos` is not at
// the end of input and the character at `pos` is whitespace.
// NOTE(review): the loop body (presumably advancing `pos`) and the return
// value are not visible in this extract.
365 bool wxSimpleHtmlParser::EatWhitespace(int& pos
)
367 while (!Eof(pos
) && IsWhitespace(GetChar(pos
)))
// Reads a double-quoted string into `str`. Visible steps: require '"' at
// `pos`, append characters until the next '"' or the end of input, test for
// the closing '"', then decode entity references with DecodeSpecialChars().
// Escaped quotes are not handled (see the TODO below).
// NOTE(review): initialisation and advancing of `pos`, the use of `eatIt`
// and the return values are not visible in this extract.
372 bool wxSimpleHtmlParser::ReadString(wxString
& str
, bool eatIt
)
375 if (GetChar(pos
) == (int) '"')
378 while (!Eof(pos
) && GetChar(pos
) != (int) '"')
380 // TODO: how are quotes escaped in HTML?
381 str
+= (wxChar
) GetChar(pos
);
384 if (GetChar(pos
) == (int) '"')
388 DecodeSpecialChars(str
);
// Reads a word into `str`. Visible steps: test that the character at `pos` is
// alphabetic, append it, keep appending while IsWordChar() holds and input
// remains, then decode entity references with DecodeSpecialChars().
// NOTE(review): initialisation and advancing of `pos`, the use of `eatIt`
// and the return values are not visible in this extract.
395 bool wxSimpleHtmlParser::ReadWord(wxString
& str
, bool eatIt
)
399 if (!IsAlpha(GetChar(pos
)))
402 str
+= (wxChar
) GetChar(pos
) ;
405 while (!Eof(pos
) && IsWordChar(GetChar(pos
)))
407 str
+= (wxChar
) GetChar(pos
);
412 DecodeSpecialChars(str
);
// Reads a number into `str`. Visible steps: test that the character at `pos`
// satisfies IsNumeric(), append it, keep appending while IsNumeric() holds and
// input remains, then run DecodeSpecialChars() on the result.
// NOTE(review): initialisation and advancing of `pos`, the use of `eatIt`
// and the return values are not visible in this extract.
416 bool wxSimpleHtmlParser::ReadNumber(wxString
& str
, bool eatIt
)
420 if (!IsNumeric(GetChar(pos
)))
423 str
+= (wxChar
) GetChar(pos
) ;
426 while (!Eof(pos
) && IsNumeric(GetChar(pos
)))
428 str
+= (wxChar
) GetChar(pos
);
433 DecodeSpecialChars(str
);
437 // Could be number, string, whatever, but read up until whitespace or end of tag (but not a quoted string)
// Appends characters to `str` until whitespace, the tag end bracket, '=' or
// the end of input is reached, then decodes entity references with
// DecodeSpecialChars().
// NOTE(review): initialisation and advancing of `pos`, the use of `eatIt`
// and the return value are not visible in this extract.
438 bool wxSimpleHtmlParser::ReadLiteral(wxString
& str
, bool eatIt
)
442 while (!Eof(pos
) && !IsWhitespace(GetChar(pos
)) && !IsTagEndBracket(GetChar(pos
)) && GetChar(pos
) != wxT('='))
444 str
+= (wxChar
)GetChar(pos
);
449 DecodeSpecialChars(str
);
453 bool wxSimpleHtmlParser::IsComment()
455 return Matches(wxT("<!--"));
458 bool wxSimpleHtmlParser::IsDirective()
460 return Matches(wxT("<!"));
463 bool wxSimpleHtmlParser::IsXMLDeclaration()
465 return Matches(wxT("<?xml"));
468 bool wxSimpleHtmlParser::IsString()
470 return (GetChar(m_pos
) == (int) '"') ;
473 bool wxSimpleHtmlParser::IsWord()
475 return (IsAlpha(GetChar(m_pos
)));
478 bool wxSimpleHtmlParser::IsTagClose()
480 return Matches(wxT("</"));
483 bool wxSimpleHtmlParser::IsTagStartBracket(int ch
)
485 return (ch
== wxT('<'));
488 bool wxSimpleHtmlParser::IsTagEndBracket(int ch
)
490 return (ch
== wxT('>'));
493 bool wxSimpleHtmlParser::IsWhitespace(int ch
)
495 return ((ch
== 13) || (ch
== 10) || (ch
== 32) || (ch
== (int) '\t')) ;
498 bool wxSimpleHtmlParser::IsAlpha(int ch
)
500 return (wxIsalpha((wxChar
) ch
) != 0);
503 bool wxSimpleHtmlParser::IsWordChar(int ch
)
505 return (wxIsalpha((wxChar
) ch
) != 0 || ch
== wxT('-') || ch
== wxT('_') || IsNumeric(ch
));
508 bool wxSimpleHtmlParser::IsNumeric(int ch
)
510 return (wxIsdigit((wxChar
) ch
) != 0 || ch
== wxT('-') || ch
== wxT('.')) ;
// Decides whether a tag with the given name requires a matching close tag.
// The only name tested in the visible code is "P", compared case-insensitively.
// NOTE(review): the values returned for "P" and for every other name are not
// visible in this extract - presumably "P" yields false and the default is
// true; confirm in the full file.
513 bool wxSimpleHtmlParser::IsCloseTagNeeded(const wxString
&name
)
515 if (name
.IsSameAs(wxT("P"), false)) // e.g <P>
518 // TODO: add more items here.
523 // Encode/Decode Special Characters.
524 // See here for the used table: http://msdn.microsoft.com/library/default.asp?url=/library/en-us/xmlsql/ac_xml1_1nqk.asp
525 /* static */ void wxSimpleHtmlParser::DecodeSpecialChars(wxString
&value
)
528 value
.Replace(wxT(">"), wxT(">"), true);
529 value
.Replace(wxT("<"), wxT("<"), true);
530 value
.Replace(wxT("""), wxT("\""), true);
531 value
.Replace(wxT("'"), wxT("'"), true);
532 value
.Replace(wxT("&"), wxT("&"), true); // Note: do this as last to prevent replace problems.
535 /* static */ wxString
wxSimpleHtmlParser::EncodeSpecialChars(const wxString
&value
)
537 wxString newvalue
= value
;
540 newvalue
.Replace(wxT("&"), wxT("&"), true); // Note: do this as first to prevent replace problems.
541 newvalue
.Replace(wxT(">"), wxT(">"), true);
542 newvalue
.Replace(wxT("<"), wxT("<"), true);
543 newvalue
.Replace(wxT("\""),wxT("""), true);
544 newvalue
.Replace(wxT("'"), wxT("'"), true);
549 // Matches this string (case insensitive)
550 bool wxSimpleHtmlParser::Matches(const wxString
& tok
, bool eatIt
)
552 wxString
text(m_text
.Mid(m_pos
, tok
.Length()));
553 bool success
= (text
.CmpNoCase(tok
) == 0) ;
554 if (success
&& eatIt
)
556 m_pos
+= tok
.Length();
561 // Safe way of getting a character
// Bounds-checked character access: the visible check compares index `i`
// against m_length before the text is read.
// NOTE(review): the value returned for an out-of-range index and the in-range
// read itself are not visible in this extract - confirm in the full file.
562 int wxSimpleHtmlParser::GetChar(size_t i
) const
564 if (i
>= (size_t) m_length
)
// Resets the parser; the visible line empties the stored text (m_text).
// NOTE(review): resetting of the other members (tag tree, position, length) is
// not visible in this extract.
569 void wxSimpleHtmlParser::Clear()
574 m_text
= wxEmptyString
;
// Writes the parsed document to `stream` by delegating to the root tag's
// Write().
// NOTE(review): whether m_topLevel is checked for NULL first is not visible in
// this extract.
580 void wxSimpleHtmlParser::Write(wxOutputStream
& stream
)
583 m_topLevel
->Write(stream
);
// Opens a wxFileOutputStream on `filename` for writing the document.
// NOTE(review): the stream validity check, the call that writes the document
// and the return value are not visible in this extract.
586 bool wxSimpleHtmlParser::WriteFile(wxString
& filename
)
588 wxFileOutputStream
fstream(filename
);
600 * Representation of a tag or chunk of text
// Constructs a tag from a name and a type code (callers in this file pass
// wxSimpleHtmlTag_* values as the type).
// NOTE(review): the body is not visible in this extract; confirm member
// initialisation in the full file.
603 wxSimpleHtmlTag::wxSimpleHtmlTag(const wxString
& tagName
, int tagType
)
// Destructor.
// NOTE(review): the body is not visible in this extract; presumably it clears
// attributes and children - confirm in the full file.
613 wxSimpleHtmlTag::~wxSimpleHtmlTag()
// Walks the singly linked attribute list starting at m_attributes, saving each
// node's m_next before the node is processed.
// NOTE(review): the deletion of each node and the reset of m_attributes are
// not visible in this extract.
620 void wxSimpleHtmlTag::ClearAttributes()
624 wxSimpleHtmlAttribute
* attr
= m_attributes
;
627 wxSimpleHtmlAttribute
* next
= attr
->m_next
;
// Searches the attribute list, starting at m_attributes, for an attribute
// whose name equals `name` ignoring case.
// NOTE(review): the loop advance and the return statements are not visible in
// this extract; HasAttribute() in this file relies on NULL meaning "not found".
637 wxSimpleHtmlAttribute
* wxSimpleHtmlTag::FindAttribute(const wxString
& name
) const
639 wxSimpleHtmlAttribute
* attr
= m_attributes
;
642 if (attr
->GetName().CmpNoCase(name
) == 0)
// Allocates a new wxSimpleHtmlAttribute from `name` and `value` and adds it to
// this tag's attribute list; the visible code starts a walk from m_attributes
// to locate the last node.
// NOTE(review): the empty-list case and the actual linking are not visible in
// this extract.
651 void wxSimpleHtmlTag::AppendAttribute(const wxString
& name
, const wxString
& value
)
653 wxSimpleHtmlAttribute
* attr
= new wxSimpleHtmlAttribute(name
, value
);
657 wxSimpleHtmlAttribute
* last
= m_attributes
;
// Walks the child list starting at m_children; for each child the next pointer
// is saved and the child's m_next is set to NULL before further processing.
// NOTE(review): the deletion of each child and the reset of m_children are not
// visible in this extract.
667 void wxSimpleHtmlTag::ClearChildren()
671 wxSimpleHtmlTag
* child
= m_children
;
674 wxSimpleHtmlTag
* next
= child
->m_next
;
676 child
->m_next
= NULL
;
// Walks the child list from m_children while tracking the previous node; the
// visible code saves each child's m_next and sets a child's m_next to NULL.
// NOTE(review): the comparison against `remove`, the relinking of the list and
// whether the removed child is deleted are not visible in this extract.
684 void wxSimpleHtmlTag::RemoveChild(wxSimpleHtmlTag
*remove
)
688 wxSimpleHtmlTag
* child
= m_children
;
689 wxSimpleHtmlTag
* prev
= NULL
;
692 wxSimpleHtmlTag
* next
= child
->m_next
;
696 child
->m_next
= NULL
;
// Adds `tag` to this tag's child list (the visible code starts a walk from
// m_children to find the last child) and sets tag->m_parent to this tag.
// NOTE(review): NULL handling for `tag`, the empty-list case and the actual
// linking are not visible in this extract.
712 void wxSimpleHtmlTag::AppendTag(wxSimpleHtmlTag
* tag
)
720 wxSimpleHtmlTag
* last
= m_children
;
731 tag
->m_parent
= this;
// Inserts `tag` as the sibling following this tag: `tag` receives this tag's
// parent and this tag's current m_next.
// NOTE(review): NULL handling for `tag` and the update of this->m_next are not
// visible in this extract.
734 void wxSimpleHtmlTag::AppendTagAfterUs(wxSimpleHtmlTag
* tag
)
739 tag
->m_parent
= m_parent
;
740 tag
->m_next
= m_next
;
744 // Gets the text from this tag and its descendants
// Recursive text extraction. Visible behaviour: starting at m_children, the
// result of GetTagText() on child tags is concatenated into `text`; there is a
// separate branch for tags of type wxSimpleHtmlTag_Text; the fall-through
// result is an empty string.
// NOTE(review): the condition selecting the child-walking branch and what the
// Text branch returns are not visible in this extract.
745 wxString
wxSimpleHtmlTag::GetTagText()
750 wxSimpleHtmlTag
* tag
= m_children
;
753 text
+= tag
->GetTagText();
758 else if (GetType() == wxSimpleHtmlTag_Text
)
761 return wxEmptyString
;
// Counts this tag's attributes by walking the list that starts at m_attributes.
// NOTE(review): the counting loop and return statement are not visible in this
// extract.
764 int wxSimpleHtmlTag::GetAttributeCount() const
767 wxSimpleHtmlAttribute
* attr
= m_attributes
;
// Returns the attribute at index `i`, found by walking the list that starts at
// m_attributes.
// NOTE(review): the walk and the result for an out-of-range index are not
// visible in this extract.
776 wxSimpleHtmlAttribute
* wxSimpleHtmlTag::GetAttribute(int i
) const
779 wxSimpleHtmlAttribute
* attr
= m_attributes
;
// Counts this tag's children by walking the list that starts at m_children.
// NOTE(review): the counting loop and return statement are not visible in this
// extract.
790 int wxSimpleHtmlTag::GetChildCount() const
793 wxSimpleHtmlTag
* tag
= m_children
;
802 bool wxSimpleHtmlTag::HasAttribute(const wxString
& name
, const wxString
& value
) const
804 wxSimpleHtmlAttribute
* attr
= FindAttribute(name
);
806 return (attr
&& (attr
->GetValue().CmpNoCase(value
) == 0)) ;
809 bool wxSimpleHtmlTag::HasAttribute(const wxString
& name
) const
811 return FindAttribute(name
) != NULL
;
// Looks up the attribute `attrName` and copies its value into `value`.
// NOTE(review): the NULL check on the lookup result and the return values are
// not visible in this extract - presumably true when found, false otherwise;
// confirm in the full file.
814 bool wxSimpleHtmlTag::GetAttributeValue(wxString
& value
, const wxString
& attrName
)
816 wxSimpleHtmlAttribute
* attr
= FindAttribute(attrName
);
819 value
= attr
->GetValue();
826 // Search forward from this tag until we find a tag with this name & attribute
// Searches the siblings that follow this tag (starting at m_next). A tag
// matches when NameIs(tagName) holds and either `attrName` is empty or the tag
// has an attribute with that name.
// NOTE(review): the loop advance and the return statements are not visible in
// this extract.
827 wxSimpleHtmlTag
* wxSimpleHtmlTag::FindTag(const wxString
& tagName
, const wxString
& attrName
)
829 wxSimpleHtmlTag
* tag
= m_next
;
832 if (tag
->NameIs(tagName
) && (attrName
.IsEmpty() || tag
->FindAttribute(attrName
)))
// Starting with this tag, examines tags in turn: a tag of type
// wxSimpleHtmlTag_Close named `tagName` is the stop condition, and the text of
// every wxSimpleHtmlTag_Text tag seen is appended to `text`.
// NOTE(review): how the walk advances and the return values are not visible in
// this extract.
840 bool wxSimpleHtmlTag::FindTextUntilTagClose(wxString
& text
, const wxString
& tagName
)
842 wxSimpleHtmlTag
* tag
= this;
845 if (tag
->GetType() == wxSimpleHtmlTag_Close
&& tag
->NameIs(tagName
))
848 if (tag
->GetType() == wxSimpleHtmlTag_Text
)
849 text
+= tag
->GetText();
// Returns the child at index `i`, found by walking the list that starts at
// m_children.
// NOTE(review): the walk and the result for an out-of-range index are not
// visible in this extract.
857 wxSimpleHtmlTag
* wxSimpleHtmlTag::GetChild(int i
) const
860 wxSimpleHtmlTag
* tag
= m_children
;
// Serialises this tag to `stream`, switching on the tag type:
//   - Text:           the entity-encoded m_text
//   - Open:           indentation, "<" + encoded name, attributes, then ">\n"
//   - Directive:      "<!" + encoded name + " ", attributes, then ">\n"
//   - XMLDeclaration: "<?" + encoded name + " ", attributes, then ">\n\n"
//   - Close:          indentation (when sbUseTab is set), "</" + encoded name + ">\n"
// Layout state (sbUseTab, snTabLevel) lives in function-local statics, so it
// is shared by every call to this function on every tag.
// NOTE(review): the attribute-writing statements, the tab output, where
// snTabLevel is incremented and the recursion over m_children are not visible
// in this extract.
872 void wxSimpleHtmlTag::Write(wxOutputStream
& stream
)
874 // Some helpers to layout the open and close tags.
875 static bool sbUseTab
= true;
876 static size_t snTabLevel
= 0;
878 #if 0 // Enable if no tabs should be used to align the tags.
882 // Handle the different types of tags we can write.
885 case wxSimpleHtmlTag_Text
:
887 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_text
);
890 case wxSimpleHtmlTag_Open
:
893 for(tab
= 0; tab
< snTabLevel
; tab
++)
895 stream
<< wxT("<") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
896 if (GetAttributeCount() > 0)
899 for (i
= 0; i
< GetAttributeCount(); i
++)
901 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
903 if (i
< GetAttributeCount() - 1)
908 sbUseTab
= false; // We're putting the open and close tag on the same line,
909 // so we don't want any tabs
915 stream
<< wxT(">\n");
920 case wxSimpleHtmlTag_Directive
:
922 stream
<< wxT("<!") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(" ");
924 for (i
= 0; i
< GetAttributeCount(); i
++)
926 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
928 if (i
< GetAttributeCount() - 1)
931 stream
<< wxT(">\n");
934 case wxSimpleHtmlTag_XMLDeclaration
:
936 stream
<< wxT("<?") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(" ");
938 for (i
= 0; i
< GetAttributeCount(); i
++)
940 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
942 if (i
< GetAttributeCount() - 1)
945 stream
<< wxT(">\n\n");
948 case wxSimpleHtmlTag_Close
:
950 if (snTabLevel
) // Safety to prevent going around...
951 snTabLevel
--; // Reduce the tab level
952 if (sbUseTab
) // Do we write the open tag and close tag on another line?
955 for(tab
= 0; tab
< snTabLevel
; tab
++)
958 stream
<< wxT("</") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(">\n");
967 wxSimpleHtmlTag
* tag
= m_children
;
976 void wxSimpleHtmlAttribute::Write(wxOutputStream
& stream
)
978 if (m_value
.IsEmpty())
979 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
982 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
983 stream
<< wxT("=\"");
984 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_value
);