1 /////////////////////////////////////////////////////////////////////////////
2 // Name: htmlparser.cpp
3 // Purpose: Simple HTML parser
4 // Author: Julian Smart
8 // Copyright: (c) Julian Smart
9 // Licence: wxWindows license
10 /////////////////////////////////////////////////////////////////////////////
12 // ----------------------------------------------------------------------------
14 // ----------------------------------------------------------------------------
15 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
16 #pragma implementation "htmlparser.h"
25 #include "wx/textfile.h"
26 #include "wx/wfstream.h"
27 #include "wx/txtstrm.h"
31 #include "htmlparser.h"
33 /// Useful insertion operators for wxOutputStream.
34 static wxOutputStream
& operator <<(wxOutputStream
& stream
, const wxString
& s
)
36 wxTextOutputStream
txt(stream
); // This is to make sure the line-ending is native!
42 #if 0 // Gives warning because not used...
43 static wxOutputStream
& operator <<(wxOutputStream
& stream
, long l
)
50 static wxOutputStream
& operator <<(wxOutputStream
& stream
, const char c
)
59 * wxSimpleHtmlAttribute
60 * Representation of an attribute
63 wxSimpleHtmlParser::wxSimpleHtmlParser()
70 wxSimpleHtmlParser::~wxSimpleHtmlParser()
75 bool wxSimpleHtmlParser::ParseFile(const wxString
& filename
)
79 if (textFile
.Open(filename
))
84 int count
= textFile
.GetLineCount();
85 for (i
= 0; i
< count
; i
++)
88 line
= textFile
.GetFirstLine();
90 line
= textFile
.GetNextLine();
98 for ( line
= textFile
.GetFirstLine(); !textFile
.Eof(); line
= textFile
.GetNextLine() )
106 return ParseString(text
);
112 bool wxSimpleHtmlParser::ParseString(const wxString
& str
)
118 m_length
= str
.Length();
120 m_topLevel
= new wxSimpleHtmlTag(wxT("TOPLEVEL"), wxSimpleHtmlTag_TopLevel
);
122 bool bResult
= ParseHtml(m_topLevel
);
124 wxASSERT(bResult
); // Failed to parse the TAGs.
125 // Hint: Check if every open tag has a close tag!
130 // Main recursive parsing function
131 bool wxSimpleHtmlParser::ParseHtml(wxSimpleHtmlTag
* parent
)
143 else if (IsDirective())
145 wxSimpleHtmlTag
* tag
= ParseDirective();
147 parent
->AppendTag(tag
);
149 else if (IsXMLDeclaration())
151 wxSimpleHtmlTag
* tag
= ParseXMLDeclaration();
153 parent
->AppendTag(tag
);
155 else if (IsTagClose())
157 wxSimpleHtmlTag
* tag
= ParseTagClose();
160 if (IsCloseTagNeeded(tag
->GetName()))
162 if (!parent
->GetParent())
164 parent
->GetParent()->AppendTag(tag
);
168 parent
->AppendTag(tag
);
171 else if (IsTagStartBracket(GetChar(m_pos
)))
173 wxSimpleHtmlTag
* tag
= ParseTagHeader();
175 parent
->AppendTag(tag
);
177 if (IsCloseTagNeeded(tag
->GetName()))
180 return false; // Something didn't go ok, so don't continue.
185 // Just a text string
189 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(wxT("TEXT"), wxSimpleHtmlTag_Text
);
191 if(parent
->GetParent())
192 parent
->GetParent()->AppendTag(tag
);
194 parent
->AppendTag(tag
); // When this occurs it is probably the
195 // empty lines at the end of the file...
201 // Plain text, up until an angled bracket
202 bool wxSimpleHtmlParser::ParseText(wxString
& text
)
204 while (!Eof() && GetChar(m_pos
) != wxT('<'))
206 text
+= GetChar(m_pos
);
209 DecodeSpecialChars(text
);
213 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseTagHeader()
215 if (IsTagStartBracket(GetChar(m_pos
)))
221 ReadWord(word
, true);
225 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Open
);
227 ParseAttributes(tag
);
231 if (IsTagEndBracket(GetChar(m_pos
)))
240 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseTagClose()
242 Matches(wxT("</"), true);
247 ReadWord(word
, true);
252 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Close
);
256 bool wxSimpleHtmlParser::ParseAttributes(wxSimpleHtmlTag
* tag
)
258 // Parse attributes of a tag header until we reach >
259 while (!IsTagEndBracket(GetChar(m_pos
)) && !Eof())
263 wxString attrName
, attrValue
;
267 ReadString(attrName
, true);
268 tag
->AppendAttribute(attrName
, wxEmptyString
);
270 else if (IsNumeric(GetChar(m_pos
)))
272 ReadNumber(attrName
, true);
273 tag
->AppendAttribute(attrName
, wxEmptyString
);
277 // Try to read an attribute name/value pair, or at least a name
279 ReadLiteral(attrName
, true);
282 if (GetChar(m_pos
) == wxT('='))
288 ReadString(attrValue
, true);
289 else if (!Eof() && !IsTagEndBracket(GetChar(m_pos
)))
290 ReadLiteral(attrValue
, true);
292 if (!attrName
.IsEmpty())
293 tag
->AppendAttribute(attrName
, attrValue
);
299 // e.g. <!DOCTYPE ....>
300 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseDirective()
302 Matches(wxT("<!"), true);
307 ReadWord(word
, true);
311 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_Directive
);
313 ParseAttributes(tag
);
317 if (IsTagEndBracket(GetChar(m_pos
)))
323 // e.g. <?xml .... ?>
324 wxSimpleHtmlTag
* wxSimpleHtmlParser::ParseXMLDeclaration()
326 Matches(wxT("<?"), true);
331 ReadWord(word
, true);
335 wxSimpleHtmlTag
* tag
= new wxSimpleHtmlTag(word
, wxSimpleHtmlTag_XMLDeclaration
);
337 ParseAttributes(tag
);
341 if (IsTagEndBracket(GetChar(m_pos
)))
347 bool wxSimpleHtmlParser::ParseComment()
349 // Eat the comment tag start
350 Matches(wxT("<!--"), true);
352 while (!Eof() && !Matches(wxT("-->"), true))
360 bool wxSimpleHtmlParser::EatWhitespace()
362 while (!Eof() && IsWhitespace(GetChar(m_pos
)))
367 bool wxSimpleHtmlParser::EatWhitespace(int& pos
)
369 while (!Eof(pos
) && IsWhitespace(GetChar(pos
)))
374 bool wxSimpleHtmlParser::ReadString(wxString
& str
, bool eatIt
)
377 if (GetChar(pos
) == (int) '"')
380 while (!Eof(pos
) && GetChar(pos
) != (int) '"')
382 // TODO: how are quotes escaped in HTML?
383 str
+= (wxChar
) GetChar(pos
);
386 if (GetChar(pos
) == (int) '"')
390 DecodeSpecialChars(str
);
397 bool wxSimpleHtmlParser::ReadWord(wxString
& str
, bool eatIt
)
401 if (!IsAlpha(GetChar(pos
)))
404 str
+= (wxChar
) GetChar(pos
) ;
407 while (!Eof(pos
) && IsWordChar(GetChar(pos
)))
409 str
+= (wxChar
) GetChar(pos
);
414 DecodeSpecialChars(str
);
418 bool wxSimpleHtmlParser::ReadNumber(wxString
& str
, bool eatIt
)
422 if (!IsNumeric(GetChar(pos
)))
425 str
+= (wxChar
) GetChar(pos
) ;
428 while (!Eof(pos
) && IsNumeric(GetChar(pos
)))
430 str
+= (wxChar
) GetChar(pos
);
435 DecodeSpecialChars(str
);
439 // Could be number, string, whatever, but read up until whitespace or end of tag (but not a quoted string)
440 bool wxSimpleHtmlParser::ReadLiteral(wxString
& str
, bool eatIt
)
444 while (!Eof(pos
) && !IsWhitespace(GetChar(pos
)) && !IsTagEndBracket(GetChar(pos
)) && GetChar(pos
) != wxT('='))
451 DecodeSpecialChars(str
);
455 bool wxSimpleHtmlParser::IsComment()
457 return Matches(wxT("<!--"));
460 bool wxSimpleHtmlParser::IsDirective()
462 return Matches(wxT("<!"));
465 bool wxSimpleHtmlParser::IsXMLDeclaration()
467 return Matches(wxT("<?xml"));
470 bool wxSimpleHtmlParser::IsString()
472 return (GetChar(m_pos
) == (int) '"') ;
475 bool wxSimpleHtmlParser::IsWord()
477 return (IsAlpha(GetChar(m_pos
)));
480 bool wxSimpleHtmlParser::IsTagClose()
482 return Matches(wxT("</"));
485 bool wxSimpleHtmlParser::IsTagStartBracket(int ch
)
487 return (ch
== wxT('<'));
490 bool wxSimpleHtmlParser::IsTagEndBracket(int ch
)
492 return (ch
== wxT('>'));
495 bool wxSimpleHtmlParser::IsWhitespace(int ch
)
497 return ((ch
== 13) || (ch
== 10) || (ch
== 32) || (ch
== (int) '\t')) ;
500 bool wxSimpleHtmlParser::IsAlpha(int ch
)
502 return (wxIsalpha((wxChar
) ch
) != 0);
505 bool wxSimpleHtmlParser::IsWordChar(int ch
)
507 return (wxIsalpha((wxChar
) ch
) != 0 || ch
== wxT('-') || ch
== wxT('_') || IsNumeric(ch
));
510 bool wxSimpleHtmlParser::IsNumeric(int ch
)
512 return (wxIsdigit((wxChar
) ch
) != 0 || ch
== wxT('-') || ch
== wxT('.')) ;
515 bool wxSimpleHtmlParser::IsCloseTagNeeded(const wxString
&name
)
517 if (name
.IsSameAs(wxT("P"), false)) // e.g <P>
520 // ToDo add more items here.
525 // Encode/Decode Special Characters.
526 // See here for the used table: http://msdn.microsoft.com/library/default.asp?url=/library/en-us/xmlsql/ac_xml1_1nqk.asp
527 /* static */ void wxSimpleHtmlParser::DecodeSpecialChars(wxString
&value
)
530 value
.Replace(wxT(">"), wxT(">"), true);
531 value
.Replace(wxT("<"), wxT("<"), true);
532 value
.Replace(wxT("""), wxT("\""), true);
533 value
.Replace(wxT("'"), wxT("'"), true);
534 value
.Replace(wxT("&"), wxT("&"), true); // Note: do this as last to prevent replace problems.
537 /* static */ wxString
wxSimpleHtmlParser::EncodeSpecialChars(const wxString
&value
)
539 wxString newvalue
= value
;
542 newvalue
.Replace(wxT("&"), wxT("&"), true); // Note: do this as first to prevent replace problems.
543 newvalue
.Replace(wxT(">"), wxT(">"), true);
544 newvalue
.Replace(wxT("<"), wxT("<"), true);
545 newvalue
.Replace(wxT("\""),wxT("""), true);
546 newvalue
.Replace(wxT("'"), wxT("'"), true);
551 // Matches this string (case insensitive)
552 bool wxSimpleHtmlParser::Matches(const wxString
& tok
, bool eatIt
)
554 wxString
text(m_text
.Mid(m_pos
, tok
.Length()));
555 bool success
= (text
.CmpNoCase(tok
) == 0) ;
556 if (success
&& eatIt
)
558 m_pos
+= tok
.Length();
563 // Safe way of getting a character
564 int wxSimpleHtmlParser::GetChar(size_t i
) const
566 if (i
>= (size_t) m_length
)
571 void wxSimpleHtmlParser::Clear()
576 m_text
= wxEmptyString
;
582 void wxSimpleHtmlParser::Write(wxOutputStream
& stream
)
585 m_topLevel
->Write(stream
);
588 bool wxSimpleHtmlParser::WriteFile(wxString
& filename
)
590 wxFileOutputStream
fstream(filename
);
602 * Representation of a tag or chunk of text
605 wxSimpleHtmlTag::wxSimpleHtmlTag(const wxString
& tagName
, int tagType
)
615 wxSimpleHtmlTag::~wxSimpleHtmlTag()
622 void wxSimpleHtmlTag::ClearAttributes()
626 wxSimpleHtmlAttribute
* attr
= m_attributes
;
629 wxSimpleHtmlAttribute
* next
= attr
->m_next
;
639 wxSimpleHtmlAttribute
* wxSimpleHtmlTag::FindAttribute(const wxString
& name
) const
641 wxSimpleHtmlAttribute
* attr
= m_attributes
;
644 if (attr
->GetName().CmpNoCase(name
) == 0)
653 void wxSimpleHtmlTag::AppendAttribute(const wxString
& name
, const wxString
& value
)
655 wxSimpleHtmlAttribute
* attr
= new wxSimpleHtmlAttribute(name
, value
);
659 wxSimpleHtmlAttribute
* last
= m_attributes
;
669 void wxSimpleHtmlTag::ClearChildren()
673 wxSimpleHtmlTag
* child
= m_children
;
676 wxSimpleHtmlTag
* next
= child
->m_next
;
678 child
->m_next
= NULL
;
686 void wxSimpleHtmlTag::RemoveChild(wxSimpleHtmlTag
*remove
)
690 wxSimpleHtmlTag
* child
= m_children
;
691 wxSimpleHtmlTag
* prev
= NULL
;
694 wxSimpleHtmlTag
* next
= child
->m_next
;
698 child
->m_next
= NULL
;
714 void wxSimpleHtmlTag::AppendTag(wxSimpleHtmlTag
* tag
)
722 wxSimpleHtmlTag
* last
= m_children
;
733 tag
->m_parent
= this;
736 void wxSimpleHtmlTag::AppendTagAfterUs(wxSimpleHtmlTag
* tag
)
741 tag
->m_parent
= m_parent
;
742 tag
->m_next
= m_next
;
746 // Gets the text from this tag and its descendants
747 wxString
wxSimpleHtmlTag::GetTagText()
752 wxSimpleHtmlTag
* tag
= m_children
;
755 text
+= tag
->GetTagText();
760 else if (GetType() == wxSimpleHtmlTag_Text
)
763 return wxEmptyString
;
766 int wxSimpleHtmlTag::GetAttributeCount() const
769 wxSimpleHtmlAttribute
* attr
= m_attributes
;
778 wxSimpleHtmlAttribute
* wxSimpleHtmlTag::GetAttribute(int i
) const
781 wxSimpleHtmlAttribute
* attr
= m_attributes
;
792 int wxSimpleHtmlTag::GetChildCount() const
795 wxSimpleHtmlTag
* tag
= m_children
;
804 bool wxSimpleHtmlTag::HasAttribute(const wxString
& name
, const wxString
& value
) const
806 wxSimpleHtmlAttribute
* attr
= FindAttribute(name
);
808 return (attr
&& (attr
->GetValue().CmpNoCase(value
) == 0)) ;
811 bool wxSimpleHtmlTag::HasAttribute(const wxString
& name
) const
813 return FindAttribute(name
) != NULL
;
816 bool wxSimpleHtmlTag::GetAttributeValue(wxString
& value
, const wxString
& attrName
)
818 wxSimpleHtmlAttribute
* attr
= FindAttribute(attrName
);
821 value
= attr
->GetValue();
828 // Search forward from this tag until we find a tag with this name & attribute
829 wxSimpleHtmlTag
* wxSimpleHtmlTag::FindTag(const wxString
& tagName
, const wxString
& attrName
)
831 wxSimpleHtmlTag
* tag
= m_next
;
834 if (tag
->NameIs(tagName
) && (attrName
.IsEmpty() || tag
->FindAttribute(attrName
)))
842 bool wxSimpleHtmlTag::FindTextUntilTagClose(wxString
& text
, const wxString
& tagName
)
844 wxSimpleHtmlTag
* tag
= this;
847 if (tag
->GetType() == wxSimpleHtmlTag_Close
&& tag
->NameIs(tagName
))
850 if (tag
->GetType() == wxSimpleHtmlTag_Text
)
851 text
+= tag
->GetText();
859 wxSimpleHtmlTag
* wxSimpleHtmlTag::GetChild(int i
) const
862 wxSimpleHtmlTag
* tag
= m_children
;
874 void wxSimpleHtmlTag::Write(wxOutputStream
& stream
)
876 // Some helpers to layout the open and close tags.
877 static bool sbUseTab
= true;
878 static size_t snTabLevel
= 0;
880 #if 0 // Enable if no tabs should be used to align the tags.
884 // Handle the different types of tags we can write.
887 case wxSimpleHtmlTag_Text
:
889 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_text
);
892 case wxSimpleHtmlTag_Open
:
895 for(tab
= 0; tab
< snTabLevel
; tab
++)
897 stream
<< wxT("<") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
898 if (GetAttributeCount() > 0)
901 for (i
= 0; i
< GetAttributeCount(); i
++)
903 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
905 if (i
< GetAttributeCount() - 1)
910 sbUseTab
= false; // We're putting the open and close tag on the same line,
911 // so we don't want any tabs
917 stream
<< wxT(">\n");
922 case wxSimpleHtmlTag_Directive
:
924 stream
<< wxT("<!") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(" ");
926 for (i
= 0; i
< GetAttributeCount(); i
++)
928 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
930 if (i
< GetAttributeCount() - 1)
933 stream
<< wxT(">\n");
936 case wxSimpleHtmlTag_XMLDeclaration
:
938 stream
<< wxT("<?") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(" ");
940 for (i
= 0; i
< GetAttributeCount(); i
++)
942 wxSimpleHtmlAttribute
* attr
= GetAttribute(i
);
944 if (i
< GetAttributeCount() - 1)
947 stream
<< wxT(">\n\n");
950 case wxSimpleHtmlTag_Close
:
952 if (snTabLevel
) // Safety to prevent going around...
953 snTabLevel
--; // Reduce the tab level
954 if (sbUseTab
) // Do we write the open tag and close tag on another line?
957 for(tab
= 0; tab
< snTabLevel
; tab
++)
960 stream
<< wxT("</") << wxSimpleHtmlParser::EncodeSpecialChars(m_name
) << wxT(">\n");
969 wxSimpleHtmlTag
* tag
= m_children
;
978 void wxSimpleHtmlAttribute::Write(wxOutputStream
& stream
)
980 if (m_value
.IsEmpty())
981 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
984 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_name
);
985 stream
<< wxT("=\"");
986 stream
<< wxSimpleHtmlParser::EncodeSpecialChars(m_value
);