1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
10 #include "wx/wxprec.h"
16 #if wxUSE_HTML && wxUSE_STREAMS
19 #include "wx/dynarray.h"
23 #include "wx/wxcrtvararg.h"
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/vector.h"
35 #include "wx/msw/wince/missing.h" // for bsearch()
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
// Trace mask name for HTML debugging output; it is the mask passed to
// wxLogTrace() when an unrecognized HTML entity is reported.
41 const wxChar
*wxTRACE_HTML_DEBUG
= wxT("htmldebug");
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
// Builds a text piece from a pair of string iterators; both are copied into
// the m_start and m_end members.
51 wxHtmlTextPiece(const wxString::const_iterator
& start
,
52 const wxString::const_iterator
& end
)
53 : m_start(start
), m_end(end
) {}
// Iterators marking where this piece of text begins and ends.
54 wxString::const_iterator m_start
, m_end
;
// Collection of wxHtmlTextPiece items, implemented on top of wxVector.
57 // NB: this is an empty class and not a typedef because of forward declaration
58 class wxHtmlTextPieces
: public wxVector
<wxHtmlTextPiece
>
// Snapshot of the parser's state. wxHtmlParser::SetSourceAndSaveState() fills
// one in and wxHtmlParser::RestoreState() reads it back.
// NOTE(review): only some of the members are visible in this excerpt.
62 class wxHtmlParserState
// text pieces container that was active when the state was saved
67 wxHtmlTextPieces
*m_textPieces
;
// pointer to the source string that was active when the state was saved
69 const wxString
*m_source
;
// link to the next (older) saved state
70 wxHtmlParserState
*m_nextState
;
73 //-----------------------------------------------------------------------------
75 //-----------------------------------------------------------------------------
77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser
,wxObject
)
// Constructor. Allocates the entities parser with new; the destructor
// deletes it.
// NOTE(review): other member initialization is not visible in this excerpt.
79 wxHtmlParser::wxHtmlParser()
84 m_entitiesParser
= new wxHtmlEntitiesParser
;
// Destructor. Unwinds all saved states, clears m_HandlersStack and
// m_HandlersSet through the WX_CLEAR_* macros, and deletes the entities
// parser.
92 wxHtmlParser::~wxHtmlParser()
// RestoreState() returns false once there is no saved state left
94 while (RestoreState()) {}
97 WX_CLEAR_ARRAY(m_HandlersStack
);
98 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet
, m_HandlersSet
);
99 delete m_entitiesParser
;
// Entry point taking the HTML source; the result object is obtained from
// GetProduct().
// NOTE(review): only the GetProduct() call is visible in this excerpt; the
// surrounding steps and the return statement are not shown.
103 wxObject
* wxHtmlParser::Parse(const wxString
& source
)
107 wxObject
*result
= GetProduct();
// Prepares the parser for the given source.
// NOTE(review): the only visible step is clearing the stop-parsing flag; how
// `source` is consumed is not shown in this excerpt.
112 void wxHtmlParser::InitParser(const wxString
& source
)
115 m_stopParsing
= false;
118 void wxHtmlParser::DoneParser()
// Makes a heap-allocated copy of src and stores the pointer in m_Source.
123 void wxHtmlParser::SetSource(const wxString
& src
)
126 // NB: This is allocated on the heap because wxHtmlTag uses iterators and
127 // making a copy of m_Source string in SetSourceAndSaveState() and
128 // RestoreState() would invalidate them (because wxString::m_impl's
129 // memory would change completely twice and iterators use pointers
130 // into it). So instead, we keep the string object intact and only
131 // store/restore pointer to it, for which we need it to be allocated
// on the heap.
134 m_Source
= new wxString(src
);
// Builds the tag tree for the whole of *m_Source: creates a tags cache for
// the source, allocates a fresh text pieces container and delegates to
// CreateDOMSubTree() with no parent tag (NULL) over begin()..end().
140 void wxHtmlParser::CreateDOMTree()
142 wxHtmlTagsCache
cache(*m_Source
);
143 m_TextPieces
= new wxHtmlTextPieces
;
144 CreateDOMSubTree(NULL
, m_Source
->begin(), m_Source
->end(), &cache
);
// Declared here and defined elsewhere; CreateDOMSubTree() calls it with the
// current tag's name to decide whether the tag's content is CDATA.
148 extern bool wxIsCDATAElement(const wxString
& tag
);
// Builds the part of the tag tree that lies between begin_pos and end_pos.
//
// cur       - parent tag of the tags created here (NULL for top level tags)
// begin_pos - start of the source range to scan
// end_pos   - end of the source range to scan
// cache     - tags cache handed to every wxHtmlTag that is constructed
//
// Runs of text found between tags are appended to m_TextPieces; tags with an
// ending are processed recursively.
// NOTE(review): several lines of this function are missing from this excerpt
// (braces, else branches, the scanning loop header), so the comments below
// describe only the statements that are visible.
150 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag
*cur
,
151 const wxString::const_iterator
& begin_pos
,
152 const wxString::const_iterator
& end_pos
,
153 wxHtmlTagsCache
*cache
)
// guard against an empty or inverted range
155 if (end_pos
<= begin_pos
)
// i is the scanning position, textBeginning the start of the pending text run
159 wxString::const_iterator i
= begin_pos
;
160 wxString::const_iterator textBeginning
= begin_pos
;
162 // If the tag contains CDATA text, we include the text between beginning
163 // and ending tag verbatim. Setting i=end_pos will skip to the very
164 // end of this function where text piece is added, bypassing any child
165 // tags parsing (CDATA element can't have child elements by definition):
166 if (cur
!= NULL
&& wxIsCDATAElement(cur
->GetName()))
177 // add text to m_TextPieces:
178 if (i
> textBeginning
)
179 m_TextPieces
->push_back(wxHtmlTextPiece(textBeginning
, i
));
181 // if it is a comment, skip it:
182 if ( SkipCommentTag(i
, m_Source
->end()) )
184 textBeginning
= i
= i
+ 1; // skip closing '>' too
187 // add another tag to the tree:
188 else if (i
< end_pos
-1 && *(i
+1) != wxT('/'))
// the new tag is created either with cur as its parent ...
192 chd
= new wxHtmlTag(cur
, m_Source
,
193 i
, end_pos
, cache
, m_entitiesParser
);
// ... or with no parent at all
196 chd
= new wxHtmlTag(NULL
, m_Source
,
197 i
, end_pos
, cache
, m_entitiesParser
);
200 // if this is the first tag to be created make the root
201 // m_Tags point to it:
206 // if there is already a root tag add this tag as its last sibling:
208 chd
->m_Prev
= m_Tags
->GetLastSibling();
209 chd
->m_Prev
->m_Next
= chd
;
// a tag with an ending: recurse into its inner range, then continue after it
213 if (chd
->HasEnding())
215 CreateDOMSubTree(chd
,
216 chd
->GetBeginIter(), chd
->GetEndIter1(),
218 i
= chd
->GetEndIter2();
// otherwise continue from the tag's begin iterator
221 i
= chd
->GetBeginIter();
226 // ... or skip ending tag:
229 while (i
< end_pos
&& *i
!= wxT('>')) ++i
;
236 // add remaining text to m_TextPieces:
237 if (end_pos
> textBeginning
)
238 m_TextPieces
->push_back(wxHtmlTextPiece(textBeginning
, end_pos
));
// Tears down the tag tree: walks the tags through GetNextSibling() and
// finally resets both m_Tags and m_CurTag to NULL.
// NOTE(review): the loop header and the statement releasing each tag are not
// visible in this excerpt.
241 void wxHtmlParser::DestroyDOMTree()
247 t2
= t1
->GetNextSibling();
251 m_Tags
= m_CurTag
= NULL
;
// Parses the entire source: forwards to the range overload with the begin()
// and end() iterators of *m_Source.
257 void wxHtmlParser::DoParsing()
261 DoParsing(m_Source
->begin(), m_Source
->end());
// Parses the source between begin_pos_ and end_pos, walking the previously
// built tags (m_CurTag) and text pieces (m_CurTextPiece) in source order.
// Text pieces are entity-decoded and passed to AddText().
// NOTE(review): some lines (braces, else branches, the call that processes
// the tag held in `t`) are missing from this excerpt.
264 void wxHtmlParser::DoParsing(const wxString::const_iterator
& begin_pos_
,
265 const wxString::const_iterator
& end_pos
)
// local copy of the start position that is advanced while parsing
267 wxString::const_iterator
begin_pos(begin_pos_
);
// guard against an empty or inverted range
269 if (end_pos
<= begin_pos
)
272 wxHtmlTextPieces
& pieces
= *m_TextPieces
;
273 size_t piecesCnt
= pieces
.size();
275 while (begin_pos
< end_pos
)
// skip tags that start before the current position
277 while (m_CurTag
&& m_CurTag
->GetBeginIter() < begin_pos
)
278 m_CurTag
= m_CurTag
->GetNextTag();
// same for text pieces that start before the current position
279 while (m_CurTextPiece
< piecesCnt
&&
280 pieces
[m_CurTextPiece
].m_start
< begin_pos
)
// a text piece precedes the current tag: handle the text first
283 if (m_CurTextPiece
< piecesCnt
&&
285 pieces
[m_CurTextPiece
].m_start
< m_CurTag
->GetBeginIter()))
// decode entities in the piece and hand the resulting text to AddText()
288 AddText(GetEntitiesParser()->Parse(
289 wxString(pieces
[m_CurTextPiece
].m_start
,
290 pieces
[m_CurTextPiece
].m_end
)));
291 begin_pos
= pieces
[m_CurTextPiece
].m_end
;
// move the position past the current tag (past its ending when it has one)
296 if (m_CurTag
->HasEnding())
297 begin_pos
= m_CurTag
->GetEndIter2();
299 begin_pos
= m_CurTag
->GetBeginIter();
// remember the current tag in t and advance m_CurTag to the next one
300 wxHtmlTag
*t
= m_CurTag
;
301 m_CurTag
= m_CurTag
->GetNextTag();
// Dispatches a tag to its handler: looks the tag name up in m_HandlersHash
// and, when a handler is registered, calls its HandleTag(). The tag's inner
// range (GetBeginIter()..GetEndIter1()) can then be parsed via DoParsing().
// NOTE(review): the condition guarding the DoParsing() call is not visible in
// this excerpt; presumably it depends on `inner` - confirm.
310 void wxHtmlParser::AddTag(const wxHtmlTag
& tag
)
314 wxHtmlTagHandlersHash::const_iterator h
= m_HandlersHash
.find(tag
.GetName());
315 if (h
!= m_HandlersHash
.end())
317 inner
= h
->second
->HandleTag(tag
);
324 DoParsing(tag
.GetBeginIter(), tag
.GetEndIter1());
// Registers a handler: every tag name listed by handler->GetSupportedTags()
// is mapped to the handler in m_HandlersHash, the handler is added to
// m_HandlersSet and it is told which parser it belongs to.
328 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler
*handler
)
330 wxString
s(handler
->GetSupportedTags());
// the tag list is split using wxT(", ") as the delimiters
331 wxStringTokenizer
tokenizer(s
, wxT(", "));
333 while (tokenizer
.HasMoreTokens())
334 m_HandlersHash
[tokenizer
.GetNextToken()] = handler
;
336 m_HandlersSet
.insert(handler
);
338 handler
->SetParser(this);
// Temporarily installs `handler` for the tags listed in `tags`. A heap copy
// of the current m_HandlersHash is pushed onto m_HandlersStack first, so that
// PopTagHandler() can bring the previous mapping back.
341 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler
*handler
, const wxString
& tags
)
// the tag list is split using wxT(", ") as the delimiters
343 wxStringTokenizer
tokenizer(tags
, wxT(", "));
// save the current handlers table
346 m_HandlersStack
.push_back(new wxHtmlTagHandlersHash(m_HandlersHash
));
348 while (tokenizer
.HasMoreTokens())
350 key
= tokenizer
.GetNextToken();
351 m_HandlersHash
[key
] = handler
;
// Undoes the latest PushTagHandler(): takes the most recently saved handlers
// table off m_HandlersStack and assigns it back to m_HandlersHash. Calling it
// with an empty stack triggers the wxCHECK_RET below.
// NOTE(review): the release of `prev` is not visible in this excerpt.
355 void wxHtmlParser::PopTagHandler()
357 wxCHECK_RET( !m_HandlersStack
.empty(),
358 "attempt to remove HTML tag handler from empty stack" );
360 wxHtmlTagHandlersHash
*prev
= m_HandlersStack
.back();
361 m_HandlersStack
.pop_back();
362 m_HandlersHash
= *prev
;
// Saves the current parsing state in a newly allocated wxHtmlParserState
// (current tag, text pieces, current text piece index, source pointer) and
// links it in front of the m_SavedStates list.
// NOTE(review): the statements that switch the parser over to `src` are not
// visible in this excerpt.
366 void wxHtmlParser::SetSourceAndSaveState(const wxString
& src
)
368 wxHtmlParserState
*s
= new wxHtmlParserState
;
370 s
->m_curTag
= m_CurTag
;
372 s
->m_textPieces
= m_TextPieces
;
373 s
->m_curTextPiece
= m_CurTextPiece
;
374 s
->m_source
= m_Source
;
// the previous head of the list becomes this state's successor
376 s
->m_nextState
= m_SavedStates
;
// Restores the most recently saved parsing state. Returns false when there
// is no saved state; otherwise removes the head of m_SavedStates and copies
// its current tag, text pieces, text piece index and source pointer back
// into the parser.
// NOTE(review): cleanup of the state being replaced and the final return are
// not visible in this excerpt.
388 bool wxHtmlParser::RestoreState()
390 if (!m_SavedStates
) return false;
// unlink the head of the saved states list
395 wxHtmlParserState
*s
= m_SavedStates
;
396 m_SavedStates
= s
->m_nextState
;
398 m_CurTag
= s
->m_curTag
;
400 m_TextPieces
= s
->m_textPieces
;
401 m_CurTextPiece
= s
->m_curTextPiece
;
402 m_Source
= s
->m_source
;
// Returns a copy of the source text delimited by the tag's GetBeginIter()
// and GetEndIter1() iterators.
408 wxString
wxHtmlParser::GetInnerSource(const wxHtmlTag
& tag
)
410 return wxString(tag
.GetBeginIter(), tag
.GetEndIter1());
413 //-----------------------------------------------------------------------------
415 //-----------------------------------------------------------------------------
417 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler
,wxObject
)
// Parses `source` with the owning parser as a nested job: the parser's state
// is saved and switched to `source`, the whole of it is parsed, and the
// saved state is restored afterwards.
419 void wxHtmlTagHandler::ParseInnerSource(const wxString
& source
)
421 // It is safe to temporarily change the source being parsed,
422 // provided we restore the state back after parsing
423 m_Parser
->SetSourceAndSaveState(source
);
424 m_Parser
->DoParsing();
425 m_Parser
->RestoreState();
429 //-----------------------------------------------------------------------------
430 // wxHtmlEntitiesParser
431 //-----------------------------------------------------------------------------
433 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser
,wxObject
)
// Constructor: starts with no converter (m_conv is NULL) and the system
// encoding.
435 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
437 : m_conv(NULL
), m_encoding(wxFONTENCODING_SYSTEM
)
442 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
// Changes the output encoding. The comparison against the current encoding
// comes first (presumably an early exit when nothing changes - the branch
// body is not visible); afterwards m_encoding is updated and a wxCSConv for
// the encoding's name is created.
// NOTE(review): the branch bodies, including what happens for
// wxFONTENCODING_SYSTEM and to a previously allocated m_conv, are not visible
// in this excerpt.
450 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding
)
452 if (encoding
== m_encoding
)
457 m_encoding
= encoding
;
458 if (m_encoding
== wxFONTENCODING_SYSTEM
)
461 m_conv
= new wxCSConv(wxFontMapper::GetEncodingName(m_encoding
));
463 #endif // !wxUSE_UNICODE
// Returns `input` with HTML entities replaced by the characters that
// GetEntityChar() yields for them. Unrecognized entities are copied to the
// output unchanged and reported through wxLogTrace().
// NOTE(review): several lines (declarations of output/entity/ch, the test for
// '&', braces and the return statements) are missing from this excerpt.
465 wxString
wxHtmlEntitiesParser::Parse(const wxString
& input
) const
// c scans the input; last marks the start of text not yet copied to output
469 const wxString::const_iterator
end(input
.end());
470 wxString::const_iterator
c(input
.begin());
471 wxString::const_iterator
last(c
);
473 for ( ; c
< end
; ++c
)
// reserve room for the whole input the first time output is written to
477 if ( output
.empty() )
478 output
.reserve(input
.length());
// flush the plain text accumulated so far
481 output
.append(last
, c
);
// ent_s marks the first character of the entity name
486 const wxString::const_iterator ent_s
= c
;
// advance c over the characters allowed in an entity name
489 for ( ; c
!= end
; ++c
)
492 if ( !((ch
>= wxT('a') && ch
<= wxT('z')) ||
493 (ch
>= wxT('A') && ch
<= wxT('Z')) ||
494 (ch
>= wxT('0') && ch
<= wxT('9')) ||
495 ch
== wxT('_') || ch
== wxT('#')) )
499 entity
.append(ent_s
, c
);
// no terminating ';': step back so the character at c is not swallowed
500 if (c
== end
|| *c
!= wxT(';')) --c
;
502 entity_char
= GetEntityChar(entity
);
504 output
<< entity_char
;
// unknown entity: copy the text from one position before ent_s through c
507 output
.append(ent_s
-1, c
+1);
508 wxLogTrace(wxTRACE_HTML_DEBUG
,
509 "Unrecognized HTML entity: '%s'",
514 if ( last
== input
.begin() ) // common case: no entity
// copy whatever text remains after the last entity
517 output
.append(last
, end
);
// Returns the character for the given numeric code. In the visible branch
// the code is placed into a wide character buffer and converted with m_conv,
// or with wxConvLocal when no converter is set; a WC2MB() result of
// (size_t)-1 is checked for.
// NOTE(review): the buffer declarations, the conditional compilation around
// this code and the return statements are not visible in this excerpt.
522 wxChar
wxHtmlEntitiesParser::GetCharForCode(unsigned code
) const
526 wbuf
[0] = (wchar_t)code
;
528 wxMBConv
*conv
= m_conv
? m_conv
: &wxConvLocal
;
529 if (conv
->WC2MB(buf
, wbuf
, 2) == (size_t)-1)
// One entry of the named entities table used by
// wxHtmlEntitiesParser::GetEntityChar().
// NOTE(review): the table code also uses a `code` member that is not visible
// in this excerpt.
535 struct wxHtmlEntityInfo
// entity name in wx's internal string representation
537 const wxStringCharType
*name
;
// Comparison callback passed to bsearch(): `key` is the entity name being
// looked up and `item` points to a wxHtmlEntityInfo table entry. The name is
// compared with strcmp() in the wxUSE_UNICODE_UTF8 branch and with
// wxStrcmp() in the other one.
541 extern "C" int LINKAGEMODE
wxHtmlEntityCompare(const void *key
, const void *item
)
543 #if wxUSE_UNICODE_UTF8
544 return strcmp((char*)key
, ((wxHtmlEntityInfo
*)item
)->name
);
546 return wxStrcmp((wxChar
*)key
, ((wxHtmlEntityInfo
*)item
)->name
);
// Translates an entity name into a character and returns 0 for an invalid
// entity reference.
//
// Names starting with '#' are numeric references: the number is read with
// wxSscanf(), with a separate case when the second character is 'x' or 'X'.
// Other names are looked up in the static `substitutions` table. The final
// result is produced by GetCharForCode().
// NOTE(review): a number of lines (the empty-name test, the format strings,
// the conditional compilation around the two lookup variants, the handling
// of a failed lookup) are missing from this excerpt.
550 wxChar
wxHtmlEntitiesParser::GetEntityChar(const wxString
& entity
) const
555 return 0; // invalid entity reference
// numeric character reference
557 if (entity
[0] == wxT('#'))
559 // NB: parsed value is a number, so it's OK to use wx_str(), internal
560 // representation is the same for numbers
561 const wxStringCharType
*ent_s
= entity
.wx_str();
562 const wxStringCharType
*format
;
// 'x' or 'X' after the '#' is treated as a separate case when choosing format
564 if (ent_s
[1] == wxS('x') || ent_s
[1] == wxS('X'))
// exactly one value must be read, otherwise the reference is malformed
573 if (wxSscanf(ent_s
, format
, &code
) != 1)
578 // store the literals in wx's internal representation (either char*
579 // in UTF-8 or wchar_t*) for best performance:
580 #define ENTITY(name, code) { wxS(name), code }
// The table is searched with bsearch() below, so its entries have to stay
// sorted in the order defined by wxHtmlEntityCompare().
582 static wxHtmlEntityInfo substitutions
[] = {
583 ENTITY("AElig", 198),
584 ENTITY("Aacute", 193),
585 ENTITY("Acirc", 194),
586 ENTITY("Agrave", 192),
587 ENTITY("Alpha", 913),
588 ENTITY("Aring", 197),
589 ENTITY("Atilde", 195),
592 ENTITY("Ccedil", 199),
594 ENTITY("Dagger", 8225),
595 ENTITY("Delta", 916),
597 ENTITY("Eacute", 201),
598 ENTITY("Ecirc", 202),
599 ENTITY("Egrave", 200),
600 ENTITY("Epsilon", 917),
603 ENTITY("Gamma", 915),
604 ENTITY("Iacute", 205),
605 ENTITY("Icirc", 206),
606 ENTITY("Igrave", 204),
609 ENTITY("Kappa", 922),
610 ENTITY("Lambda", 923),
612 ENTITY("Ntilde", 209),
614 ENTITY("OElig", 338),
615 ENTITY("Oacute", 211),
616 ENTITY("Ocirc", 212),
617 ENTITY("Ograve", 210),
618 ENTITY("Omega", 937),
619 ENTITY("Omicron", 927),
620 ENTITY("Oslash", 216),
621 ENTITY("Otilde", 213),
625 ENTITY("Prime", 8243),
628 ENTITY("Scaron", 352),
629 ENTITY("Sigma", 931),
630 ENTITY("THORN", 222),
632 ENTITY("Theta", 920),
633 ENTITY("Uacute", 218),
634 ENTITY("Ucirc", 219),
635 ENTITY("Ugrave", 217),
636 ENTITY("Upsilon", 933),
639 ENTITY("Yacute", 221),
642 ENTITY("aacute", 225),
643 ENTITY("acirc", 226),
644 ENTITY("acute", 180),
645 ENTITY("aelig", 230),
646 ENTITY("agrave", 224),
647 ENTITY("alefsym", 8501),
648 ENTITY("alpha", 945),
653 ENTITY("aring", 229),
654 ENTITY("asymp", 8776),
655 ENTITY("atilde", 227),
657 ENTITY("bdquo", 8222),
659 ENTITY("brvbar", 166),
660 ENTITY("bull", 8226),
662 ENTITY("ccedil", 231),
663 ENTITY("cedil", 184),
667 ENTITY("clubs", 9827),
668 ENTITY("cong", 8773),
670 ENTITY("crarr", 8629),
672 ENTITY("curren", 164),
673 ENTITY("dArr", 8659),
674 ENTITY("dagger", 8224),
675 ENTITY("darr", 8595),
677 ENTITY("delta", 948),
678 ENTITY("diams", 9830),
679 ENTITY("divide", 247),
680 ENTITY("eacute", 233),
681 ENTITY("ecirc", 234),
682 ENTITY("egrave", 232),
683 ENTITY("empty", 8709),
684 ENTITY("emsp", 8195),
685 ENTITY("ensp", 8194),
686 ENTITY("epsilon", 949),
687 ENTITY("equiv", 8801),
691 ENTITY("euro", 8364),
692 ENTITY("exist", 8707),
694 ENTITY("forall", 8704),
695 ENTITY("frac12", 189),
696 ENTITY("frac14", 188),
697 ENTITY("frac34", 190),
698 ENTITY("frasl", 8260),
699 ENTITY("gamma", 947),
702 ENTITY("hArr", 8660),
703 ENTITY("harr", 8596),
704 ENTITY("hearts", 9829),
705 ENTITY("hellip", 8230),
706 ENTITY("iacute", 237),
707 ENTITY("icirc", 238),
708 ENTITY("iexcl", 161),
709 ENTITY("igrave", 236),
710 ENTITY("image", 8465),
711 ENTITY("infin", 8734),
714 ENTITY("iquest", 191),
715 ENTITY("isin", 8712),
717 ENTITY("kappa", 954),
718 ENTITY("lArr", 8656),
719 ENTITY("lambda", 955),
720 ENTITY("lang", 9001),
721 ENTITY("laquo", 171),
722 ENTITY("larr", 8592),
723 ENTITY("lceil", 8968),
724 ENTITY("ldquo", 8220),
726 ENTITY("lfloor", 8970),
727 ENTITY("lowast", 8727),
730 ENTITY("lsaquo", 8249),
731 ENTITY("lsquo", 8216),
734 ENTITY("mdash", 8212),
735 ENTITY("micro", 181),
736 ENTITY("middot", 183),
737 ENTITY("minus", 8722),
739 ENTITY("nabla", 8711),
741 ENTITY("ndash", 8211),
745 ENTITY("notin", 8713),
746 ENTITY("nsub", 8836),
747 ENTITY("ntilde", 241),
749 ENTITY("oacute", 243),
750 ENTITY("ocirc", 244),
751 ENTITY("oelig", 339),
752 ENTITY("ograve", 242),
753 ENTITY("oline", 8254),
754 ENTITY("omega", 969),
755 ENTITY("omicron", 959),
756 ENTITY("oplus", 8853),
760 ENTITY("oslash", 248),
761 ENTITY("otilde", 245),
762 ENTITY("otimes", 8855),
765 ENTITY("part", 8706),
766 ENTITY("permil", 8240),
767 ENTITY("perp", 8869),
771 ENTITY("plusmn", 177),
772 ENTITY("pound", 163),
773 ENTITY("prime", 8242),
774 ENTITY("prod", 8719),
775 ENTITY("prop", 8733),
778 ENTITY("rArr", 8658),
779 ENTITY("radic", 8730),
780 ENTITY("rang", 9002),
781 ENTITY("raquo", 187),
782 ENTITY("rarr", 8594),
783 ENTITY("rceil", 8969),
784 ENTITY("rdquo", 8221),
785 ENTITY("real", 8476),
787 ENTITY("rfloor", 8971),
790 ENTITY("rsaquo", 8250),
791 ENTITY("rsquo", 8217),
792 ENTITY("sbquo", 8218),
793 ENTITY("scaron", 353),
794 ENTITY("sdot", 8901),
797 ENTITY("sigma", 963),
798 ENTITY("sigmaf", 962),
800 ENTITY("spades", 9824),
802 ENTITY("sube", 8838),
808 ENTITY("supe", 8839),
809 ENTITY("szlig", 223),
811 ENTITY("there4", 8756),
812 ENTITY("theta", 952),
813 ENTITY("thetasym", 977),
814 ENTITY("thinsp", 8201),
815 ENTITY("thorn", 254),
816 ENTITY("tilde", 732),
817 ENTITY("times", 215),
818 ENTITY("trade", 8482),
819 ENTITY("uArr", 8657),
820 ENTITY("uacute", 250),
821 ENTITY("uarr", 8593),
822 ENTITY("ucirc", 251),
823 ENTITY("ugrave", 249),
825 ENTITY("upsih", 978),
826 ENTITY("upsilon", 965),
828 ENTITY("weierp", 8472),
830 ENTITY("yacute", 253),
835 ENTITY("zwnj", 8204),
// The number of table entries is computed once, on first use, by scanning
// until an entry whose code is 0.
838 static size_t substitutions_cnt
= 0;
840 if (substitutions_cnt
== 0)
841 while (substitutions
[substitutions_cnt
].code
!= 0)
844 wxHtmlEntityInfo
*info
;
846 // bsearch crashes under WinCE for some reason
// lookup variant 1: linear scan comparing the name with every entry
849 for (i
= 0; i
< substitutions_cnt
; i
++)
851 if (entity
== substitutions
[i
].name
)
853 info
= & substitutions
[i
];
// lookup variant 2: binary search using wxHtmlEntityCompare()
858 info
= (wxHtmlEntityInfo
*) bsearch(entity
.wx_str(), substitutions
,
860 sizeof(wxHtmlEntityInfo
),
861 wxHtmlEntityCompare
);
870 return GetCharForCode(code
);
// Opens `url` through the parser's file system object and returns the
// resulting file, or NULL when no file system (m_FS) is set. The URL type
// argument is not used by this implementation.
873 wxFSFile
*wxHtmlParser::OpenURL(wxHtmlURLType
WXUNUSED(type
),
874 const wxString
& url
) const
876 return m_FS
? m_FS
->OpenFile(url
) : NULL
;
881 //-----------------------------------------------------------------------------
882 // wxHtmlParser::ExtractCharsetInformation
883 //-----------------------------------------------------------------------------
// Minimal wxHtmlParser subclass used by
// wxHtmlParser::ExtractCharsetInformation(): it produces no object
// (GetProduct() returns NULL) and ignores all text, so only tag handlers see
// the document.
885 class wxMetaTagParser
: public wxHtmlParser
888 wxMetaTagParser() { }
890 wxObject
* GetProduct() { return NULL
; }
// text content is ignored
893 virtual void AddText(const wxString
& WXUNUSED(txt
)) {}
895 wxDECLARE_NO_COPY_CLASS(wxMetaTagParser
);
// Tag handler for META and BODY tags that stores the charset it finds in the
// string passed to its constructor.
898 class wxMetaTagHandler
: public wxHtmlTagHandler
// retval points to the string that receives the result
901 wxMetaTagHandler(wxString
*retval
) : wxHtmlTagHandler(), m_retval(retval
) {}
902 wxString
GetSupportedTags() { return wxT("META,BODY"); }
903 bool HandleTag(const wxHtmlTag
& tag
);
908 wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler
);
// Handles META and BODY tags while looking for the charset declaration.
//
// A BODY tag stops the parser. A tag whose HTTP-EQUIV parameter equals
// "Content-Type" (compared case-insensitively) and which has a CONTENT
// parameter is examined: when the lower-cased CONTENT starts with
// "text/html; charset=", the remainder is stored in *m_retval and the parser
// is stopped. Because CONTENT is lower-cased first, the stored charset name
// is lower case as well.
// NOTE(review): the return statements are not visible in this excerpt.
911 bool wxMetaTagHandler::HandleTag(const wxHtmlTag
& tag
)
913 if (tag
.GetName() == wxT("BODY"))
915 m_Parser
->StopParsing();
919 if (tag
.HasParam(wxT("HTTP-EQUIV")) &&
920 tag
.GetParam(wxT("HTTP-EQUIV")).IsSameAs(wxT("Content-Type"), false) &&
921 tag
.HasParam(wxT("CONTENT")))
923 wxString content
= tag
.GetParam(wxT("CONTENT")).Lower();
// 19 is the length of the "text/html; charset=" prefix
924 if (content
.Left(19) == wxT("text/html; charset="))
926 *m_retval
= content
.Mid(19);
927 m_Parser
->StopParsing();
// Extracts charset information from `markup` by running a wxMetaTagParser
// over it with a wxMetaTagHandler that writes its finding into the local
// `charset` string.
// NOTE(review): the declaration of `charset`, the release of the parser and
// the return statement are not visible in this excerpt.
935 wxString
wxHtmlParser::ExtractCharsetInformation(const wxString
& markup
)
938 wxMetaTagParser
*parser
= new wxMetaTagParser();
941 parser
->AddTagHandler(new wxMetaTagHandler(&charset
));
942 parser
->Parse(markup
);
950 wxHtmlParser::SkipCommentTag(wxString::const_iterator
& start
,
951 wxString::const_iterator end
)
953 wxASSERT_MSG( *start
== '<', wxT("should be called on the tag start") );
955 wxString::const_iterator p
= start
;
957 // comments begin with "<!--" in HTML 4.0
958 if ( p
> end
- 3 || *++p
!= '!' || *++p
!= '-' || *++p
!= '-' )
960 // not a comment at all
964 // skip the start of the comment tag in any case, if we don't find the
965 // closing tag we should ignore broken markup
968 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
969 // comment delimiter and the closing tag character (section 3.2.4 of
970 // http://www.w3.org/TR/html401/)
976 if ( (c
== wxT(' ') || c
== wxT('\n') ||
977 c
== wxT('\r') || c
== wxT('\t')) && dashes
>= 2 )
979 // ignore white space before potential tag end
983 if ( c
== wxT('>') && dashes
>= 2 )
985 // found end of comment