From daa616fca06ce77df132d31eda6d1f829b449aad Mon Sep 17 00:00:00 2001 From: =?utf8?q?V=C3=A1clav=20Slav=C3=ADk?= Date: Sun, 1 Jul 2001 15:09:35 +0000 Subject: [PATCH] new HTML tags parser and entities substitution code git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@10744 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- include/wx/html/htmlpars.h | 77 ++++--- src/html/helpdata.cpp | 158 +------------- src/html/htmlcell.cpp | 150 -------------- src/html/htmlpars.cpp | 412 ++++++++++++++++++++++++++++++++++++- src/html/htmltag.cpp | 266 ++++++++++++------------ src/html/winpars.cpp | 35 +++- 6 files changed, 619 insertions(+), 479 deletions(-) diff --git a/include/wx/html/htmlpars.h b/include/wx/html/htmlpars.h index 6a93d07ddb..238a86c704 100644 --- a/include/wx/html/htmlpars.h +++ b/include/wx/html/htmlpars.h @@ -21,24 +21,21 @@ #include "wx/html/htmltag.h" #include "wx/filesys.h" -class wxHtmlParser; -class wxHtmlTagHandler; - -//-------------------------------------------------------------------------------- -// wxHtmlParser -// This class handles generic parsing of HTML document : it scans -// the document and divide it into blocks of tags (where one block -// consists of starting and ending tag and of text between these -// 2 tags. -//-------------------------------------------------------------------------------- - +class WXDLLEXPORT wxMBConv; +class WXDLLEXPORT wxHtmlParser; +class WXDLLEXPORT wxHtmlTagHandler; +class WXDLLEXPORT wxHtmlEntitiesParser; + +// This class handles generic parsing of HTML document : it scans +// the document and divide it into blocks of tags (where one block +// consists of starting and ending tag and of text between these +// 2 tags. 
class WXDLLEXPORT wxHtmlParser : public wxObject { DECLARE_ABSTRACT_CLASS(wxHtmlParser) public: - wxHtmlParser() : wxObject(), m_HandlersHash(wxKEY_STRING) - { m_FS = NULL; m_Cache = NULL; m_HandlersStack = NULL; } + wxHtmlParser(); virtual ~wxHtmlParser(); // Sets the class which will be used for opening files @@ -106,6 +103,9 @@ protected: // ignored if no hander is found. // Derived class is *responsible* for filling in m_Handlers table. virtual void AddTag(const wxHtmlTag& tag); + + // Returns entity parser object, used to substitute HTML &entities; + wxHtmlEntitiesParser *GetEntitiesParser() const { return m_entitiesParser; } protected: // source being parsed @@ -130,24 +130,20 @@ protected: wxFileSystem *m_FS; // handlers stack used by PushTagHandler and PopTagHandler wxList *m_HandlersStack; + + // entity parse + wxHtmlEntitiesParser *m_entitiesParser; }; - - - -//-------------------------------------------------------------------------------- -// wxHtmlTagHandler -// This class (and derived classes) cooperates with wxHtmlParser. -// Each recognized tag is passed to handler which is capable -// of handling it. Each tag is handled in 3 steps: -// 1. Handler will modifies state of parser -// (using it's public methods) -// 2. Parser parses source between starting and ending tag -// 3. Handler restores original state of the parser -//-------------------------------------------------------------------------------- - +// This class (and derived classes) cooperates with wxHtmlParser. +// Each recognized tag is passed to handler which is capable +// of handling it. Each tag is handled in 3 steps: +// 1. Handler will modifies state of parser +// (using it's public methods) +// 2. Parser parses source between starting and ending tag +// 3. 
Handler restores original state of the parser class WXDLLEXPORT wxHtmlTagHandler : public wxObject { DECLARE_ABSTRACT_CLASS(wxHtmlTagHandler) @@ -184,6 +180,33 @@ protected: }; +// This class is used to parse HTML entities in strings. It can handle +// both named entities and &#xxxx entries where xxxx is Unicode code. +class WXDLLEXPORT wxHtmlEntitiesParser : public wxObject +{ + DECLARE_DYNAMIC_CLASS(wxHtmlEntitiesParser) + +public: + wxHtmlEntitiesParser(); + virtual ~wxHtmlEntitiesParser(); + + // Sets encoding of output string. + // Has no effect if wxUSE_WCHAR_T==0 or wxUSE_UNICODE==1 + void SetEncoding(wxFontEncoding encoding); + + // Parses entities in input and replaces them with respective characters + // (with respect to output encoding) + wxString Parse(const wxString& input); + +protected: + wxChar GetEntityChar(const wxString& entity); + wxChar GetCharForCode(unsigned code); + +#if wxUSE_WCHAR_T && !wxUSE_UNICODE + wxMBConv *m_conv; + wxFontEncoding m_encoding; +#endif +}; #endif diff --git a/src/html/helpdata.cpp b/src/html/helpdata.cpp index 76e7264ae8..3b7b4c4ccd 100644 --- a/src/html/helpdata.cpp +++ b/src/html/helpdata.cpp @@ -157,160 +157,12 @@ bool HP_TagHandler::HandleTag(const wxHtmlTag& tag) } else { // "PARAM" - if (m_Name == wxEmptyString && tag.GetParam(wxT("NAME")) == wxT("Name")) - { + if (m_Name == wxEmptyString && tag.GetParam(wxT("NAME")) == wxT("Name")) m_Name = tag.GetParam(wxT("VALUE")); - if (m_Name.Find(wxT('&')) != -1) - { -#define ESCSEQ(escape, subst) \ - { _T("&") _T(escape) _T(";"), _T("&") _T(escape) _T(" "), _T("&") _T(escape), _T(subst) } - static wxChar* substitutions[][4] = - { - ESCSEQ("quot", "\""), - ESCSEQ("#34", "\""), - ESCSEQ("#8220", "\""), - ESCSEQ("#8221", "\""), - ESCSEQ("lt", "<"), - ESCSEQ("#60", "<"), - ESCSEQ("gt", ">"), - ESCSEQ("#62", ">"), - - ESCSEQ("#94", "^"), /* ^ */ - - ESCSEQ("nbsp", " "), - ESCSEQ("#32", " "), - ESCSEQ("iexcl", "!"), - ESCSEQ("#33", "!"), - ESCSEQ("cent", "¢"/* ¢ */), - 
ESCSEQ("#162", "¢"/* ¢ */), - - ESCSEQ("trade", "(TM)"), - ESCSEQ("#153", "(TM)"), - ESCSEQ("#8482", "(TM)"), - - ESCSEQ("yen", "¥"), - ESCSEQ("#165", "¥"), - ESCSEQ("brkbar", "¦"), - ESCSEQ("#166", "¦"), - ESCSEQ("sect", "§"), - ESCSEQ("#167", "§"), - ESCSEQ("uml", "¨"), - ESCSEQ("#168", "¨"), - - ESCSEQ("copy", "©"), /* © */ - ESCSEQ("#169", "©"), - ESCSEQ("ordf", "ª"), - ESCSEQ("#170", "ª"), - ESCSEQ("laquo", "«"), /* « */ - ESCSEQ("#171", "«"), - ESCSEQ("not", "¬"), - ESCSEQ("#172", "¬"), - - ESCSEQ("reg", "®"), /* ® */ - ESCSEQ("#174", "®"), - - ESCSEQ("deg", "°"), /* ° */ - ESCSEQ("#176", "°"), - ESCSEQ("plusm", "±"), /* ± */ - ESCSEQ("#177", "±"), - - ESCSEQ("acute", "´"), - ESCSEQ("#180", "´"), - ESCSEQ("macron", "¯"), - ESCSEQ("#175", "¯"), - ESCSEQ("micro", "µ"), /* µ */ - ESCSEQ("#181", "µ"), - ESCSEQ("para", "¶"), /* ¶ */ - ESCSEQ("#182", "¶"), - - ESCSEQ("ordm", "º"), /* º */ - ESCSEQ("#186", "º"), - ESCSEQ("raquo", "»"), /* » */ - ESCSEQ("#187", "»"), - - ESCSEQ("iquest", "¿"), /* ¿ */ - ESCSEQ("#191", "¿"), - ESCSEQ("Agrave", "\300"/* À */), - ESCSEQ("#193", "\300"/* À */), - - ESCSEQ("Acirc", "\302"/* Â */), - ESCSEQ("Atilde", "\303"/* Ã */), - ESCSEQ("Auml", "\304"/* Ä */), - ESCSEQ("Aring", " "), - ESCSEQ("AElig", " "), - ESCSEQ("Ccedil", "\347"/* ç */), - ESCSEQ("Egrave", "\310"/* È */), - ESCSEQ("Eacute", "\311"/* É */), - ESCSEQ("Ecirc", "\312"/* Ê */), - ESCSEQ("Euml", "\313"/* Ë */), - ESCSEQ("Igrave", "\314"/* Ì */), - - ESCSEQ("Icirc", "\316"/* Î */), - ESCSEQ("Iuml", "\317"/* Ï */), - - ESCSEQ("Ntilde", "\321"/* Ñ */), - ESCSEQ("Ograve", "\322"/* Ò */), - - ESCSEQ("Ocirc", "\324"/* Ô */), - ESCSEQ("Otilde", "\325"/* Õ */), - ESCSEQ("Ouml", "\326"/* Ö */), - - ESCSEQ("Oslash", " "), - ESCSEQ("Ugrave", "\331"/* Ù */), - - ESCSEQ("Ucirc", " "), - ESCSEQ("Uuml", "\334"/* Ü */), - - ESCSEQ("szlig", "\247"/* § */), - ESCSEQ("agrave","\340"/* à */), - ESCSEQ("aacute", "\341"/* á */), - ESCSEQ("acirc", "\342"/* â */), - ESCSEQ("atilde", "\343"/* ã 
*/), - ESCSEQ("auml", "\344"/* ä */), - ESCSEQ("aring", "a"), - ESCSEQ("aelig", "ae"), - ESCSEQ("ccedil", "\347"/* ç */), - ESCSEQ("egrave", "\350"/* è */), - ESCSEQ("eacute", "\351"/* é */), - ESCSEQ("ecirc", "\352"/* ê */), - ESCSEQ("euml", "\353"/* ë */), - ESCSEQ("igrave", "\354"/* ì */), - ESCSEQ("iacute", "\355"/* í */), - ESCSEQ("icirc", " "), - ESCSEQ("iuml", "\357"/* ï */), - ESCSEQ("eth", " "), - ESCSEQ("ntilde", "\361"/* ñ */), - ESCSEQ("ograve", "\362"/* ò */), - ESCSEQ("oacute", "\363"/* ó */), - ESCSEQ("ocirc", "\364"/* ô */), - ESCSEQ("otilde", "\365"/* õ */), - ESCSEQ("ouml", "\366"/* ö */), - ESCSEQ("divide", " "), - ESCSEQ("oslash", " "), - ESCSEQ("ugrave", "\371"/* ù */), - ESCSEQ("uacute", "\372"/* ú */), - ESCSEQ("ucirc", "\373"/* û */), - ESCSEQ("uuml", "\374"/* ü */), - - ESCSEQ("yuml", ""), - - /* this one should ALWAYS stay the last one!!! */ - ESCSEQ("amp", "&"), - ESCSEQ("#38", "&"), - - { NULL, NULL, NULL } - }; - - for (int i = 0; substitutions[i][0] != NULL; i++) - { - m_Name.Replace(substitutions[i][0], substitutions[i][3], TRUE); - m_Name.Replace(substitutions[i][1], substitutions[i][3], TRUE); - m_Name.Replace(substitutions[i][2], substitutions[i][3], TRUE); - } - } - } - if (tag.GetParam(wxT("NAME")) == wxT("Local")) m_Page = tag.GetParam(wxT("VALUE")); - if (tag.GetParam(wxT("NAME")) == wxT("ID")) tag.ScanParam(wxT("VALUE"), wxT("%i"), &m_ID); + if (tag.GetParam(wxT("NAME")) == wxT("Local")) + m_Page = tag.GetParam(wxT("VALUE")); + if (tag.GetParam(wxT("NAME")) == wxT("ID")) + tag.ScanParam(wxT("VALUE"), wxT("%i"), &m_ID); return FALSE; } } diff --git a/src/html/htmlcell.cpp b/src/html/htmlcell.cpp index c907af3fe7..cd927aa55c 100644 --- a/src/html/htmlcell.cpp +++ b/src/html/htmlcell.cpp @@ -131,156 +131,6 @@ const wxHtmlCell* wxHtmlCell::Find(int condition, const void* param) const wxHtmlWordCell::wxHtmlWordCell(const wxString& word, wxDC& dc) : wxHtmlCell() { m_Word = word; - - if (m_Word.Find(wxT('&')) != -1) - { -#define 
ESCSEQ(escape, subst) \ - { _T("&") _T(escape) _T(";"), _T("&") _T(escape) _T(" "), _T("&") _T(escape), _T(subst) } - static wxChar* substitutions[][4] = - { - ESCSEQ("quot", "\""), - ESCSEQ("#34", "\""), - ESCSEQ("#8220", "\""), - ESCSEQ("#8221", "\""), - ESCSEQ("lt", "<"), - ESCSEQ("#60", "<"), - ESCSEQ("gt", ">"), - ESCSEQ("#62", ">"), - - ESCSEQ("#94", "^"), /* ^ */ - - ESCSEQ("nbsp", " "), - ESCSEQ("#32", " "), - ESCSEQ("iexcl", "!"), - ESCSEQ("#33", "!"), - ESCSEQ("cent", "¢"/* ¢ */), - ESCSEQ("#162", "¢"/* ¢ */), - - ESCSEQ("trade", "(TM)"), - ESCSEQ("#153", "(TM)"), - ESCSEQ("#8482", "(TM)"), - - ESCSEQ("yen", "¥"), - ESCSEQ("#165", "¥"), - ESCSEQ("brkbar", "¦"), - ESCSEQ("#166", "¦"), - ESCSEQ("sect", "§"), - ESCSEQ("#167", "§"), - ESCSEQ("uml", "¨"), - ESCSEQ("#168", "¨"), - - ESCSEQ("copy", "©"), /* © */ - ESCSEQ("#169", "©"), - ESCSEQ("ordf", "ª"), - ESCSEQ("#170", "ª"), - ESCSEQ("laquo", "«"), /* « */ - ESCSEQ("#171", "«"), - ESCSEQ("not", "¬"), - ESCSEQ("#172", "¬"), - - ESCSEQ("reg", "®"), /* ® */ - ESCSEQ("#174", "®"), - - ESCSEQ("deg", "°"), /* ° */ - ESCSEQ("#176", "°"), - ESCSEQ("plusm", "±"), /* ± */ - ESCSEQ("#177", "±"), - - ESCSEQ("acute", "´"), - ESCSEQ("#180", "´"), - ESCSEQ("macron", "¯"), - ESCSEQ("#175", "¯"), - ESCSEQ("micro", "µ"), /* µ */ - ESCSEQ("#181", "µ"), - ESCSEQ("para", "¶"), /* ¶ */ - ESCSEQ("#182", "¶"), - - ESCSEQ("ordm", "º"), /* º */ - ESCSEQ("#186", "º"), - ESCSEQ("raquo", "»"), /* » */ - ESCSEQ("#187", "»"), - - ESCSEQ("iquest", "¿"), /* ¿ */ - ESCSEQ("#191", "¿"), - ESCSEQ("Agrave", "\300"/* À */), - ESCSEQ("#193", "\300"/* À */), - - ESCSEQ("Acirc", "\302"/* Â */), - ESCSEQ("Atilde", "\303"/* Ã */), - ESCSEQ("Auml", "\304"/* Ä */), - ESCSEQ("Aring", " "), - ESCSEQ("AElig", " "), - ESCSEQ("Ccedil", "\347"/* ç */), - ESCSEQ("Egrave", "\310"/* È */), - ESCSEQ("Eacute", "\311"/* É */), - ESCSEQ("Ecirc", "\312"/* Ê */), - ESCSEQ("Euml", "\313"/* Ë */), - ESCSEQ("Igrave", "\314"/* Ì */), - - ESCSEQ("Icirc", "\316"/* Î */), 
- ESCSEQ("Iuml", "\317"/* Ï */), - - ESCSEQ("Ntilde", "\321"/* Ñ */), - ESCSEQ("Ograve", "\322"/* Ò */), - - ESCSEQ("Ocirc", "\324"/* Ô */), - ESCSEQ("Otilde", "\325"/* Õ */), - ESCSEQ("Ouml", "\326"/* Ö */), - - ESCSEQ("Oslash", " "), - ESCSEQ("Ugrave", "\331"/* Ù */), - - ESCSEQ("Ucirc", " "), - ESCSEQ("Uuml", "\334"/* Ü */), - - ESCSEQ("szlig", "\247"/* § */), - ESCSEQ("agrave","\340"/* à */), - ESCSEQ("aacute", "\341"/* á */), - ESCSEQ("acirc", "\342"/* â */), - ESCSEQ("atilde", "\343"/* ã */), - ESCSEQ("auml", "\344"/* ä */), - ESCSEQ("aring", "a"), - ESCSEQ("aelig", "ae"), - ESCSEQ("ccedil", "\347"/* ç */), - ESCSEQ("egrave", "\350"/* è */), - ESCSEQ("eacute", "\351"/* é */), - ESCSEQ("ecirc", "\352"/* ê */), - ESCSEQ("euml", "\353"/* ë */), - ESCSEQ("igrave", "\354"/* ì */), - ESCSEQ("iacute", "\355"/* í */), - ESCSEQ("icirc", " "), - ESCSEQ("iuml", "\357"/* ï */), - ESCSEQ("eth", " "), - ESCSEQ("ntilde", "\361"/* ñ */), - ESCSEQ("ograve", "\362"/* ò */), - ESCSEQ("oacute", "\363"/* ó */), - ESCSEQ("ocirc", "\364"/* ô */), - ESCSEQ("otilde", "\365"/* õ */), - ESCSEQ("ouml", "\366"/* ö */), - ESCSEQ("divide", " "), - ESCSEQ("oslash", " "), - ESCSEQ("ugrave", "\371"/* ù */), - ESCSEQ("uacute", "\372"/* ú */), - ESCSEQ("ucirc", "\373"/* û */), - ESCSEQ("uuml", "\374"/* ü */), - - ESCSEQ("yuml", ""), - - /* this one should ALWAYS stay the last one!!! 
*/ - ESCSEQ("amp", "&"), - ESCSEQ("#38", "&"), - - { NULL, NULL, NULL } - }; - - for (int i = 0; substitutions[i][0] != NULL; i++) - { - m_Word.Replace(substitutions[i][0], substitutions[i][3], TRUE); - m_Word.Replace(substitutions[i][1], substitutions[i][3], TRUE); - m_Word.Replace(substitutions[i][2], substitutions[i][3], TRUE); - } - } - dc.GetTextExtent(m_Word, &m_Width, &m_Height, &m_Descent); SetCanLiveOnPagebreak(FALSE); } diff --git a/src/html/htmlpars.cpp b/src/html/htmlpars.cpp index ab88ab7d46..dc572b0a89 100644 --- a/src/html/htmlpars.cpp +++ b/src/html/htmlpars.cpp @@ -28,6 +28,7 @@ #include "wx/tokenzr.h" #include "wx/wfstream.h" #include "wx/url.h" +#include "wx/fontmap.h" #include "wx/html/htmldefs.h" #include "wx/html/htmlpars.h" @@ -39,6 +40,21 @@ IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject) +wxHtmlParser::wxHtmlParser() + : wxObject(), m_Cache(NULL), m_HandlersHash(wxKEY_STRING), + m_FS(NULL), m_HandlersStack(NULL) +{ + m_entitiesParser = new wxHtmlEntitiesParser; +} + +wxHtmlParser::~wxHtmlParser() +{ + delete m_HandlersStack; + m_HandlersHash.Clear(); + m_HandlersList.DeleteContents(TRUE); + m_HandlersList.Clear(); + delete m_entitiesParser; +} wxObject* wxHtmlParser::Parse(const wxString& source) { @@ -180,18 +196,398 @@ void wxHtmlParser::PopTagHandler() m_HandlersStack->DeleteNode(first); } -wxHtmlParser::~wxHtmlParser() -{ - if (m_HandlersStack) delete m_HandlersStack; - m_HandlersHash.Clear(); - m_HandlersList.DeleteContents(TRUE); - m_HandlersList.Clear(); -} - //----------------------------------------------------------------------------- // wxHtmlTagHandler //----------------------------------------------------------------------------- IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject) + + +//----------------------------------------------------------------------------- +// wxHtmlEntitiesParser +//----------------------------------------------------------------------------- + +IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject) + 
+// Starts with no converter object and the system encoding selected.
+wxHtmlEntitiesParser::wxHtmlEntitiesParser()
+#if wxUSE_WCHAR_T && !wxUSE_UNICODE
+            : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
 #endif
+{
+}
+
+wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
+{
+    // m_conv is only declared in non-Unicode builds with wchar_t support,
+    // so the cleanup has to be guarded the same way as the member itself.
+#if wxUSE_WCHAR_T && !wxUSE_UNICODE
+    delete m_conv;
+#endif
+}
+
+// Selects the encoding of the characters produced by Parse().
+// Compiles to an empty function when m_conv/m_encoding do not exist.
+void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
+{
+#if wxUSE_WCHAR_T && !wxUSE_UNICODE
+    if (encoding == m_encoding) return;
+    delete m_conv;
+    m_conv = NULL;
+    m_encoding = encoding;
+    if (m_encoding != wxFONTENCODING_SYSTEM)
+        m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
+#endif
+}
+
+// Returns a copy of input in which every "&name;" / "&#code;" sequence is
+// replaced by the character GetEntityChar() yields for it. The trailing
+// semicolon is optional. An '&' that is not followed by any entity
+// character (e.g. "a & b", or '&' at the very end) is copied unchanged.
+wxString wxHtmlEntitiesParser::Parse(const wxString& input)
+{
+    const wxChar *c, *last;
+    const wxChar *in_str = input.c_str();
+    wxString output;
+
+    // 'last' marks the start of the text that was not copied to output yet
+    for (c = in_str, last = in_str; *c != wxT('\0'); c++)
+    {
+        if (*c == wxT('&'))
+        {
+            if (c - last > 0)
+                output.append(last, c - last);
+
+            // Scan the entity name; the terminating NUL is not an entity
+            // character, so the scan can never run past the string.
+            const wxChar *ent_s = c + 1;
+            const wxChar *ent_e = ent_s;
+            while ((*ent_e >= wxT('a') && *ent_e <= wxT('z')) ||
+                   (*ent_e >= wxT('A') && *ent_e <= wxT('Z')) ||
+                   (*ent_e >= wxT('0') && *ent_e <= wxT('9')) ||
+                   *ent_e == wxT('_') || *ent_e == wxT('#'))
+                ent_e++;
+
+            if (ent_e == ent_s)
+            {
+                // bare '&': keep it instead of looking up an empty name
+                output << wxT('&');
+            }
+            else
+            {
+                wxString entity;
+                entity.append(ent_s, ent_e - ent_s);
+                output << GetEntityChar(entity);
+                // Leave c on the last consumed character, so that the
+                // loop's c++ neither skips the following character (which
+                // may be another '&') nor steps past the terminating NUL.
+                c = (*ent_e == wxT(';')) ? ent_e : ent_e - 1;
+            }
+            last = c + 1;
+        }
+    }
+    if (*last != wxT('\0'))
+        output.append(last);
+    return output;
+}
+
+// One row of the entity table: entity name and the code stored for it.
+struct wxHtmlEntityInfo
+{
+    const wxChar *name;
+    unsigned code;
+};
+
+// bsearch() callback: compares the searched name with a table row's name.
+static int compar_entity(const void *key, const void *item)
+{
+    return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
+}
+
+// Converts a character code to a wxChar for the current build type.
+wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code)
+{
+#if wxUSE_UNICODE
+    return (wxChar)code;
+#elif wxUSE_WCHAR_T
+    char buf[2];
+    wchar_t wbuf[2];
+    wbuf[0] = (wchar_t)code;
+    wbuf[1] = 0;
+    // fall back to the locale converter when SetEncoding() set none
+    wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
+    if (conv->WC2MB(buf, wbuf, 1) == (size_t)-1)
+        return '?';
+    return buf[0];
+#else
+    return (code < 256) ? 
(wxChar)code : '?'; +#endif +} + +wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) +{ + unsigned code = 0; + + if (entity[0] == wxT('#')) + { + const wxChar *ent_s = entity.c_str(); + const wxChar *format; + + if (ent_s[1] == wxT('x') || ent_s[1] == wxT('X')) + { + format = wxT("%x"); + ent_s++; + } + else + format = wxT("%u"); + ent_s++; + + if (wxSscanf(ent_s, format, &code) != 1) + code = 0; + } + else + { + static wxHtmlEntityInfo substitutions[] = { + { wxT("AElig"),198 }, + { wxT("Aacute"),193 }, + { wxT("Acirc"),194 }, + { wxT("Agrave"),192 }, + { wxT("Alpha"),913 }, + { wxT("Aring"),197 }, + { wxT("Atilde"),195 }, + { wxT("Auml"),196 }, + { wxT("Beta"),914 }, + { wxT("Ccedil"),199 }, + { wxT("Chi"),935 }, + { wxT("Dagger"),8225 }, + { wxT("Delta"),916 }, + { wxT("ETH"),208 }, + { wxT("Eacute"),201 }, + { wxT("Ecirc"),202 }, + { wxT("Egrave"),200 }, + { wxT("Epsilon"),917 }, + { wxT("Eta"),919 }, + { wxT("Euml"),203 }, + { wxT("Gamma"),915 }, + { wxT("Iacute"),205 }, + { wxT("Icirc"),206 }, + { wxT("Igrave"),204 }, + { wxT("Iota"),921 }, + { wxT("Iuml"),207 }, + { wxT("Kappa"),922 }, + { wxT("Lambda"),923 }, + { wxT("Mu"),924 }, + { wxT("Ntilde"),209 }, + { wxT("Nu"),925 }, + { wxT("OElig"),338 }, + { wxT("Oacute"),211 }, + { wxT("Ocirc"),212 }, + { wxT("Ograve"),210 }, + { wxT("Omega"),937 }, + { wxT("Omicron"),927 }, + { wxT("Oslash"),216 }, + { wxT("Otilde"),213 }, + { wxT("Ouml"),214 }, + { wxT("Phi"),934 }, + { wxT("Pi"),928 }, + { wxT("Prime"),8243 }, + { wxT("Psi"),936 }, + { wxT("Rho"),929 }, + { wxT("Scaron"),352 }, + { wxT("Sigma"),931 }, + { wxT("THORN"),222 }, + { wxT("Tau"),932 }, + { wxT("Theta"),920 }, + { wxT("Uacute"),218 }, + { wxT("Ucirc"),219 }, + { wxT("Ugrave"),217 }, + { wxT("Upsilon"),933 }, + { wxT("Uuml"),220 }, + { wxT("Xi"),926 }, + { wxT("Yacute"),221 }, + { wxT("Yuml"),376 }, + { wxT("Zeta"),918 }, + { wxT("aacute"),225 }, + { wxT("acirc"),226 }, + { wxT("acute"),180 }, + { wxT("aelig"),230 }, + { 
wxT("agrave"),224 }, + { wxT("alefsym"),8501 }, + { wxT("alpha"),945 }, + { wxT("amp"),38 }, + { wxT("and"),8743 }, + { wxT("ang"),8736 }, + { wxT("aring"),229 }, + { wxT("asymp"),8776 }, + { wxT("atilde"),227 }, + { wxT("auml"),228 }, + { wxT("bdquo"),8222 }, + { wxT("beta"),946 }, + { wxT("brvbar"),166 }, + { wxT("bull"),8226 }, + { wxT("cap"),8745 }, + { wxT("ccedil"),231 }, + { wxT("cedil"),184 }, + { wxT("cent"),162 }, + { wxT("chi"),967 }, + { wxT("circ"),710 }, + { wxT("clubs"),9827 }, + { wxT("cong"),8773 }, + { wxT("copy"),169 }, + { wxT("crarr"),8629 }, + { wxT("cup"),8746 }, + { wxT("curren"),164 }, + { wxT("dArr"),8659 }, + { wxT("dagger"),8224 }, + { wxT("darr"),8595 }, + { wxT("deg"),176 }, + { wxT("delta"),948 }, + { wxT("diams"),9830 }, + { wxT("divide"),247 }, + { wxT("eacute"),233 }, + { wxT("ecirc"),234 }, + { wxT("egrave"),232 }, + { wxT("empty"),8709 }, + { wxT("emsp"),8195 }, + { wxT("ensp"),8194 }, + { wxT("epsilon"),949 }, + { wxT("equiv"),8801 }, + { wxT("eta"),951 }, + { wxT("eth"),240 }, + { wxT("euml"),235 }, + { wxT("euro"),8364 }, + { wxT("exist"),8707 }, + { wxT("fnof"),402 }, + { wxT("forall"),8704 }, + { wxT("frac12"),189 }, + { wxT("frac14"),188 }, + { wxT("frac34"),190 }, + { wxT("frasl"),8260 }, + { wxT("gamma"),947 }, + { wxT("ge"),8805 }, + { wxT("gt"),62 }, + { wxT("hArr"),8660 }, + { wxT("harr"),8596 }, + { wxT("hearts"),9829 }, + { wxT("hellip"),8230 }, + { wxT("iacute"),237 }, + { wxT("icirc"),238 }, + { wxT("iexcl"),161 }, + { wxT("igrave"),236 }, + { wxT("image"),8465 }, + { wxT("infin"),8734 }, + { wxT("int"),8747 }, + { wxT("iota"),953 }, + { wxT("iquest"),191 }, + { wxT("isin"),8712 }, + { wxT("iuml"),239 }, + { wxT("kappa"),954 }, + { wxT("lArr"),8656 }, + { wxT("lambda"),955 }, + { wxT("lang"),9001 }, + { wxT("laquo"),171 }, + { wxT("larr"),8592 }, + { wxT("lceil"),8968 }, + { wxT("ldquo"),8220 }, + { wxT("le"),8804 }, + { wxT("lfloor"),8970 }, + { wxT("lowast"),8727 }, + { wxT("loz"),9674 }, + { wxT("lrm"),8206 }, + 
{ wxT("lsaquo"),8249 }, + { wxT("lsquo"),8216 }, + { wxT("lt"),60 }, + { wxT("macr"),175 }, + { wxT("mdash"),8212 }, + { wxT("micro"),181 }, + { wxT("middot"),183 }, + { wxT("minus"),8722 }, + { wxT("mu"),956 }, + { wxT("nabla"),8711 }, + { wxT("nbsp"),160 }, + { wxT("ndash"),8211 }, + { wxT("ne"),8800 }, + { wxT("ni"),8715 }, + { wxT("not"),172 }, + { wxT("notin"),8713 }, + { wxT("nsub"),8836 }, + { wxT("ntilde"),241 }, + { wxT("nu"),957 }, + { wxT("oacute"),243 }, + { wxT("ocirc"),244 }, + { wxT("oelig"),339 }, + { wxT("ograve"),242 }, + { wxT("oline"),8254 }, + { wxT("omega"),969 }, + { wxT("omicron"),959 }, + { wxT("oplus"),8853 }, + { wxT("or"),8744 }, + { wxT("ordf"),170 }, + { wxT("ordm"),186 }, + { wxT("oslash"),248 }, + { wxT("otilde"),245 }, + { wxT("otimes"),8855 }, + { wxT("ouml"),246 }, + { wxT("para"),182 }, + { wxT("part"),8706 }, + { wxT("permil"),8240 }, + { wxT("perp"),8869 }, + { wxT("phi"),966 }, + { wxT("pi"),960 }, + { wxT("piv"),982 }, + { wxT("plusmn"),177 }, + { wxT("pound"),163 }, + { wxT("prime"),8242 }, + { wxT("prod"),8719 }, + { wxT("prop"),8733 }, + { wxT("psi"),968 }, + { wxT("quot"),34 }, + { wxT("rArr"),8658 }, + { wxT("radic"),8730 }, + { wxT("rang"),9002 }, + { wxT("raquo"),187 }, + { wxT("rarr"),8594 }, + { wxT("rceil"),8969 }, + { wxT("rdquo"),8221 }, + { wxT("real"),8476 }, + { wxT("reg"),174 }, + { wxT("rfloor"),8971 }, + { wxT("rho"),961 }, + { wxT("rlm"),8207 }, + { wxT("rsaquo"),8250 }, + { wxT("rsquo"),8217 }, + { wxT("sbquo"),8218 }, + { wxT("scaron"),353 }, + { wxT("sdot"),8901 }, + { wxT("sect"),167 }, + { wxT("shy"),173 }, + { wxT("sigma"),963 }, + { wxT("sigmaf"),962 }, + { wxT("sim"),8764 }, + { wxT("spades"),9824 }, + { wxT("sub"),8834 }, + { wxT("sube"),8838 }, + { wxT("sum"),8721 }, + { wxT("sup"),8835 }, + { wxT("sup1"),185 }, + { wxT("sup2"),178 }, + { wxT("sup3"),179 }, + { wxT("supe"),8839 }, + { wxT("szlig"),223 }, + { wxT("tau"),964 }, + { wxT("there4"),8756 }, + { wxT("theta"),952 }, + { 
wxT("thetasym"),977 }, + { wxT("thinsp"),8201 }, + { wxT("thorn"),254 }, + { wxT("tilde"),732 }, + { wxT("times"),215 }, + { wxT("trade"),8482 }, + { wxT("uArr"),8657 }, + { wxT("uacute"),250 }, + { wxT("uarr"),8593 }, + { wxT("ucirc"),251 }, + { wxT("ugrave"),249 }, + { wxT("uml"),168 }, + { wxT("upsih"),978 }, + { wxT("upsilon"),965 }, + { wxT("uuml"),252 }, + { wxT("weierp"),8472 }, + { wxT("xi"),958 }, + { wxT("yacute"),253 }, + { wxT("yen"),165 }, + { wxT("yuml"),255 }, + { wxT("zeta"),950 }, + { wxT("zwj"),8205 }, + { wxT("zwnj"),8204 }, + {NULL, 0}}; + static size_t substitutions_cnt = 0; + + if (substitutions_cnt == 0) + while (substitutions[substitutions_cnt].code != 0) + substitutions_cnt++; + + wxHtmlEntityInfo *info; + info = (wxHtmlEntityInfo*) bsearch(entity.c_str(), substitutions, + substitutions_cnt, + sizeof(wxHtmlEntityInfo), + compar_entity); + if (info) + code = info->code; + } + + if (code == 0) + return wxT('?'); + else + return GetCharForCode(code); +} + +#endif diff --git a/src/html/htmltag.cpp b/src/html/htmltag.cpp index 02f045ce3c..9b8049fdba 100644 --- a/src/html/htmltag.cpp +++ b/src/html/htmltag.cpp @@ -26,6 +26,7 @@ #endif #include "wx/html/htmltag.h" +#include "wx/html/htmlpars.h" #include // for vsscanf #include @@ -121,15 +122,17 @@ wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source) } } - - void wxHtmlTagsCache::QueryTag(int at, int* end1, int* end2) { if (m_Cache == NULL) return; if (m_Cache[m_CachePos].Key != at) { int delta = (at < m_Cache[m_CachePos].Key) ? 
-1 : 1; - do {m_CachePos += delta;} while (m_Cache[m_CachePos].Key != at); + do + { + m_CachePos += delta; + } + while (m_Cache[m_CachePos].Key != at); } *end1 = m_Cache[m_CachePos].End1; *end2 = m_Cache[m_CachePos].End2; @@ -144,64 +147,129 @@ void wxHtmlTagsCache::QueryTag(int at, int* end1, int* end2) IMPLEMENT_CLASS(wxHtmlTag,wxObject) -wxHtmlTag::wxHtmlTag(const wxString& source, int pos, int end_pos, wxHtmlTagsCache* cache) : wxObject() +wxHtmlTag::wxHtmlTag(const wxString& source, int pos, int end_pos, + wxHtmlTagsCache *cache, + wxHtmlEntitiesParser *entParser) : wxObject() { int i; - char c; + wxChar c; // fill-in name, params and begin pos: - m_Name = m_Params = wxEmptyString; i = pos+1; - if (source[i] == wxT('/')) { m_Ending = TRUE; i++; } - else m_Ending = FALSE; + if (source[i] == wxT('/')) + { m_Ending = TRUE; i++; } + else + m_Ending = FALSE; // find tag's name and convert it to uppercase: while ((i < end_pos) && - ((c = source[i++]) != wxT(' ') && c != wxT('\r') && - c != wxT('\n') && c != wxT('\t') && - c != wxT('>'))) + ((c = source[i++]) != wxT(' ') && c != wxT('\r') && + c != wxT('\n') && c != wxT('\t') && + c != wxT('>'))) { - if ((c >= wxT('a')) && (c <= wxT('z'))) c -= (wxT('a') - wxT('A')); - m_Name += c; + if ((c >= wxT('a')) && (c <= wxT('z'))) + c -= (wxT('a') - wxT('A')); + m_Name << c; } // if the tag has parameters, read them and "normalize" them, // i.e. 
convert to uppercase, replace whitespaces by spaces and // remove whitespaces around '=': if (source[i-1] != wxT('>')) - while ((i < end_pos) && ((c = source[i++]) != wxT('>'))) + { + #define IS_WHITE(c) (c == wxT(' ') || c == wxT('\r') || \ + c == wxT('\n') || c == wxT('\t')) + wxString pname, pvalue; + wxChar quote; + enum { - if ((c >= wxT('a')) && (c <= wxT('z'))) - c -= (wxT('a') - wxT('A')); - if (c == wxT('\r') || c == wxT('\n') || c == wxT('\t')) - c = wxT(' '); // make future parsing a bit simpler - m_Params += c; - if (c == wxT('"')) + ST_BEFORE_NAME = 1, + ST_NAME, + ST_BEFORE_EQ, + ST_BEFORE_VALUE, + ST_VALUE + } state; + + quote = 0; + state = ST_BEFORE_NAME; + while (i < end_pos) + { + c = source[i++]; + + if (c == wxT('>') && !(state == ST_VALUE && quote != 0)) { - // remove spaces around the '=' character: - if (m_Params.Length() > 1 && - m_Params[m_Params.Length()-2] == wxT(' ')) + if (state == ST_BEFORE_EQ || state == ST_NAME) { - m_Params.RemoveLast(); - while (m_Params.Length() > 0 && m_Params.Last() == wxT(' ')) - m_Params.RemoveLast(); - m_Params += wxT('"'); + m_ParamNames.Add(pname); + m_ParamValues.Add(wxEmptyString); } - while ((i < end_pos) && (source[i++] == wxT(' '))) {} - if (i < end_pos) i--; - - // ...and copy the value to m_Params: - while ((i < end_pos) && ((c = source[i++]) != wxT('"'))) - m_Params += c; - m_Params += c; + else if (state == ST_VALUE && quote == 0) + { + m_ParamNames.Add(pname); + m_ParamValues.Add(entParser->Parse(pvalue)); + } + break; } - else if (c == wxT('\'')) + switch (state) { - while ((i < end_pos) && ((c = source[i++]) != wxT('\''))) - m_Params += c; - m_Params += c; + case ST_BEFORE_NAME: + if (!IS_WHITE(c)) + { + pname = c; + state = ST_NAME; + } + break; + case ST_NAME: + if (IS_WHITE(c)) + state = ST_BEFORE_EQ; + else if (c == wxT('=')) + state = ST_BEFORE_VALUE; + else + pname << c; + break; + case ST_BEFORE_EQ: + if (c == wxT('=')) + state = ST_BEFORE_VALUE; + else if (!IS_WHITE(c)) + { + 
m_ParamNames.Add(pname); + m_ParamValues.Add(wxEmptyString); + pname = c; + state = ST_NAME; + } + break; + case ST_BEFORE_VALUE: + if (!IS_WHITE(c)) + { + if (c == wxT('"') || c == wxT('\'')) + quote = c, pvalue = wxEmptyString; + else + quote = 0, pvalue = c; + state = ST_VALUE; + } + break; + case ST_VALUE: + if ((quote != 0 && c == quote) || + (quote == 0 && IS_WHITE(c))) + { + m_ParamNames.Add(pname); + if (quote == 0) + { + // VS: backward compatibility, no real reason, + // but wxHTML code relies on this... :( + pvalue.MakeUpper(); + } + m_ParamValues.Add(entParser->Parse(pvalue)); + state = ST_BEFORE_NAME; + } + else + pvalue << c; + break; } } + + #undef IS_WHITE + } m_Begin = i; cache->QueryTag(pos, &m_End1, &m_End2); @@ -209,113 +277,49 @@ wxHtmlTag::wxHtmlTag(const wxString& source, int pos, int end_pos, wxHtmlTagsCac if (m_End2 > end_pos) m_End2 = end_pos; } - - bool wxHtmlTag::HasParam(const wxString& par) const { - const wxChar *st = m_Params, *p = par; - const wxChar *st2, *p2; - const wxChar invalid = wxT('\1'); - - if (*st == 0) return FALSE; - if (*p == 0) return FALSE; - for (st2 = st, p2 = p; ; st2++) - { - if (*p2 == 0 && *st2 == wxT('=')) return TRUE; - if (*st2 == 0) return FALSE; - if (*p2 != *st2) p2 = &invalid; - if (*p2 == *st2) p2++; - if (*st2 == wxT(' ')) p2 = p; - else if (*st2 == wxT('=')) - { - p2 = p; - while (*st2 != wxT(' ')) - { - if (*st2 == wxT('"')) - { - st2++; - while (*st2 != wxT('"')) st2++; - } - st2++; - if (*st2 == 0) return FALSE; - } - } - } + return (m_ParamNames.Index(par, FALSE) != wxNOT_FOUND); } - - wxString wxHtmlTag::GetParam(const wxString& par, bool with_commas) const { - const wxChar *st = m_Params, *p = par; - const wxChar *st2, *p2; - const wxChar invalid = wxT('\1'); - bool comma; - wxChar comma_char; - - if (*st == 0) return wxEmptyString; - if (*p == 0) return wxEmptyString; - for (st2 = st, p2 = p; ; st2++) + int index = m_ParamNames.Index(par, FALSE); + if (index == wxNOT_FOUND) + return 
wxEmptyString; + if (with_commas) { - if (*p2 == 0 && *st2 == wxT('=')) // found - { - wxString fnd = wxEmptyString; - st2++; // '=' character - comma = FALSE; - comma_char = wxT('\0'); - if (!with_commas && (*(st2) == wxT('"'))) - { - st2++; - comma = TRUE; - comma_char = wxT('"'); - } - else if (!with_commas && (*(st2) == wxT('\''))) - { - st2++; - comma = TRUE; - comma_char = wxT('\''); - } - - while (*st2 != 0) - { - if (comma && *st2 == comma_char) comma = FALSE; - else if ((*st2 == wxT(' ')) && (!comma)) break; - fnd += (*(st2++)); - } - if (!with_commas && (*(st2-1) == comma_char)) fnd.RemoveLast(); - return fnd; - } - if (*st2 == 0) return wxEmptyString; - if (*p2 != *st2) p2 = &invalid; - if (*p2 == *st2) p2++; - if (*st2 == wxT(' ')) p2 = p; - else if (*st2 == wxT('=')) - { - p2 = p; - while (*st2 != wxT(' ')) - { - if (*st2 == wxT('"')) - { - st2++; - while (*st2 != wxT('"')) st2++; - } - else if (*st2 == wxT('\'')) - { - st2++; - while (*st2 != wxT('\'')) st2++; - } - st2++; - } - } + // VS: backward compatibility, seems to be never used by wxHTML... 
+ wxString s; + s << wxT('"') << m_ParamValues[index] << wxT('"'); + return s; } + else + return m_ParamValues[index]; } - - int wxHtmlTag::ScanParam(const wxString& par, wxChar *format, void *param) const { wxString parval = GetParam(par); return wxSscanf(parval, format, param); } +wxString wxHtmlTag::GetAllParams() const +{ + // VS: this function is for backward compatiblity only, + // never used by wxHTML + wxString s; + size_t cnt = m_ParamNames.GetCount(); + for (size_t i = 0; i < cnt; i++) + { + s << m_ParamNames[i]; + s << wxT('='); + if (m_ParamValues[i].Find(wxT('"')) != wxNOT_FOUND) + s << wxT('\'') << m_ParamValues[i] << wxT('\''); + else + s << wxT('"') << m_ParamValues[i] << wxT('"'); + } + return s; +} + #endif diff --git a/src/html/winpars.cpp b/src/html/winpars.cpp index 121b76e3aa..e90c35f723 100644 --- a/src/html/winpars.cpp +++ b/src/html/winpars.cpp @@ -197,10 +197,12 @@ void wxHtmlWinParser::AddText(const char* txt) char temp[wxHTML_BUFLEN]; register char d; int templen = 0; - + if (m_tmpLastWasSpace) { - while ((i < lng) && ((txt[i] == '\n') || (txt[i] == '\r') || (txt[i] == ' ') || (txt[i] == '\t'))) i++; + while ((i < lng) && + ((txt[i] == '\n') || (txt[i] == '\r') || (txt[i] == ' ') || + (txt[i] == '\t'))) i++; } while (i < lng) @@ -210,7 +212,8 @@ void wxHtmlWinParser::AddText(const char* txt) if ((d == '\n') || (d == '\r') || (d == ' ') || (d == '\t')) { i++, x++; - while ((i < lng) && ((txt[i] == '\n') || (txt[i] == '\r') || (txt[i] == ' ') || (txt[i] == '\t'))) i++, x++; + while ((i < lng) && ((txt[i] == '\n') || (txt[i] == '\r') || + (txt[i] == ' ') || (txt[i] == '\t'))) i++, x++; } else i++; @@ -219,9 +222,11 @@ void wxHtmlWinParser::AddText(const char* txt) temp[templen-1] = ' '; temp[templen] = 0; templen = 0; - if (m_EncConv) m_EncConv->Convert(temp); - c = new wxHtmlWordCell(temp, *(GetDC())); - if (m_UseLink) c->SetLink(m_Link); + if (m_EncConv) + m_EncConv->Convert(temp); + c = new 
wxHtmlWordCell(GetEntitiesParser()->Parse(temp), *(GetDC())); + if (m_UseLink) + c->SetLink(m_Link); m_Container->InsertCell(c); m_tmpLastWasSpace = TRUE; } @@ -229,9 +234,11 @@ void wxHtmlWinParser::AddText(const char* txt) if (templen) { temp[templen] = 0; - if (m_EncConv) m_EncConv->Convert(temp); - c = new wxHtmlWordCell(temp, *(GetDC())); - if (m_UseLink) c->SetLink(m_Link); + if (m_EncConv) + m_EncConv->Convert(temp); + c = new wxHtmlWordCell(GetEntitiesParser()->Parse(temp), *(GetDC())); + if (m_UseLink) + c->SetLink(m_Link); m_Container->InsertCell(c); m_tmpLastWasSpace = FALSE; } @@ -333,7 +340,11 @@ void wxHtmlWinParser::SetFontFace(const wxString& face) void wxHtmlWinParser::SetInputEncoding(wxFontEncoding enc) { m_InputEnc = m_OutputEnc = wxFONTENCODING_DEFAULT; - if (m_EncConv) {delete m_EncConv; m_EncConv = NULL;} + if (m_EncConv) + { + delete m_EncConv; + m_EncConv = NULL; + } if (enc == wxFONTENCODING_DEFAULT) return; @@ -363,6 +374,10 @@ void wxHtmlWinParser::SetInputEncoding(wxFontEncoding enc) m_OutputEnc = wxFONTENCODING_DEFAULT; m_InputEnc = enc; + if (m_OutputEnc == wxFONTENCODING_DEFAULT) + GetEntitiesParser()->SetEncoding(wxFONTENCODING_SYSTEM); + else + GetEntitiesParser()->SetEncoding(m_OutputEnc); if (m_InputEnc == m_OutputEnc) return; -- 2.45.2