X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/d699f48ba33c96c24f7ab3764ad98ded0633c1c5..adb4b1488a7b38ab9878d378bc87beb8c6e6d02f:/src/html/htmlpars.cpp diff --git a/src/html/htmlpars.cpp b/src/html/htmlpars.cpp index 64054fc2d2..efd6d9349e 100644 --- a/src/html/htmlpars.cpp +++ b/src/html/htmlpars.cpp @@ -9,7 +9,7 @@ #ifdef __GNUG__ -#pragma implementation +#pragma implementation "htmlpars.h" #endif #include "wx/wxprec.h" @@ -17,7 +17,7 @@ #include "wx/defs.h" #if wxUSE_HTML && wxUSE_STREAMS -#ifdef __BORDLANDC__ +#ifdef __BORLANDC__ #pragma hdrstop #endif @@ -80,8 +80,9 @@ wxHtmlParser::wxHtmlParser() wxHtmlParser::~wxHtmlParser() { - while (RestoreState()) - DestroyDOMTree(); + while (RestoreState()) {} + DestroyDOMTree(); + delete m_HandlersStack; m_HandlersHash.Clear(); m_HandlersList.DeleteContents(TRUE); @@ -91,11 +92,9 @@ wxHtmlParser::~wxHtmlParser() wxObject* wxHtmlParser::Parse(const wxString& source) { - wxObject *result; - InitParser(source); DoParsing(); - result = GetProduct(); + wxObject *result = GetProduct(); DoneParser(); return result; } @@ -103,6 +102,7 @@ wxObject* wxHtmlParser::Parse(const wxString& source) void wxHtmlParser::InitParser(const wxString& source) { SetSource(source); + m_stopParsing = FALSE; } void wxHtmlParser::DoneParser() @@ -127,6 +127,8 @@ void wxHtmlParser::CreateDOMTree() m_CurTextPiece = 0; } +extern bool wxIsCDATAElement(const wxChar *tag); + void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur, int begin_pos, int end_pos, wxHtmlTagsCache *cache) @@ -137,6 +139,15 @@ void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur, int i = begin_pos; int textBeginning = begin_pos; + // If the tag contains CDATA text, we include the text between beginning + // and ending tag verbosely. Setting i=end_pos will skip to the very + // end of this function where text piece is added, bypassing any child + // tags parsing (CDATA element can't have child elements by definition): + if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str())) + { + i = end_pos; + } + while (i < end_pos) { c = m_Source.GetChar(i); @@ -209,6 +220,7 @@ void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur, } else i = chd->GetBeginPos(); + textBeginning = i; } @@ -271,8 +283,9 @@ void wxHtmlParser::DoParsing(int begin_pos, int end_pos) pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos())) { // Add text: - AddText(m_Source.Mid(pieces[m_CurTextPiece].m_pos, - pieces[m_CurTextPiece].m_lng)); + AddText(GetEntitiesParser()->Parse( + m_Source.Mid(pieces[m_CurTextPiece].m_pos, + pieces[m_CurTextPiece].m_lng))); begin_pos = pieces[m_CurTextPiece].m_pos + pieces[m_CurTextPiece].m_lng; m_CurTextPiece++; @@ -290,6 +303,8 @@ void wxHtmlParser::DoParsing(int begin_pos, int end_pos) wxHtmlTag *t = m_CurTag; m_CurTag = m_CurTag->GetNextTag(); AddTag(*t); + if (m_stopParsing) + return; } else break; } @@ -302,7 +317,11 @@ void wxHtmlParser::AddTag(const wxHtmlTag& tag) h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName()); if (h) + { inner = h->HandleTag(tag); + if (m_stopParsing) + return; + } if (!inner) { if (tag.HasEnding()) @@ -316,7 +335,7 @@ void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler) wxStringTokenizer tokenizer(s, wxT(", ")); while (tokenizer.HasMoreTokens()) - m_HandlersHash.Put(tokenizer.NextToken(), handler); + m_HandlersHash.Put(tokenizer.GetNextToken(), handler); if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND) m_HandlersList.Append(handler); @@ -339,7 +358,7 @@ void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, wxString tags) while (tokenizer.HasMoreTokens()) { - key = tokenizer.NextToken(); + key = tokenizer.GetNextToken(); m_HandlersHash.Delete(key); m_HandlersHash.Put(key, handler); } @@ -385,6 +404,8 @@ bool wxHtmlParser::RestoreState() { if (!m_SavedStates) return FALSE; + DestroyDOMTree(); + wxHtmlParserState *s = m_SavedStates; m_SavedStates = s->m_nextState; @@ -428,11 +449,15 @@ wxHtmlEntitiesParser::~wxHtmlEntitiesParser() void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding) { #if wxUSE_WCHAR_T && !wxUSE_UNICODE - if (encoding == m_encoding) return; + if (encoding == m_encoding) + return; + delete m_conv; - m_conv = NULL; + m_encoding = encoding; - if (m_encoding != wxFONTENCODING_SYSTEM) + if (m_encoding == wxFONTENCODING_SYSTEM) + m_conv = NULL; + else m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding)); #else (void) encoding; @@ -444,6 +469,8 @@ wxString wxHtmlEntitiesParser::Parse(const wxString& input) const wxChar *c, *last; const wxChar *in_str = input.c_str(); wxString output; + + output.reserve(input.length()); for (c = in_str, last = in_str; *c != wxT('\0'); c++) { @@ -452,8 +479,11 @@ wxString wxHtmlEntitiesParser::Parse(const wxString& input) if (c - last > 0) output.append(last, c - last); if (++c == wxT('\0')) break; + wxString entity; const wxChar *ent_s = c; + wxChar entity_char; + for (; (*c >= wxT('a') && *c <= wxT('z')) || (*c >= wxT('A') && *c <= wxT('Z')) || (*c >= wxT('0') && *c <= wxT('9')) || @@ -461,7 +491,14 @@ wxString wxHtmlEntitiesParser::Parse(const wxString& input) entity.append(ent_s, c - ent_s); if (*c != wxT(';')) c--; last = c+1; - output << GetEntityChar(entity); + entity_char = GetEntityChar(entity); + if (entity_char) + output << entity_char; + else + { + output.append(ent_s-1, c-ent_s+2); + wxLogDebug(wxT("Unrecognized HTML entity: '%s'"), entity.c_str()); + } } } if (*last != wxT('\0')) @@ -475,16 +512,15 @@ struct wxHtmlEntityInfo unsigned code; }; -static int LINKAGEMODE compar_entity(const void *key, const void *item) +extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item) { return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name); } +#if !wxUSE_UNICODE wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) { -#if wxUSE_UNICODE - return (wxChar)code; -#elif wxUSE_WCHAR_T +#if wxUSE_WCHAR_T char buf[2]; wchar_t wbuf[2]; wbuf[0] = (wchar_t)code; @@ -497,6 +533,7 @@ wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) return (code < 256) ? (wxChar)code : '?'; #endif } +#endif wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) { @@ -785,15 +822,79 @@ wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) info = (wxHtmlEntityInfo*) bsearch(entity.c_str(), substitutions, substitutions_cnt, sizeof(wxHtmlEntityInfo), - compar_entity); + wxHtmlEntityCompare); if (info) code = info->code; } if (code == 0) - return wxT('?'); + return 0; else return GetCharForCode(code); } +wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type), + const wxString& url) const +{ + return GetFS()->OpenFile(url); +} + + +//----------------------------------------------------------------------------- +// wxHtmlParser::ExtractCharsetInformation +//----------------------------------------------------------------------------- + +class wxMetaTagParser : public wxHtmlParser +{ +public: + wxObject* GetProduct() { return NULL; } +protected: + virtual void AddText(const wxChar* WXUNUSED(txt)) {} +}; + +class wxMetaTagHandler : public wxHtmlTagHandler +{ +public: + wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {} + wxString GetSupportedTags() { return wxT("META,BODY"); } + bool HandleTag(const wxHtmlTag& tag); + +private: + wxString *m_retval; +}; + +bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag) +{ + if (tag.GetName() == _T("BODY")) + { + m_Parser->StopParsing(); + return FALSE; + } + + if (tag.HasParam(_T("HTTP-EQUIV")) && + tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) && + tag.HasParam(_T("CONTENT"))) + { + wxString content = tag.GetParam(_T("CONTENT")); + if (content.Left(19) == _T("text/html; charset=")) + { + *m_retval = content.Mid(19); + m_Parser->StopParsing(); + } + } + return FALSE; +} + + +/*static*/ +wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup) +{ + wxString charset; + wxMetaTagParser parser; + parser.AddTagHandler(new wxMetaTagHandler(&charset)); + parser.Parse(markup); + return charset; +} + + #endif