X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/04dbb6467be8f564f380bd9a1106fbdecbd26a98..50415ad7eebd3399fc06417a8eb427324978a13a:/src/html/htmlpars.cpp diff --git a/src/html/htmlpars.cpp b/src/html/htmlpars.cpp index 6a69f9e0b9..ff9c189223 100644 --- a/src/html/htmlpars.cpp +++ b/src/html/htmlpars.cpp @@ -1,29 +1,25 @@ ///////////////////////////////////////////////////////////////////////////// -// Name: htmlpars.cpp +// Name: src/html/htmlpars.cpp // Purpose: wxHtmlParser class (generic parser) // Author: Vaclav Slavik // RCS-ID: $Id$ // Copyright: (c) 1999 Vaclav Slavik -// Licence: wxWindows Licence +// Licence: wxWindows licence ///////////////////////////////////////////////////////////////////////////// +#include "wx/wxprec.h" -#ifdef __GNUG__ -#pragma implementation +#ifdef __BORLANDC__ + #pragma hdrstop #endif -#include "wx/wxprec.h" - -#include "wx/defs.h" #if wxUSE_HTML && wxUSE_STREAMS -#ifdef __BORDLANDC__ -#pragma hdrstop -#endif - #ifndef WXPRECOMP + #include "wx/dynarray.h" #include "wx/log.h" #include "wx/intl.h" + #include "wx/app.h" #endif #include "wx/tokenzr.h" @@ -32,8 +28,41 @@ #include "wx/fontmap.h" #include "wx/html/htmldefs.h" #include "wx/html/htmlpars.h" +#include "wx/arrimpl.cpp" +#ifdef __WXWINCE__ + #include "wx/msw/wince/missing.h" // for bsearch() +#endif + +// DLL options compatibility check: +WX_CHECK_BUILD_OPTIONS("wxHTML") +const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug"); + +//----------------------------------------------------------------------------- +// wxHtmlParser helpers +//----------------------------------------------------------------------------- + +class wxHtmlTextPiece +{ +public: + wxHtmlTextPiece(int pos, int lng) : m_pos(pos), m_lng(lng) {} + int m_pos, m_lng; +}; + +WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces); +WX_DEFINE_OBJARRAY(wxHtmlTextPieces) + +class wxHtmlParserState +{ +public: + wxHtmlTag *m_curTag; + wxHtmlTag *m_tags; + wxHtmlTextPieces *m_textPieces; + int m_curTextPiece; + wxString m_source; + wxHtmlParserState *m_nextState; +}; //----------------------------------------------------------------------------- // wxHtmlParser @@ -42,28 +71,41 @@ IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject) wxHtmlParser::wxHtmlParser() - : wxObject(), m_Cache(NULL), m_HandlersHash(wxKEY_STRING), + : wxObject(), m_HandlersHash(wxKEY_STRING), m_FS(NULL), m_HandlersStack(NULL) { m_entitiesParser = new wxHtmlEntitiesParser; + m_Tags = NULL; + m_CurTag = NULL; + m_TextPieces = NULL; + m_CurTextPiece = 0; + m_SavedStates = NULL; } wxHtmlParser::~wxHtmlParser() { + while (RestoreState()) {} + DestroyDOMTree(); + + if (m_HandlersStack) + { + wxList& tmp = *m_HandlersStack; + wxList::iterator it, en; + for( it = tmp.begin(), en = tmp.end(); it != en; ++it ) + delete (wxHashTable*)*it; + tmp.clear(); + } delete m_HandlersStack; m_HandlersHash.Clear(); - m_HandlersList.DeleteContents(TRUE); - m_HandlersList.Clear(); + WX_CLEAR_LIST(wxList, m_HandlersList); delete m_entitiesParser; } wxObject* wxHtmlParser::Parse(const wxString& source) { - wxObject *result; - InitParser(source); DoParsing(); - result = GetProduct(); + wxObject *result = GetProduct(); DoneParser(); return result; } @@ -71,76 +113,204 @@ wxObject* wxHtmlParser::Parse(const wxString& source) void wxHtmlParser::InitParser(const wxString& source) { SetSource(source); + m_stopParsing = false; } void wxHtmlParser::DoneParser() { - delete m_Cache; - m_Cache = NULL; + DestroyDOMTree(); } void wxHtmlParser::SetSource(const wxString& src) { + DestroyDOMTree(); m_Source = src; - delete m_Cache; - m_Cache = new wxHtmlTagsCache(m_Source); + CreateDOMTree(); + m_CurTag = NULL; + m_CurTextPiece = 0; } -void wxHtmlParser::DoParsing(int begin_pos, int end_pos) +void wxHtmlParser::CreateDOMTree() +{ + wxHtmlTagsCache cache(m_Source); + m_TextPieces = new wxHtmlTextPieces; + CreateDOMSubTree(NULL, 0, m_Source.length(), &cache); + m_CurTextPiece = 0; +} + +extern bool wxIsCDATAElement(const wxChar *tag); + +void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur, + int begin_pos, int end_pos, + wxHtmlTagsCache *cache) { if (end_pos <= begin_pos) return; - char c; - char *temp = new char[end_pos - begin_pos + 1]; - int i; - int templen; + wxChar c; + int i = begin_pos; + int textBeginning = begin_pos; - templen = 0; - i = begin_pos; + // If the tag contains CDATA text, we include the text between beginning + // and ending tag verbosely. Setting i=end_pos will skip to the very + // end of this function where text piece is added, bypassing any child + // tags parsing (CDATA element can't have child elements by definition): + if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str())) + { + i = end_pos; + } while (i < end_pos) { - c = m_Source[(unsigned int) i]; + c = m_Source.GetChar(i); - // continue building word: - if (c != '<') - { - temp[templen++] = c; - i++; - } + if (c == wxT('<')) + { + // add text to m_TextPieces: + if (i - textBeginning > 0) + m_TextPieces->Add( + wxHtmlTextPiece(textBeginning, i - textBeginning)); + + // if it is a comment, skip it: + wxString::const_iterator iter = m_Source.begin() + i; + if ( SkipCommentTag(iter, m_Source.end()) ) + { + textBeginning = + i = iter - m_Source.begin() + 1; // skip closing '>' too + } - else if (c == '<') - { - wxHtmlTag tag(m_Source, i, end_pos, m_Cache, m_entitiesParser); + // add another tag to the tree: + else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/')) + { + wxHtmlTag *chd; + if (cur) + chd = new wxHtmlTag(cur, m_Source, + i, end_pos, cache, m_entitiesParser); + else + { + chd = new wxHtmlTag(NULL, m_Source, + i, end_pos, cache, m_entitiesParser); + if (!m_Tags) + { + // if this is the first tag to be created make the root + // m_Tags point to it: + m_Tags = chd; + } + else + { + // if there is already a root tag add this tag as + // the last sibling: + chd->m_Prev = m_Tags->GetLastSibling(); + chd->m_Prev->m_Next = chd; + } + } + + if (chd->HasEnding()) + { + CreateDOMSubTree(chd, + chd->GetBeginPos(), chd->GetEndPos1(), + cache); + i = chd->GetEndPos2(); + } + else + i = chd->GetBeginPos(); + + textBeginning = i; + } - if (templen) - { - temp[templen] = 0; - AddText(temp); - templen = 0; + // ... or skip ending tag: + else + { + while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++; + textBeginning = i+1; } - AddTag(tag); - if (tag.HasEnding()) i = tag.GetEndPos2(); - else i = tag.GetBeginPos(); } + else i++; } - if (templen) - { // last word of block :-( - temp[templen] = 0; - AddText(temp); + // add remaining text to m_TextPieces: + if (end_pos - textBeginning > 0) + m_TextPieces->Add( + wxHtmlTextPiece(textBeginning, end_pos - textBeginning)); +} + +void wxHtmlParser::DestroyDOMTree() +{ + wxHtmlTag *t1, *t2; + t1 = m_Tags; + while (t1) + { + t2 = t1->GetNextSibling(); + delete t1; + t1 = t2; + } + m_Tags = m_CurTag = NULL; + + delete m_TextPieces; + m_TextPieces = NULL; +} + +void wxHtmlParser::DoParsing() +{ + m_CurTag = m_Tags; + m_CurTextPiece = 0; + DoParsing(0, m_Source.length()); +} + +void wxHtmlParser::DoParsing(int begin_pos, int end_pos) +{ + if (end_pos <= begin_pos) return; + + wxHtmlTextPieces& pieces = *m_TextPieces; + size_t piecesCnt = pieces.GetCount(); + + while (begin_pos < end_pos) + { + while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos) + m_CurTag = m_CurTag->GetNextTag(); + while (m_CurTextPiece < piecesCnt && + pieces[m_CurTextPiece].m_pos < begin_pos) + m_CurTextPiece++; + + if (m_CurTextPiece < piecesCnt && + (!m_CurTag || + pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos())) + { + // Add text: + AddText(GetEntitiesParser()->Parse( + m_Source.Mid(pieces[m_CurTextPiece].m_pos, + pieces[m_CurTextPiece].m_lng))); + begin_pos = pieces[m_CurTextPiece].m_pos + + pieces[m_CurTextPiece].m_lng; + m_CurTextPiece++; + } + else if (m_CurTag) + { + if (m_CurTag->HasEnding()) + begin_pos = m_CurTag->GetEndPos2(); + else + begin_pos = m_CurTag->GetBeginPos(); + wxHtmlTag *t = m_CurTag; + m_CurTag = m_CurTag->GetNextTag(); + AddTag(*t); + if (m_stopParsing) + return; + } + else break; } - delete[] temp; } void wxHtmlParser::AddTag(const wxHtmlTag& tag) { wxHtmlTagHandler *h; - bool inner = FALSE; + bool inner = false; h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName()); if (h) + { inner = h->HandleTag(tag); + if (m_stopParsing) + return; + } if (!inner) { if (tag.HasEnding()) @@ -151,10 +321,10 @@ void wxHtmlParser::AddTag(const wxHtmlTag& tag) void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler) { wxString s(handler->GetSupportedTags()); - wxStringTokenizer tokenizer(s, ", "); + wxStringTokenizer tokenizer(s, wxT(", ")); while (tokenizer.HasMoreTokens()) - m_HandlersHash.Put(tokenizer.NextToken(), handler); + m_HandlersHash.Put(tokenizer.GetNextToken(), handler); if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND) m_HandlersList.Append(handler); @@ -162,22 +332,21 @@ void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler) handler->SetParser(this); } -void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, wxString tags) +void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags) { - wxStringTokenizer tokenizer(tags, ", "); + wxStringTokenizer tokenizer(tags, wxT(", ")); wxString key; if (m_HandlersStack == NULL) { m_HandlersStack = new wxList; - m_HandlersStack->DeleteContents(TRUE); } - m_HandlersStack->Insert(new wxHashTable(m_HandlersHash)); + m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash)); while (tokenizer.HasMoreTokens()) { - key = tokenizer.NextToken(); + key = tokenizer.GetNextToken(); m_HandlersHash.Delete(key); m_HandlersHash.Put(key, handler); } @@ -185,16 +354,69 @@ void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, wxString tags) void wxHtmlParser::PopTagHandler() { - wxNode *first; - - if (m_HandlersStack == NULL || - (first = m_HandlersStack->GetFirst()) == NULL) + wxList::compatibility_iterator first; + + if ( !m_HandlersStack || +#if wxUSE_STL + !(first = m_HandlersStack->GetFirst()) +#else // !wxUSE_STL + ((first = m_HandlersStack->GetFirst()) == NULL) +#endif // wxUSE_STL/!wxUSE_STL + ) { wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack.")); return; } m_HandlersHash = *((wxHashTable*) first->GetData()); - m_HandlersStack->DeleteNode(first); + delete (wxHashTable*) first->GetData(); + m_HandlersStack->Erase(first); +} + +void wxHtmlParser::SetSourceAndSaveState(const wxString& src) +{ + wxHtmlParserState *s = new wxHtmlParserState; + + s->m_curTag = m_CurTag; + s->m_tags = m_Tags; + s->m_textPieces = m_TextPieces; + s->m_curTextPiece = m_CurTextPiece; + s->m_source = m_Source; + + s->m_nextState = m_SavedStates; + m_SavedStates = s; + + m_CurTag = NULL; + m_Tags = NULL; + m_TextPieces = NULL; + m_CurTextPiece = 0; + m_Source = wxEmptyString; + + SetSource(src); +} + +bool wxHtmlParser::RestoreState() +{ + if (!m_SavedStates) return false; + + DestroyDOMTree(); + + wxHtmlParserState *s = m_SavedStates; + m_SavedStates = s->m_nextState; + + m_CurTag = s->m_curTag; + m_Tags = s->m_tags; + m_TextPieces = s->m_textPieces; + m_CurTextPiece = s->m_curTextPiece; + m_Source = s->m_source; + + delete s; + return true; +} + +wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag) +{ + return GetSource()->Mid(tag.GetBeginPos(), + tag.GetEndPos1() - tag.GetBeginPos()); } //----------------------------------------------------------------------------- @@ -203,6 +425,15 @@ void wxHtmlParser::PopTagHandler() IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject) +void wxHtmlTagHandler::ParseInnerSource(const wxString& source) +{ + // It is safe to temporarily change the source being parsed, + // provided we restore the state back after parsing + m_Parser->SetSourceAndSaveState(source); + m_Parser->DoParsing(); + m_Parser->RestoreState(); +} + //----------------------------------------------------------------------------- // wxHtmlEntitiesParser @@ -219,18 +450,26 @@ wxHtmlEntitiesParser::wxHtmlEntitiesParser() wxHtmlEntitiesParser::~wxHtmlEntitiesParser() { +#if wxUSE_WCHAR_T && !wxUSE_UNICODE delete m_conv; +#endif } void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding) { #if wxUSE_WCHAR_T && !wxUSE_UNICODE - if (encoding == m_encoding) return; + if (encoding == m_encoding) + return; + delete m_conv; - m_conv = NULL; + m_encoding = encoding; - if (m_encoding != wxFONTENCODING_SYSTEM) + if (m_encoding == wxFONTENCODING_SYSTEM) + m_conv = NULL; + else m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding)); +#else + (void) encoding; #endif } @@ -240,23 +479,38 @@ wxString wxHtmlEntitiesParser::Parse(const wxString& input) const wxChar *in_str = input.c_str(); wxString output; + output.reserve(input.length()); + for (c = in_str, last = in_str; *c != wxT('\0'); c++) { if (*c == wxT('&')) { if (c - last > 0) output.append(last, c - last); - if (++c == wxT('\0')) break; + if ( *++c == wxT('\0') ) + break; + wxString entity; const wxChar *ent_s = c; + wxChar entity_char; + for (; (*c >= wxT('a') && *c <= wxT('z')) || (*c >= wxT('A') && *c <= wxT('Z')) || (*c >= wxT('0') && *c <= wxT('9')) || *c == wxT('_') || *c == wxT('#'); c++) {} entity.append(ent_s, c - ent_s); - if (*c == wxT(';')) c++; - output << GetEntityChar(entity); - last = c; + if (*c != wxT(';')) c--; + last = c+1; + entity_char = GetEntityChar(entity); + if (entity_char) + output << entity_char; + else + { + output.append(ent_s-1, c-ent_s+2); + wxLogTrace(wxTRACE_HTML_DEBUG, + wxT("Unrecognized HTML entity: '%s'"), + entity.c_str()); + } } } if (*last != wxT('\0')) @@ -270,28 +524,28 @@ struct wxHtmlEntityInfo unsigned code; }; -static int compar_entity(const void *key, const void *item) +extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item) { return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name); } +#if !wxUSE_UNICODE wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) { -#if wxUSE_UNICODE - return (wxChar)code; -#elif wxUSE_WCHAR_T +#if wxUSE_WCHAR_T char buf[2]; wchar_t wbuf[2]; wbuf[0] = (wchar_t)code; wbuf[1] = 0; wxMBConv *conv = m_conv ? m_conv : &wxConvLocal; - if (conv->WC2MB(buf, wbuf, 1) == (size_t)-1) + if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1) return '?'; return buf[0]; #else return (code < 256) ? (wxChar)code : '?'; #endif } +#endif wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) { @@ -576,19 +830,158 @@ wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) while (substitutions[substitutions_cnt].code != 0) substitutions_cnt++; - wxHtmlEntityInfo *info; + wxHtmlEntityInfo *info = NULL; +#ifdef __WXWINCE__ + // bsearch crashes under WinCE for some reason + size_t i; + for (i = 0; i < substitutions_cnt; i++) + { + if (entity == substitutions[i].name) + { + info = & substitutions[i]; + break; + } + } +#else info = (wxHtmlEntityInfo*) bsearch(entity.c_str(), substitutions, substitutions_cnt, sizeof(wxHtmlEntityInfo), - compar_entity); + wxHtmlEntityCompare); +#endif if (info) code = info->code; } if (code == 0) - return wxT('?'); + return 0; else return GetCharForCode(code); } -#endif +wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type), + const wxString& url) const +{ + return m_FS ? m_FS->OpenFile(url) : NULL; + +} + + +//----------------------------------------------------------------------------- +// wxHtmlParser::ExtractCharsetInformation +//----------------------------------------------------------------------------- + +class wxMetaTagParser : public wxHtmlParser +{ +public: + wxMetaTagParser() { } + + wxObject* GetProduct() { return NULL; } + +protected: + virtual void AddText(const wxChar* WXUNUSED(txt)) {} + + DECLARE_NO_COPY_CLASS(wxMetaTagParser) +}; + +class wxMetaTagHandler : public wxHtmlTagHandler +{ +public: + wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {} + wxString GetSupportedTags() { return wxT("META,BODY"); } + bool HandleTag(const wxHtmlTag& tag); + +private: + wxString *m_retval; + + DECLARE_NO_COPY_CLASS(wxMetaTagHandler) +}; + +bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag) +{ + if (tag.GetName() == _T("BODY")) + { + m_Parser->StopParsing(); + return false; + } + + if (tag.HasParam(_T("HTTP-EQUIV")) && + tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) && + tag.HasParam(_T("CONTENT"))) + { + wxString content = tag.GetParam(_T("CONTENT")).Lower(); + if (content.Left(19) == _T("text/html; charset=")) + { + *m_retval = content.Mid(19); + m_Parser->StopParsing(); + } + } + return false; +} + + +/*static*/ +wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup) +{ + wxString charset; + wxMetaTagParser *parser = new wxMetaTagParser(); + if(parser) + { + parser->AddTagHandler(new wxMetaTagHandler(&charset)); + parser->Parse(markup); + delete parser; + } + return charset; +} + +/* static */ +bool +wxHtmlParser::SkipCommentTag(wxString::const_iterator& start, + wxString::const_iterator end) +{ + wxASSERT_MSG( *start == '<', _T("should be called on the tag start") ); + + wxString::const_iterator p = start; + + // comments begin with "