include/wx/html/htmlpars.h

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        htmlpars.h
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #ifndef _WX_HTMLPARS_H_
  11 #define _WX_HTMLPARS_H_
  12
  13 #include "wx/defs.h"
  14 #if wxUSE_HTML
  15
  16 #include "wx/html/htmltag.h"
  17 #include "wx/filesys.h"
  18 #include "wx/hashmap.h"
  19 #include "wx/hashset.h"
  20 #include "wx/vector.h"
  21 #include "wx/fontenc.h"
  22 // Forward declarations of classes that this header refers to by pointer
  23 class WXDLLIMPEXP_FWD_BASE wxMBConv;
  24 class WXDLLIMPEXP_FWD_HTML wxHtmlParser;
  25 class WXDLLIMPEXP_FWD_HTML wxHtmlTagHandler;
  26 class WXDLLIMPEXP_FWD_HTML wxHtmlEntitiesParser;
  27 // The next two carry no DLL import/export decoration (presumably implementation details)
  28 class wxHtmlTextPieces;
  29 class wxHtmlParserState;
  30 // Handler containers: a hash set of handler pointers and a string-keyed map to handler pointers
  31 WX_DECLARE_HASH_SET_WITH_DECL(wxHtmlTagHandler*,
  32                               wxPointerHash, wxPointerEqual,
  33                               wxHtmlTagHandlersSet,
  34                               class WXDLLIMPEXP_HTML);
  35 WX_DECLARE_STRING_HASH_MAP_WITH_DECL(wxHtmlTagHandler*,
  36                                      wxHtmlTagHandlersHash,
  37                                      class WXDLLIMPEXP_HTML);
  38
  39 // Category of URL passed as 'type' to wxHtmlParser::OpenURL() (page, image or other, per the names)
  40 enum wxHtmlURLType
  41 {
  42     wxHTML_URL_PAGE,
  43     wxHTML_URL_IMAGE,
  44     wxHTML_URL_OTHER
  45 };
  46
  47 // This class handles generic parsing of an HTML document: it scans
  48 // the document and divides it into blocks of tags (where one block
  49 // consists of a starting and an ending tag and of the text between
  50 // these 2 tags).
  51 class WXDLLIMPEXP_HTML wxHtmlParser : public wxObject
  52 {
  53     DECLARE_ABSTRACT_CLASS(wxHtmlParser)
  54
  55 public:
  56     wxHtmlParser();
  57     virtual ~wxHtmlParser();
  58
  59     // Sets the class which will be used for opening files
  60     void SetFS(wxFileSystem *fs) { m_FS = fs; }
  61     // Returns the file system object stored in m_FS (see SetFS())
  62     wxFileSystem* GetFS() const { return m_FS; }
  63
  64     // Opens file if the parser is allowed to open given URL (may be forbidden
  65     // for security reasons)
  66     virtual wxFSFile *OpenURL(wxHtmlURLType type, const wxString& url) const;
  67
  68     // You can simply call this method when you need parsed output.
  69     // This method does these things:
  70     // 1. call InitParser(source);
  71     // 2. call DoParsing();
  72     // 3. call GetProduct(); (its return value is then returned)
  73     // 4. call DoneParser();
  74     wxObject* Parse(const wxString& source);
  75
  76     // Sets the source. Must be called before DoParsing() (Parse() does this itself).
  77     virtual void InitParser(const wxString& source);
  78     // Must be called after DoParsing() has finished (Parse() does this itself).
  79     virtual void DoneParser();
  80
  81     // May be called during parsing to immediately return from Parse().
  82     virtual void StopParsing() { m_stopParsing = true; }
  83
  84     // Parses the m_Source from begin_pos to end_pos-1.
  85     // (in noparams version it parses whole m_Source)
  86     void DoParsing(const wxString::const_iterator& begin_pos,
  87                    const wxString::const_iterator& end_pos);
  88     void DoParsing();
  89
  90     // Returns pointer to the tag at parser's current position
  91     wxHtmlTag *GetCurrentTag() const { return m_CurTag; }
  92
  93     // Returns product of parsing
  94     // Returned value is result of parsing of the part. The type of this result
  95     // depends on internal representation in derived parser
  96     // (see wxHtmlWinParser for details).
  97     virtual wxObject* GetProduct() = 0;
  98
  99     // Adds handler to the set & hash table of handlers.
 100     virtual void AddTagHandler(wxHtmlTagHandler *handler);
 101
 102     // Forces the handler to handle additional tags (not returned by GetSupportedTags).
 103     // The handler should already be in use by this parser.
 104     // Example: you want to parse following pseudo-html structure:
 105     //   <myitems>
 106     //     <it name="one" value="1">
 107     //     <it name="two" value="2">
 108     //   </myitems>
 109     //   <it> This last it has different meaning, we don't want it to be parsed by myitems handler!
 110     // handler can handle only 'myitems' (e.g. its GetSupportedTags returns "MYITEMS")
 111     // you can call PushTagHandler(handler, "IT") when you find <myitems>
 112     // and call PopTagHandler() when you find </myitems>
 113     void PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags);
 114
 115     // Restores state before last call to PushTagHandler
 116     void PopTagHandler();
 117     // Returns the m_Source pointer. NOTE(review): not const-qualified, unlike the other getters.
 118     const wxString* GetSource() {return m_Source;}
 119     void SetSource(const wxString& src);
 120
 121     // Sets HTML source and remembers current parser's state so that it can
 122     // later be restored. This is useful for on-line modifications of
 123     // HTML source (for example, <pre> handler replaces spaces with &nbsp;
 124     // and newlines with <br>)
 125     virtual void SetSourceAndSaveState(const wxString& src);
 126     // Restores parser's state from stack or returns false if the stack is
 127     // empty
 128     virtual bool RestoreState();
 129
 130     // Returns HTML source inside the element (i.e. between the starting
 131     // and ending tag)
 132     wxString GetInnerSource(const wxHtmlTag& tag);
 133
 134     // Parses HTML string 'markup' and extracts charset info from <meta> tag
 135     // if present. Returns empty string if the tag is missing.
 136     // For wxHTML's internal use.
 137     static wxString ExtractCharsetInformation(const wxString& markup);
 138
 139     // Returns entity parser object, used to substitute HTML &entities;
 140     wxHtmlEntitiesParser *GetEntitiesParser() const { return m_entitiesParser; }
 141
 142     // Returns true if the tag starting at the given position is a comment tag
 143     //
 144     // p should point to '<' character and is modified to point to the closing
 145     // '>' of the end comment tag if this is indeed a comment
 146     static bool
 147     SkipCommentTag(wxString::const_iterator& p, wxString::const_iterator end);
 148
 149 protected:
 150     // DOM structure
 151     void CreateDOMTree();
 152     void DestroyDOMTree();
 153     void CreateDOMSubTree(wxHtmlTag *cur,
 154                           const wxString::const_iterator& begin_pos,
 155                           const wxString::const_iterator& end_pos,
 156                           wxHtmlTagsCache *cache);
 157
 158     // Adds text to the output.
 159     // This is called from Parse() and must be overridden in derived classes.
 160     // txt is not guaranteed to be only one word. It is largest continuous part
 161     // of text (= not broken by tags)
 162     virtual void AddText(const wxString& txt) = 0;
 163
 164     // Adds tag and processes it. Parse() may be (and usually is) called from this method.
 165     // This is called from Parse() and may be overridden.
 166     // Default behavior is that it looks for proper handler in m_Handlers. The tag is
 167     // ignored if no handler is found.
 168     // Derived class is *responsible* for filling in m_Handlers table.
 169     virtual void AddTag(const wxHtmlTag& tag);
 170
 171 protected:
 172     // DOM tree:
 173     wxHtmlTag *m_CurTag;
 174     wxHtmlTag *m_Tags;
 175     wxHtmlTextPieces *m_TextPieces;
 176     size_t m_CurTextPiece;
 177
 178     const wxString *m_Source;
 179
 180     wxHtmlParserState *m_SavedStates;
 181
 182     // handlers that handle particular tags. The table is accessed by
 183     // key = tag's name.
 184     // This attribute MUST be filled by derived class otherwise it would
 185     // be empty and no tags would be recognized
 186     // (see wxHtmlWinParser for details about filling it)
 187     // m_HandlersHash is for random access based on knowledge of tag name (BR, P, etc.)
 188     //      it may (and often does) contain more references to one object
 189     // m_HandlersSet is the set of all handlers and it is guaranteed to contain
 190     //      only one reference to each handler instance.
 191     wxHtmlTagHandlersSet m_HandlersSet;
 192     wxHtmlTagHandlersHash m_HandlersHash;
 193
 194     wxDECLARE_NO_COPY_CLASS(wxHtmlParser);
 195     // NOTE(review): members below may be private if the macro above ends in 'private:' -- confirm
 196     // class for opening files (file system)
 197     wxFileSystem *m_FS;
 198     // handlers stack used by PushTagHandler and PopTagHandler
 199     wxVector<wxHtmlTagHandlersHash*> m_HandlersStack;
 200
 201     // entity parser
 202     wxHtmlEntitiesParser *m_entitiesParser;
 203
 204     // flag indicating that the parser should stop
 205     bool m_stopParsing;
 206 };
 207
 208
 209
 210 // This class (and derived classes) cooperates with wxHtmlParser.
 211 // Each recognized tag is passed to handler which is capable
 212 // of handling it. Each tag is handled in 3 steps:
 213 // 1. Handler modifies the state of the parser
 214 //    (using its public methods)
 215 // 2. Parser parses source between starting and ending tag
 216 // 3. Handler restores original state of the parser
 217 class WXDLLIMPEXP_HTML wxHtmlTagHandler : public wxObject
 218 {
 219     DECLARE_ABSTRACT_CLASS(wxHtmlTagHandler)
 220
 221 public:
 222     wxHtmlTagHandler() : wxObject () { m_Parser = NULL; } // no parser attached yet
 223
 224     // Sets the parser.
 225     // NOTE : each _instance_ of handler is guaranteed to be called
 226     // only by one parser. This means you don't have to care about
 227     // reentrancy.
 228     virtual void SetParser(wxHtmlParser *parser)
 229         { m_Parser = parser; }
 230
 231     // Returns list of supported tags. The list is in uppercase and
 232     // tags are delimited by ','.
 233     // Example : "I,B,FONT,P"
 234     //   is capable of handling italic, bold, font and paragraph tags
 235     virtual wxString GetSupportedTags() = 0;
 236
 237     // This is the core handling method. It does all the Steps 1-3.
 238     // To process step 2, you can call ParseInner()
 239     // returned value : true if it called ParseInner(),
 240     //                  false otherwise
 241     virtual bool HandleTag(const wxHtmlTag& tag) = 0;
 242
 243 protected:
 244     // parses input between beginning and ending tag.
 245     // m_Parser must be set (it is dereferenced here without a NULL check).
 246     void ParseInner(const wxHtmlTag& tag)
 247         { m_Parser->DoParsing(tag.GetBeginIter(), tag.GetEndIter1()); }
 248
 249     // Parses given source as if it was tag's inner code (see
 250     // wxHtmlParser::GetInnerSource).  Unlike ParseInner(), this method lets
 251     // you specify the source code to parse. This is useful when you need to
 252     // modify the inner text before parsing.
 253     void ParseInnerSource(const wxString& source);
 254     // Parser this handler is attached to: NULL after construction, assigned by SetParser()
 255     wxHtmlParser *m_Parser;
 256
 257     wxDECLARE_NO_COPY_CLASS(wxHtmlTagHandler);
 258 };
 259
 260
 261 // This class is used to parse HTML entities in strings. It can handle
 262 // both named entities and &#xxxx entries where xxxx is Unicode code.
 263 class WXDLLIMPEXP_HTML wxHtmlEntitiesParser : public wxObject
 264 {
 265     DECLARE_DYNAMIC_CLASS(wxHtmlEntitiesParser)
 266
 267 public:
 268     wxHtmlEntitiesParser();
 269     virtual ~wxHtmlEntitiesParser();
 270
 271     // Sets encoding of output string.
 272     // Has no effect if wxUSE_UNICODE==1
 273 #if wxUSE_UNICODE
 274     void SetEncoding(wxFontEncoding WXUNUSED(encoding)) {}
 275 #else
 276     void SetEncoding(wxFontEncoding encoding);
 277 #endif
 278
 279     // Parses entities in input and replaces them with respective characters
 280     // (with respect to output encoding)
 281     wxString Parse(const wxString& input) const;
 282
 283     // Returns character for given entity or 0 if the entity is unknown
 284     wxChar GetEntityChar(const wxString& entity) const;
 285
 286     // Returns character that represents given Unicode code
 287 #if wxUSE_UNICODE
 288     wxChar GetCharForCode(unsigned code) const { return (wxChar)code; } // plain cast, no conversion
 289 #else
 290     wxChar GetCharForCode(unsigned code) const;
 291 #endif
 292
 293 protected:
 294 #if !wxUSE_UNICODE
 295     wxMBConv *m_conv;
 296     wxFontEncoding m_encoding;
 297 #endif
 298
 299     wxDECLARE_NO_COPY_CLASS(wxHtmlEntitiesParser);
 300 };
 301
 302
 303 #endif // wxUSE_HTML
 304
 305 #endif // _WX_HTMLPARS_H_