1 ///////////////////////////////////////////////////////////////////////////// 
   2 // Name:        wx/html/htmlpars.h 
   3 // Purpose:     wxHtmlParser class (generic parser) 
   4 // Author:      Vaclav Slavik 
   6 // Copyright:   (c) 1999 Vaclav Slavik 
   7 // Licence:     wxWindows licence 
   8 ///////////////////////////////////////////////////////////////////////////// 
  10 #ifndef _WX_HTMLPARS_H_ 
  11 #define _WX_HTMLPARS_H_ 
  16 #include "wx/html/htmltag.h" 
  17 #include "wx/filesys.h" 
  18 #include "wx/hashmap.h" 
  19 #include "wx/hashset.h" 
  20 #include "wx/vector.h" 
  21 #include "wx/fontenc.h" 
// Forward declarations of classes from the base and HTML libraries.
  23 class WXDLLIMPEXP_FWD_BASE wxMBConv
; 
  24 class WXDLLIMPEXP_FWD_HTML wxHtmlParser
; 
  25 class WXDLLIMPEXP_FWD_HTML wxHtmlTagHandler
; 
  26 class WXDLLIMPEXP_FWD_HTML wxHtmlEntitiesParser
; 
// Forward-declared only: wxHtmlParser keeps pointers to these
// (m_TextPieces and m_SavedStates).
  28 class wxHtmlTextPieces
; 
  29 class wxHtmlParserState
; 
// Hash set of wxHtmlTagHandler pointers, hashed and compared with
// wxPointerHash / wxPointerEqual.
// NOTE(review): the argument naming the set type appears to be missing from
// this listing (wxHtmlParser uses wxHtmlTagHandlersSet) -- confirm against
// the original header.
  31 WX_DECLARE_HASH_SET_WITH_DECL_PTR(wxHtmlTagHandler
*, 
  32                                   ::wxPointerHash
, ::wxPointerEqual
, 
  34                                   class WXDLLIMPEXP_HTML
); 
// String-keyed hash map to wxHtmlTagHandler pointers, declared under the
// name wxHtmlTagHandlersHash.
  35 WX_DECLARE_STRING_HASH_MAP_WITH_DECL(wxHtmlTagHandler
*, 
  36                                      wxHtmlTagHandlersHash
, 
  37                                      class WXDLLIMPEXP_HTML
); 
  47 // This class handles generic parsing of HTML document : it scans 
  48 // the document and divides it into blocks of tags (where one block 
  49 // consists of starting and ending tag and of text between these two tags).
// NOTE(review): this listing appears to have lost some lines of the class
// (e.g. its braces and access specifiers) -- confirm against the original
// header before relying on it.
  51 class WXDLLIMPEXP_HTML wxHtmlParser 
: public wxObject
 
  53     DECLARE_ABSTRACT_CLASS(wxHtmlParser
) 
  57     virtual ~wxHtmlParser(); 
  59     // Sets the class which will be used for opening files 
  60     void SetFS(wxFileSystem 
*fs
) { m_FS 
= fs
; } 
         // Returns the file system object previously set with SetFS().
  62     wxFileSystem
* GetFS() const { return m_FS
; } 
  64     // Opens file if the parser is allowed to open given URL (may be forbidden 
  65     // for security reasons) 
  66     virtual wxFSFile 
*OpenURL(wxHtmlURLType type
, const wxString
& url
) const; 
  68     // You can simply call this method when you need parsed output. 
  69     // This method does these things: 
  70     // 1. call InitParser(source); 
  71     // 2. call DoParsing(); 
  72     // 3. call GetProduct(); (its return value is then returned) 
  73     // 4. call DoneParser(); 
  74     wxObject
* Parse(const wxString
& source
); 
  76     // Sets the source. This must be called before running Parse() method. 
  77     virtual void InitParser(const wxString
& source
); 
  78     // This must be called after Parse(). 
  79     virtual void DoneParser(); 
  81     // May be called during parsing to immediately return from Parse(). 
  82     virtual void StopParsing() { m_stopParsing 
= true; } 
  84     // Parses the m_Source from begin_pos to end_pos-1. 
  85     // (in noparams version it parses whole m_Source) 
  86     void DoParsing(const wxString::const_iterator
& begin_pos
, 
  87                    const wxString::const_iterator
& end_pos
); 
  90     // Returns pointer to the tag at parser's current position 
  91     wxHtmlTag 
*GetCurrentTag() const { return m_CurTag
; } 
  93     // Returns product of parsing 
  94     // Returned value is result of parsing of the part. The type of this result 
  95     // depends on internal representation in derived parser 
  96     // (see wxHtmlWinParser for details). 
  97     virtual wxObject
* GetProduct() = 0; 
  99     // Adds handler to the set & hash table of handlers. 
 100     virtual void AddTagHandler(wxHtmlTagHandler 
*handler
); 
 102     // Forces the handler to handle additional tags (not returned by GetSupportedTags). 
 103     // The handler should already be in use by this parser. 
 104     // Example: you want to parse the following pseudo-html structure: 
         //   <myitems>
 106     //     <it name="one" value="1"> 
 107     //     <it name="two" value="2"> 
         //   </myitems>
 109     //   <it> This last it has different meaning, we don't want it to be parsed by myitems handler! 
 110     // handler can handle only 'myitems' (e.g. its GetSupportedTags returns "MYITEMS") 
 111     // you can call PushTagHandler(handler, "IT") when you find <myitems> 
 112     // and call PopTagHandler() when you find </myitems> 
 113     void PushTagHandler(wxHtmlTagHandler 
*handler
, const wxString
& tags
); 
 115     // Restores state before last call to PushTagHandler 
 116     void PopTagHandler(); 
         // Returns the pointer to the source string stored in m_Source.
 118     const wxString
* GetSource() {return m_Source
;} 
 119     void SetSource(const wxString
& src
); 
 121     // Sets HTML source and remembers current parser's state so that it can 
 122     // later be restored. This is useful for on-line modifications of 
 123     // HTML source (for example, <pre> handler replaces spaces with &nbsp; 
 124     // and newlines with <br>) 
 125     virtual void SetSourceAndSaveState(const wxString
& src
); 
 126     // Restores parser's state from stack or returns false if the stack is empty.
 128     virtual bool RestoreState(); 
 130     // Returns HTML source inside the element (i.e. between the starting 
         // and ending tag).
 132     wxString 
GetInnerSource(const wxHtmlTag
& tag
); 
 134     // Parses HTML string 'markup' and extracts charset info from <meta> tag 
 135     // if present. Returns empty string if the tag is missing. 
 136     // For wxHTML's internal use. 
 137     static wxString 
ExtractCharsetInformation(const wxString
& markup
); 
 139     // Returns entity parser object, used to substitute HTML &entities; 
 140     wxHtmlEntitiesParser 
*GetEntitiesParser() const { return m_entitiesParser
; } 
 142     // Returns true if the tag starting at the given position is a comment tag 
 144     // p should point to '<' character and is modified to point to the closing 
 145     // '>' of the end comment tag if this is indeed a comment 
 147     SkipCommentTag(wxString::const_iterator
& p
, wxString::const_iterator end
); 
 151     void CreateDOMTree(); 
 152     void DestroyDOMTree(); 
 153     void CreateDOMSubTree(wxHtmlTag 
*cur
, 
 154                           const wxString::const_iterator
& begin_pos
, 
 155                           const wxString::const_iterator
& end_pos
, 
 156                           wxHtmlTagsCache 
*cache
); 
 158     // Adds text to the output. 
 159     // This is called from Parse() and must be overridden in derived classes. 
 160     // txt is not guaranteed to be only one word. It is largest continuous part 
 161     // of text (= not broken by tags) 
 162     virtual void AddText(const wxString
& txt
) = 0; 
 164     // Adds tag and processes it. Parse() may be (and usually is) called from this method. 
 165     // This is called from Parse() and may be overridden. 
 166     // Default behaviour is that it looks for proper handler in m_Handlers. The tag is 
 167     // ignored if no handler is found. 
 168     // Derived class is *responsible* for filling in m_Handlers table. 
 169     virtual void AddTag(const wxHtmlTag
& tag
); 
 175     wxHtmlTextPieces 
*m_TextPieces
; 
 176     size_t m_CurTextPiece
; 
         // HTML source being parsed (see GetSource()).
 178     const wxString 
*m_Source
; 
 180     wxHtmlParserState 
*m_SavedStates
; 
 182     // handlers that handle particular tags. The table is accessed by 
         // tag name.
 184     // This attribute MUST be filled by derived class otherwise it would 
 185     // be empty and no tags would be recognized 
 186     // (see wxHtmlWinParser for details about filling it) 
 187     // m_HandlersHash is for random access based on knowledge of tag name (BR, P, etc.) 
 188     //      it may (and often does) contain more references to one object 
 189     // m_HandlersSet is the set of all handlers and it is guaranteed to contain 
 190     //      only one reference to each handler instance. 
 191     wxHtmlTagHandlersSet m_HandlersSet
; 
 192     wxHtmlTagHandlersHash m_HandlersHash
; 
 194     wxDECLARE_NO_COPY_CLASS(wxHtmlParser
); 
 196     // class for opening files (file system) 
 198     // handlers stack used by PushTagHandler and PopTagHandler 
 199     wxVector
<wxHtmlTagHandlersHash
*> m_HandlersStack
; 
         // entity parser returned by GetEntitiesParser()
 202     wxHtmlEntitiesParser 
*m_entitiesParser
; 
 204     // flag indicating that the parser should stop 
 210 // This class (and derived classes) cooperates with wxHtmlParser. 
 211 // Each recognized tag is passed to handler which is capable 
 212 // of handling it. Each tag is handled in 3 steps: 
 213 // 1. Handler modifies state of parser 
 214 //    (using its public methods) 
 215 // 2. Parser parses source between starting and ending tag 
 216 // 3. Handler restores original state of the parser 
 217 class WXDLLIMPEXP_HTML wxHtmlTagHandler 
: public wxObject
 
 219     DECLARE_ABSTRACT_CLASS(wxHtmlTagHandler
) 
         // Creates a handler that is not yet attached to any parser
         // (m_Parser is NULL until SetParser() is called).
 222     wxHtmlTagHandler() : wxObject () { m_Parser 
= NULL
; } 
         // Sets the parser this handler works with.
 225     // NOTE : each _instance_ of handler is guaranteed to be called 
 226     // only by one parser. This means you don't have to care about 
         // the instance being shared between parsers.
 228     virtual void SetParser(wxHtmlParser 
*parser
) 
 229         { m_Parser 
= parser
; } 
 231     // Returns list of supported tags. The list is in uppercase and 
 232     // tags are delimited by ','. 
 233     // Example : "I,B,FONT,P" 
 234     //   is capable of handling italic, bold, font and paragraph tags 
 235     virtual wxString 
GetSupportedTags() = 0; 
 237     // This is the core handling method. It does all the Steps 1-3. 
 238     // To process step 2, you can call ParseInner() 
 239     // returned value : true if it called ParseInner(), 
         //                  false otherwise
 241     virtual bool HandleTag(const wxHtmlTag
& tag
) = 0; 
 244     // parses input between beginning and ending tag. 
 245     // m_Parser must be set. 
 246     void ParseInner(const wxHtmlTag
& tag
) 
 247         { m_Parser
->DoParsing(tag
.GetBeginIter(), tag
.GetEndIter1()); } 
 249     // Parses given source as if it was tag's inner code (see 
 250     // wxHtmlParser::GetInnerSource).  Unlike ParseInner(), this method lets 
 251     // you specify the source code to parse. This is useful when you need to 
 252     // modify the inner text before parsing. 
 253     void ParseInnerSource(const wxString
& source
); 
         // Parser this handler is attached to (set by SetParser()).
 255     wxHtmlParser 
*m_Parser
; 
 257     wxDECLARE_NO_COPY_CLASS(wxHtmlTagHandler
); 
 261 // This class is used to parse HTML entities in strings. It can handle 
 262 // both named entities and &#xxxx entries where xxxx is Unicode code. 
 263 class WXDLLIMPEXP_HTML wxHtmlEntitiesParser 
: public wxObject
 
 265     DECLARE_DYNAMIC_CLASS(wxHtmlEntitiesParser
) 
 268     wxHtmlEntitiesParser(); 
 269     virtual ~wxHtmlEntitiesParser(); 
 271     // Sets encoding of output string. 
 272     // Has no effect if wxUSE_UNICODE==1 
         // NOTE(review): the two SetEncoding() declarations below are presumably
         // alternatives selected by a preprocessor conditional on wxUSE_UNICODE
         // that is not visible in this listing -- confirm.
 274     void SetEncoding(wxFontEncoding 
WXUNUSED(encoding
)) {} 
 276     void SetEncoding(wxFontEncoding encoding
); 
 279     // Parses entities in input and replaces them with respective characters 
 280     // (with respect to output encoding) 
 281     wxString 
Parse(const wxString
& input
) const; 
 283     // Returns character for given entity or 0 if the entity is unknown 
 284     wxChar 
GetEntityChar(const wxString
& entity
) const; 
 286     // Returns character that represents given Unicode code 
         // NOTE(review): as with SetEncoding(), the inline version (a plain cast
         // to wxChar) and the out-of-line version are presumably build-dependent
         // alternatives -- confirm.
 288     wxChar 
GetCharForCode(unsigned code
) const { return (wxChar
)code
; } 
 290     wxChar 
GetCharForCode(unsigned code
) const; 
         // Output encoding (see SetEncoding()).
 296     wxFontEncoding m_encoding
; 
 299     wxDECLARE_NO_COPY_CLASS(wxHtmlEntitiesParser
); 
 305 #endif // _WX_HTMLPARS_H_