| 1 | ///////////////////////////////////////////////////////////////////////////// |
| 2 | // Name: htmlpars.h |
| 3 | // Purpose: wxHtmlParser class (generic parser) |
| 4 | // Author: Vaclav Slavik |
| 5 | // RCS-ID: $Id$ |
| 6 | // Copyright: (c) 1999 Vaclav Slavik |
| 7 | // Licence: wxWindows licence |
| 8 | ///////////////////////////////////////////////////////////////////////////// |
| 9 | |
| 10 | #ifndef _WX_HTMLPARS_H_ |
| 11 | #define _WX_HTMLPARS_H_ |
| 12 | |
| 13 | #include "wx/defs.h" |
| 14 | #if wxUSE_HTML |
| 15 | |
| 16 | #include "wx/html/htmltag.h" |
| 17 | #include "wx/filesys.h" |
| 18 | #include "wx/hash.h" |
| 19 | #include "wx/fontenc.h" |
| 20 | |
| 21 | class WXDLLIMPEXP_BASE wxMBConv; |
| 22 | class WXDLLIMPEXP_HTML wxHtmlParser; |
| 23 | class WXDLLIMPEXP_HTML wxHtmlTagHandler; |
| 24 | class WXDLLIMPEXP_HTML wxHtmlEntitiesParser; |
| 25 | |
| 26 | class wxHtmlTextPieces; |
| 27 | class wxHtmlParserState; |
| 28 | |
| 29 | |
// Category of an URL, passed to wxHtmlParser::OpenURL() together with the
// URL itself. The enumerator names give the category: a page, an image, or
// anything else.
enum wxHtmlURLType
{
    wxHTML_URL_PAGE  = 0,
    wxHTML_URL_IMAGE = 1,
    wxHTML_URL_OTHER = 2
};
| 36 | |
| 37 | // This class handles generic parsing of HTML document : it scans |
| 38 | // the document and divide it into blocks of tags (where one block |
| 39 | // consists of starting and ending tag and of text between these |
| 40 | // 2 tags. |
| 41 | class WXDLLIMPEXP_HTML wxHtmlParser : public wxObject |
| 42 | { |
| 43 | DECLARE_ABSTRACT_CLASS(wxHtmlParser) |
| 44 | |
| 45 | public: |
| 46 | wxHtmlParser(); |
| 47 | virtual ~wxHtmlParser(); |
| 48 | |
| 49 | // Sets the class which will be used for opening files |
| 50 | void SetFS(wxFileSystem *fs) { m_FS = fs; } |
| 51 | |
| 52 | wxFileSystem* GetFS() const { return m_FS; } |
| 53 | |
| 54 | // Opens file if the parser is allowed to open given URL (may be forbidden |
| 55 | // for security reasons) |
| 56 | virtual wxFSFile *OpenURL(wxHtmlURLType type, const wxString& url) const; |
| 57 | |
| 58 | // You can simply call this method when you need parsed output. |
| 59 | // This method does these things: |
| 60 | // 1. call InitParser(source); |
| 61 | // 2. call DoParsing(); |
| 62 | // 3. call GetProduct(); (it's return value is then returned) |
| 63 | // 4. call DoneParser(); |
| 64 | wxObject* Parse(const wxString& source); |
| 65 | |
| 66 | // Sets the source. This must be called before running Parse() method. |
| 67 | virtual void InitParser(const wxString& source); |
| 68 | // This must be called after Parse(). |
| 69 | virtual void DoneParser(); |
| 70 | |
| 71 | // May be called during parsing to immediately return from Parse(). |
| 72 | virtual void StopParsing() { m_stopParsing = true; } |
| 73 | |
| 74 | // Parses the m_Source from begin_pos to end_pos-1. |
| 75 | // (in noparams version it parses whole m_Source) |
| 76 | void DoParsing(int begin_pos, int end_pos); |
| 77 | void DoParsing(); |
| 78 | |
| 79 | // Returns pointer to the tag at parser's current position |
| 80 | wxHtmlTag *GetCurrentTag() const { return m_CurTag; } |
| 81 | |
| 82 | // Returns product of parsing |
| 83 | // Returned value is result of parsing of the part. The type of this result |
| 84 | // depends on internal representation in derived parser |
| 85 | // (see wxHtmlWinParser for details). |
| 86 | virtual wxObject* GetProduct() = 0; |
| 87 | |
| 88 | // adds handler to the list & hash table of handlers. |
| 89 | virtual void AddTagHandler(wxHtmlTagHandler *handler); |
| 90 | |
| 91 | // Forces the handler to handle additional tags (not returned by GetSupportedTags). |
| 92 | // The handler should already be in use by this parser. |
| 93 | // Example: you want to parse following pseudo-html structure: |
| 94 | // <myitems> |
| 95 | // <it name="one" value="1"> |
| 96 | // <it name="two" value="2"> |
| 97 | // </myitems> |
| 98 | // <it> This last it has different meaning, we don't want it to be parsed by myitems handler! |
| 99 | // handler can handle only 'myitems' (e.g. it's GetSupportedTags returns "MYITEMS") |
| 100 | // you can call PushTagHandler(handler, "IT") when you find <myitems> |
| 101 | // and call PopTagHandler() when you find </myitems> |
| 102 | void PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags); |
| 103 | |
| 104 | // Restores state before last call to PushTagHandler |
| 105 | void PopTagHandler(); |
| 106 | |
| 107 | wxString* GetSource() {return &m_Source;} |
| 108 | void SetSource(const wxString& src); |
| 109 | |
| 110 | // Sets HTML source and remebers current parser's state so that it can |
| 111 | // later be restored. This is useful for on-line modifications of |
| 112 | // HTML source (for example, <pre> handler replaces spaces with |
| 113 | // and newlines with <br>) |
| 114 | virtual void SetSourceAndSaveState(const wxString& src); |
| 115 | // Restores parser's state from stack or returns false if the stack is |
| 116 | // empty |
| 117 | virtual bool RestoreState(); |
| 118 | |
| 119 | // Parses HTML string 'markup' and extracts charset info from <meta> tag |
| 120 | // if present. Returns empty string if the tag is missing. |
| 121 | // For wxHTML's internal use. |
| 122 | static wxString ExtractCharsetInformation(const wxString& markup); |
| 123 | |
| 124 | // Returns entity parser object, used to substitute HTML &entities; |
| 125 | wxHtmlEntitiesParser *GetEntitiesParser() const { return m_entitiesParser; } |
| 126 | |
| 127 | protected: |
| 128 | // DOM structure |
| 129 | void CreateDOMTree(); |
| 130 | void DestroyDOMTree(); |
| 131 | void CreateDOMSubTree(wxHtmlTag *cur, |
| 132 | int begin_pos, int end_pos, |
| 133 | wxHtmlTagsCache *cache); |
| 134 | |
| 135 | // Adds text to the output. |
| 136 | // This is called from Parse() and must be overriden in derived classes. |
| 137 | // txt is not guaranteed to be only one word. It is largest continuous part of text |
| 138 | // (= not broken by tags) |
| 139 | // NOTE : using char* because of speed improvements |
| 140 | virtual void AddText(const wxChar* txt) = 0; |
| 141 | |
| 142 | // Adds tag and proceeds it. Parse() may (and usually is) called from this method. |
| 143 | // This is called from Parse() and may be overriden. |
| 144 | // Default behavior is that it looks for proper handler in m_Handlers. The tag is |
| 145 | // ignored if no hander is found. |
| 146 | // Derived class is *responsible* for filling in m_Handlers table. |
| 147 | virtual void AddTag(const wxHtmlTag& tag); |
| 148 | |
| 149 | protected: |
| 150 | // DOM tree: |
| 151 | wxHtmlTag *m_CurTag; |
| 152 | wxHtmlTag *m_Tags; |
| 153 | wxHtmlTextPieces *m_TextPieces; |
| 154 | size_t m_CurTextPiece; |
| 155 | |
| 156 | wxString m_Source; |
| 157 | |
| 158 | wxHtmlParserState *m_SavedStates; |
| 159 | |
| 160 | // handlers that handle particular tags. The table is accessed by |
| 161 | // key = tag's name. |
| 162 | // This attribute MUST be filled by derived class otherwise it would |
| 163 | // be empty and no tags would be recognized |
| 164 | // (see wxHtmlWinParser for details about filling it) |
| 165 | // m_HandlersHash is for random access based on knowledge of tag name (BR, P, etc.) |
| 166 | // it may (and often does) contain more references to one object |
| 167 | // m_HandlersList is list of all handlers and it is guaranteed to contain |
| 168 | // only one reference to each handler instance. |
| 169 | wxList m_HandlersList; |
| 170 | wxHashTable m_HandlersHash; |
| 171 | |
| 172 | DECLARE_NO_COPY_CLASS(wxHtmlParser) |
| 173 | |
| 174 | // class for opening files (file system) |
| 175 | wxFileSystem *m_FS; |
| 176 | // handlers stack used by PushTagHandler and PopTagHandler |
| 177 | wxList *m_HandlersStack; |
| 178 | |
| 179 | // entity parse |
| 180 | wxHtmlEntitiesParser *m_entitiesParser; |
| 181 | |
| 182 | // flag indicating that the parser should stop |
| 183 | bool m_stopParsing; |
| 184 | }; |
| 185 | |
| 186 | |
| 187 | |
| 188 | // This class (and derived classes) cooperates with wxHtmlParser. |
| 189 | // Each recognized tag is passed to handler which is capable |
| 190 | // of handling it. Each tag is handled in 3 steps: |
| 191 | // 1. Handler will modifies state of parser |
| 192 | // (using it's public methods) |
| 193 | // 2. Parser parses source between starting and ending tag |
| 194 | // 3. Handler restores original state of the parser |
| 195 | class WXDLLIMPEXP_HTML wxHtmlTagHandler : public wxObject |
| 196 | { |
| 197 | DECLARE_ABSTRACT_CLASS(wxHtmlTagHandler) |
| 198 | |
| 199 | public: |
| 200 | wxHtmlTagHandler() : wxObject () { m_Parser = NULL; } |
| 201 | |
| 202 | // Sets the parser. |
| 203 | // NOTE : each _instance_ of handler is guaranteed to be called |
| 204 | // only by one parser. This means you don't have to care about |
| 205 | // reentrancy. |
| 206 | virtual void SetParser(wxHtmlParser *parser) |
| 207 | { m_Parser = parser; } |
| 208 | |
| 209 | // Returns list of supported tags. The list is in uppercase and |
| 210 | // tags are delimited by ','. |
| 211 | // Example : "I,B,FONT,P" |
| 212 | // is capable of handling italic, bold, font and paragraph tags |
| 213 | virtual wxString GetSupportedTags() = 0; |
| 214 | |
| 215 | // This is hadling core method. It does all the Steps 1-3. |
| 216 | // To process step 2, you can call ParseInner() |
| 217 | // returned value : true if it called ParseInner(), |
| 218 | // false etherwise |
| 219 | virtual bool HandleTag(const wxHtmlTag& tag) = 0; |
| 220 | |
| 221 | protected: |
| 222 | // parses input between beginning and ending tag. |
| 223 | // m_Parser must be set. |
| 224 | void ParseInner(const wxHtmlTag& tag) |
| 225 | { m_Parser->DoParsing(tag.GetBeginPos(), tag.GetEndPos1()); } |
| 226 | |
| 227 | wxHtmlParser *m_Parser; |
| 228 | |
| 229 | DECLARE_NO_COPY_CLASS(wxHtmlTagHandler) |
| 230 | }; |
| 231 | |
| 232 | |
| 233 | // This class is used to parse HTML entities in strings. It can handle |
| 234 | // both named entities and &#xxxx entries where xxxx is Unicode code. |
| 235 | class WXDLLIMPEXP_HTML wxHtmlEntitiesParser : public wxObject |
| 236 | { |
| 237 | DECLARE_DYNAMIC_CLASS(wxHtmlEntitiesParser) |
| 238 | |
| 239 | public: |
| 240 | wxHtmlEntitiesParser(); |
| 241 | virtual ~wxHtmlEntitiesParser(); |
| 242 | |
| 243 | // Sets encoding of output string. |
| 244 | // Has no effect if wxUSE_WCHAR_T==0 or wxUSE_UNICODE==1 |
| 245 | void SetEncoding(wxFontEncoding encoding); |
| 246 | |
| 247 | // Parses entities in input and replaces them with respective characters |
| 248 | // (with respect to output encoding) |
| 249 | wxString Parse(const wxString& input); |
| 250 | |
| 251 | // Returns character for given entity or 0 if the enity is unknown |
| 252 | wxChar GetEntityChar(const wxString& entity); |
| 253 | |
| 254 | // Returns character that represents given Unicode code |
| 255 | #if wxUSE_UNICODE |
| 256 | wxChar GetCharForCode(unsigned code) { return (wxChar)code; } |
| 257 | #else |
| 258 | wxChar GetCharForCode(unsigned code); |
| 259 | #endif |
| 260 | |
| 261 | protected: |
| 262 | #if wxUSE_WCHAR_T && !wxUSE_UNICODE |
| 263 | wxMBConv *m_conv; |
| 264 | wxFontEncoding m_encoding; |
| 265 | #endif |
| 266 | |
| 267 | DECLARE_NO_COPY_CLASS(wxHtmlEntitiesParser) |
| 268 | }; |
| 269 | |
| 270 | |
| 271 | #endif |
| 272 | |
| 273 | #endif // _WX_HTMLPARS_H_ |