tests/benchmarks/htmlparser/htmltag.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmltag.cpp
   3 // Purpose:     wx28HtmlTag class (represents single tag)
   4 // Author:      Vaclav Slavik
   5 // Copyright:   (c) 1999 Vaclav Slavik
   6 // Licence:     wxWindows licence
   7 /////////////////////////////////////////////////////////////////////////////
   8
   9 #include "wx/wxprec.h"
  10
  11 #ifdef __BORLANDC__
  12     #pragma hdrstop
  13 #endif
  14
  15 #include "htmltag.h"
  16
  17 #include "htmlpars.h"
  18 #include <stdio.h> // for vsscanf
  19 #include <stdarg.h>
  20
  21
  22 //-----------------------------------------------------------------------------
  23 // wx28HtmlTagsCache
  24 //-----------------------------------------------------------------------------
  25
  26 struct wx28HtmlCacheItem
  27 {
  28     // this is "pos" value passed to wx28HtmlTag's constructor.
  29     // it is position of '<' character of the tag
  30     int Key;
  31
  32     // end positions for the tag:
  33     // end1 is '<' of ending tag,
  34     // end2 is '>' or both are
  35     // -1 if there is no ending tag for this one...
  36     // or -2 if this is ending tag  </...>
  37     int End1, End2;
  38
  39     // name of this tag
  40     wxChar *Name;
  41 };
  42
  43
  44 IMPLEMENT_CLASS(wx28HtmlTagsCache,wxObject)
  45
  46 #define CACHE_INCREMENT  64
  47
  48 bool wxIsCDATAElement(const wxChar *tag)
  49 {
  50     return (wxStrcmp(tag, wxT("SCRIPT")) == 0) ||
  51            (wxStrcmp(tag, wxT("STYLE")) == 0);
  52 }
  53
  54 wx28HtmlTagsCache::wx28HtmlTagsCache(const wxString& source)
  55 {
  56     const wxChar *src = source.c_str();
  57     int lng = source.length();
  58     wxChar tagBuffer[256];
  59
  60     m_Cache = NULL;
  61     m_CacheSize = 0;
  62     m_CachePos = 0;
  63
  64     int pos = 0;
  65     while (pos < lng)
  66     {
  67         if (src[pos] == wxT('<'))   // tag found:
  68         {
  69             if (m_CacheSize % CACHE_INCREMENT == 0)
  70                 m_Cache = (wx28HtmlCacheItem*) realloc(m_Cache, (m_CacheSize + CACHE_INCREMENT) * sizeof(wx28HtmlCacheItem));
  71             int tg = m_CacheSize++;
  72             int stpos = pos++;
  73             m_Cache[tg].Key = stpos;
  74
  75             int i;
  76             for ( i = 0;
  77                   pos < lng && i < (int)WXSIZEOF(tagBuffer) - 1 &&
  78                   src[pos] != wxT('>') && !wxIsspace(src[pos]);
  79                   i++, pos++ )
  80             {
  81                 tagBuffer[i] = (wxChar)wxToupper(src[pos]);
  82             }
  83             tagBuffer[i] = wxT('\0');
  84
  85             m_Cache[tg].Name = new wxChar[i+1];
  86             memcpy(m_Cache[tg].Name, tagBuffer, (i+1)*sizeof(wxChar));
  87
  88             while (pos < lng && src[pos] != wxT('>')) pos++;
  89
  90             if (src[stpos+1] == wxT('/')) // ending tag:
  91             {
  92                 m_Cache[tg].End1 = m_Cache[tg].End2 = -2;
  93                 // find matching begin tag:
  94                 for (i = tg; i >= 0; i--)
  95                     if ((m_Cache[i].End1 == -1) && (wxStrcmp(m_Cache[i].Name, tagBuffer+1) == 0))
  96                     {
  97                         m_Cache[i].End1 = stpos;
  98                         m_Cache[i].End2 = pos + 1;
  99                         break;
 100                     }
 101             }
 102             else
 103             {
 104                 m_Cache[tg].End1 = m_Cache[tg].End2 = -1;
 105
 106                 if (wxIsCDATAElement(tagBuffer))
 107                 {
 108                     // store the orig pos in case we are missing the closing
 109                     // tag (see below)
 110                     wxInt32 old_pos = pos;
 111                     bool foundCloseTag = false;
 112
 113                     // find next matching tag
 114                     int tag_len = wxStrlen(tagBuffer);
 115                     while (pos < lng)
 116                     {
 117                         // find the ending tag
 118                         while (pos + 1 < lng &&
 119                                (src[pos] != '<' || src[pos+1] != '/'))
 120                             ++pos;
 121                         if (src[pos] == '<')
 122                             ++pos;
 123
 124                         // see if it matches
 125                         int match_pos = 0;
 126                         while (pos < lng && match_pos < tag_len && src[pos] != '>' && src[pos] != '<') {
 127                             // cast to wxChar needed to suppress warning in
 128                             // Unicode build
 129                             if ((wxChar)wxToupper(src[pos]) == tagBuffer[match_pos]) {
 130                                 ++match_pos;
 131                             }
 132                             else if (src[pos] == wxT(' ') || src[pos] == wxT('\n') ||
 133                                 src[pos] == wxT('\r') || src[pos] == wxT('\t')) {
 134                                 // need to skip over these
 135                             }
 136                             else {
 137                                 match_pos = 0;
 138                             }
 139                             ++pos;
 140                         }
 141
 142                         // found a match
 143                         if (match_pos == tag_len)
 144                         {
 145                             pos = pos - tag_len - 3;
 146                             foundCloseTag = true;
 147                             break;
 148                         }
 149                         else // keep looking for the closing tag
 150                         {
 151                             ++pos;
 152                         }
 153                     }
 154                     if (!foundCloseTag)
 155                     {
 156                         // we didn't find closing tag; this means the markup
 157                         // is incorrect and the best thing we can do is to
 158                         // ignore the unclosed tag and continue parsing as if
 159                         // it didn't exist:
 160                         pos = old_pos;
 161                     }
 162                 }
 163             }
 164         }
 165
 166         pos++;
 167     }
 168
 169     // ok, we're done, now we'll free .Name members of cache - we don't need it anymore:
 170     for (int i = 0; i < m_CacheSize; i++)
 171     {
 172         delete[] m_Cache[i].Name;
 173         m_Cache[i].Name = NULL;
 174     }
 175 }
 176
 177 void wx28HtmlTagsCache::QueryTag(int at, int* end1, int* end2)
 178 {
 179     if (m_Cache == NULL) return;
 180     if (m_Cache[m_CachePos].Key != at)
 181     {
 182         int delta = (at < m_Cache[m_CachePos].Key) ? -1 : 1;
 183         do
 184         {
 185             if ( m_CachePos < 0 || m_CachePos == m_CacheSize )
 186             {
 187                 // something is very wrong with HTML, give up by returning an
 188                 // impossibly large value which is going to be ignored by the
 189                 // caller
 190                 *end1 =
 191                 *end2 = INT_MAX;
 192                 return;
 193             }
 194
 195             m_CachePos += delta;
 196         }
 197         while (m_Cache[m_CachePos].Key != at);
 198     }
 199     *end1 = m_Cache[m_CachePos].End1;
 200     *end2 = m_Cache[m_CachePos].End2;
 201 }
 202
 203
 204
 205
 206 //-----------------------------------------------------------------------------
 207 // wx28HtmlTag
 208 //-----------------------------------------------------------------------------
 209
 210 IMPLEMENT_CLASS(wx28HtmlTag,wxObject)
 211
 212 wx28HtmlTag::wx28HtmlTag(wx28HtmlTag *parent,
 213                      const wxString& source, int pos, int end_pos,
 214                      wx28HtmlTagsCache *cache,
 215                      wx28HtmlEntitiesParser *entParser) : wxObject()
 216 {
 217     /* Setup DOM relations */
 218
 219     m_Next = NULL;
 220     m_FirstChild = m_LastChild = NULL;
 221     m_Parent = parent;
 222     if (parent)
 223     {
 224         m_Prev = m_Parent->m_LastChild;
 225         if (m_Prev == NULL)
 226             m_Parent->m_FirstChild = this;
 227         else
 228             m_Prev->m_Next = this;
 229         m_Parent->m_LastChild = this;
 230     }
 231     else
 232         m_Prev = NULL;
 233
 234     /* Find parameters and their values: */
 235
 236     int i;
 237     wxChar c;
 238
 239     // fill-in name, params and begin pos:
 240     i = pos+1;
 241
 242     // find tag's name and convert it to uppercase:
 243     while ((i < end_pos) &&
 244            ((c = source[i++]) != wxT(' ') && c != wxT('\r') &&
 245              c != wxT('\n') && c != wxT('\t') &&
 246              c != wxT('>')))
 247     {
 248         if ((c >= wxT('a')) && (c <= wxT('z')))
 249             c -= (wxT('a') - wxT('A'));
 250         m_Name << c;
 251     }
 252
 253     // if the tag has parameters, read them and "normalize" them,
 254     // i.e. convert to uppercase, replace whitespaces by spaces and
 255     // remove whitespaces around '=':
 256     if (source[i-1] != wxT('>'))
 257     {
 258         #define IS_WHITE(c) (c == wxT(' ') || c == wxT('\r') || \
 259                              c == wxT('\n') || c == wxT('\t'))
 260         wxString pname, pvalue;
 261         wxChar quote;
 262         enum
 263         {
 264             ST_BEFORE_NAME = 1,
 265             ST_NAME,
 266             ST_BEFORE_EQ,
 267             ST_BEFORE_VALUE,
 268             ST_VALUE
 269         } state;
 270
 271         quote = 0;
 272         state = ST_BEFORE_NAME;
 273         while (i < end_pos)
 274         {
 275             c = source[i++];
 276
 277             if (c == wxT('>') && !(state == ST_VALUE && quote != 0))
 278             {
 279                 if (state == ST_BEFORE_EQ || state == ST_NAME)
 280                 {
 281                     m_ParamNames.Add(pname);
 282                     m_ParamValues.Add(wxEmptyString);
 283                 }
 284                 else if (state == ST_VALUE && quote == 0)
 285                 {
 286                     m_ParamNames.Add(pname);
 287                     if (entParser)
 288                         m_ParamValues.Add(entParser->Parse(pvalue));
 289                     else
 290                         m_ParamValues.Add(pvalue);
 291                 }
 292                 break;
 293             }
 294             switch (state)
 295             {
 296                 case ST_BEFORE_NAME:
 297                     if (!IS_WHITE(c))
 298                     {
 299                         pname = c;
 300                         state = ST_NAME;
 301                     }
 302                     break;
 303                 case ST_NAME:
 304                     if (IS_WHITE(c))
 305                         state = ST_BEFORE_EQ;
 306                     else if (c == wxT('='))
 307                         state = ST_BEFORE_VALUE;
 308                     else
 309                         pname << c;
 310                     break;
 311                 case ST_BEFORE_EQ:
 312                     if (c == wxT('='))
 313                         state = ST_BEFORE_VALUE;
 314                     else if (!IS_WHITE(c))
 315                     {
 316                         m_ParamNames.Add(pname);
 317                         m_ParamValues.Add(wxEmptyString);
 318                         pname = c;
 319                         state = ST_NAME;
 320                     }
 321                     break;
 322                 case ST_BEFORE_VALUE:
 323                     if (!IS_WHITE(c))
 324                     {
 325                         if (c == wxT('"') || c == wxT('\''))
 326                             quote = c, pvalue = wxEmptyString;
 327                         else
 328                             quote = 0, pvalue = c;
 329                         state = ST_VALUE;
 330                     }
 331                     break;
 332                 case ST_VALUE:
 333                     if ((quote != 0 && c == quote) ||
 334                         (quote == 0 && IS_WHITE(c)))
 335                     {
 336                         m_ParamNames.Add(pname);
 337                         if (quote == 0)
 338                         {
 339                             // VS: backward compatibility, no real reason,
 340                             //     but wxHTML code relies on this... :(
 341                             pvalue.MakeUpper();
 342                         }
 343                         if (entParser)
 344                             m_ParamValues.Add(entParser->Parse(pvalue));
 345                         else
 346                             m_ParamValues.Add(pvalue);
 347                         state = ST_BEFORE_NAME;
 348                     }
 349                     else
 350                         pvalue << c;
 351                     break;
 352             }
 353         }
 354
 355         #undef IS_WHITE
 356     }
 357     m_Begin = i;
 358
 359     cache->QueryTag(pos, &m_End1, &m_End2);
 360     if (m_End1 > end_pos) m_End1 = end_pos;
 361     if (m_End2 > end_pos) m_End2 = end_pos;
 362 }
 363
 364 wx28HtmlTag::~wx28HtmlTag()
 365 {
 366     wx28HtmlTag *t1, *t2;
 367     t1 = m_FirstChild;
 368     while (t1)
 369     {
 370         t2 = t1->GetNextSibling();
 371         delete t1;
 372         t1 = t2;
 373     }
 374 }
 375
 376 bool wx28HtmlTag::HasParam(const wxString& par) const
 377 {
 378     return (m_ParamNames.Index(par, false) != wxNOT_FOUND);
 379 }
 380
 381 wxString wx28HtmlTag::GetParam(const wxString& par, bool with_commas) const
 382 {
 383     int index = m_ParamNames.Index(par, false);
 384     if (index == wxNOT_FOUND)
 385         return wxEmptyString;
 386     if (with_commas)
 387     {
 388         // VS: backward compatibility, seems to be never used by wxHTML...
 389         wxString s;
 390         s << wxT('"') << m_ParamValues[index] << wxT('"');
 391         return s;
 392     }
 393     else
 394         return m_ParamValues[index];
 395 }
 396
 397 int wx28HtmlTag::ScanParam(const wxString& par,
 398                          const wxChar *format,
 399                          void *param) const
 400 {
 401     wxString parval = GetParam(par);
 402     return wxSscanf(parval, format, param);
 403 }
 404
 405 bool wx28HtmlTag::GetParamAsInt(const wxString& par, int *clr) const
 406 {
 407     if ( !HasParam(par) )
 408         return false;
 409
 410     long i;
 411     if ( !GetParam(par).ToLong(&i) )
 412         return false;
 413
 414     *clr = (int)i;
 415     return true;
 416 }
 417
 418 wxString wx28HtmlTag::GetAllParams() const
 419 {
 420     // VS: this function is for backward compatibility only,
 421     //     never used by wxHTML
 422     wxString s;
 423     size_t cnt = m_ParamNames.GetCount();
 424     for (size_t i = 0; i < cnt; i++)
 425     {
 426         s << m_ParamNames[i];
 427         s << wxT('=');
 428         if (m_ParamValues[i].Find(wxT('"')) != wxNOT_FOUND)
 429             s << wxT('\'') << m_ParamValues[i] << wxT('\'');
 430         else
 431             s << wxT('"') << m_ParamValues[i] << wxT('"');
 432     }
 433     return s;
 434 }
 435
 436 wx28HtmlTag *wx28HtmlTag::GetFirstSibling() const
 437 {
 438     if (m_Parent)
 439         return m_Parent->m_FirstChild;
 440     else
 441     {
 442         wx28HtmlTag *cur = (wx28HtmlTag*)this;
 443         while (cur->m_Prev)
 444             cur = cur->m_Prev;
 445         return cur;
 446     }
 447 }
 448
 449 wx28HtmlTag *wx28HtmlTag::GetLastSibling() const
 450 {
 451     if (m_Parent)
 452         return m_Parent->m_LastChild;
 453     else
 454     {
 455         wx28HtmlTag *cur = (wx28HtmlTag*)this;
 456         while (cur->m_Next)
 457             cur = cur->m_Next;
 458         return cur;
 459     }
 460 }
 461
 462 wx28HtmlTag *wx28HtmlTag::GetNextTag() const
 463 {
 464     if (m_FirstChild) return m_FirstChild;
 465     if (m_Next) return m_Next;
 466     wx28HtmlTag *cur = m_Parent;
 467     if (!cur) return NULL;
 468     while (cur->m_Parent && !cur->m_Next)
 469         cur = cur->m_Parent;
 470     return cur->m_Next;
 471 }