src/html/htmltag.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmltag.cpp
   3 // Purpose:     wxHtmlTag class (represents single tag)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML
  17
  18 #include "wx/html/htmltag.h"
  19
  20 #ifndef WX_PRECOMP
  21     #include "wx/colour.h"
  22     #include "wx/wxcrtvararg.h"
  23 #endif
  24
  25 #include "wx/html/htmlpars.h"
  26 #include "wx/html/styleparams.h"
  27
  28 #include "wx/vector.h"
  29
  30 #include <stdio.h> // for vsscanf
  31 #include <stdarg.h>
  32
  33 //-----------------------------------------------------------------------------
  34 // wxHtmlTagsCache
  35 //-----------------------------------------------------------------------------
  36
  37 struct wxHtmlCacheItem
  38 {
  39     // this is "pos" value passed to wxHtmlTag's constructor.
  40     // it is position of '<' character of the tag
  41     wxString::const_iterator Key;
  42
  43     // Tag type
  44     enum Type
  45     {
  46         Type_Normal, // normal tag with a matching ending tag
  47         Type_NoMatchingEndingTag, // there's no ending tag for this tag
  48         Type_EndingTag // this is ending tag </..>
  49     };
  50     Type type;
  51
  52     // end positions for the tag:
  53     // end1 is '<' of ending tag,
  54     // end2 is '>' or both are
  55     wxString::const_iterator End1, End2;
  56
  57     // name of this tag
  58     wxChar *Name;
  59 };
  60
  61 // NB: this is an empty class and not typedef because of forward declaration
  62 class wxHtmlTagsCacheData : public wxVector<wxHtmlCacheItem>
  63 {
  64 };
  65
  66 bool wxIsCDATAElement(const wxChar *tag)
  67 {
  68     return (wxStrcmp(tag, wxT("SCRIPT")) == 0) ||
  69            (wxStrcmp(tag, wxT("STYLE")) == 0);
  70 }
  71
  72 bool wxIsCDATAElement(const wxString& tag)
  73 {
  74     return (wxStrcmp(tag.wx_str(), wxS("SCRIPT")) == 0) ||
  75            (wxStrcmp(tag.wx_str(), wxS("STYLE")) == 0);
  76 }
  77
  78 wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source)
  79 {
  80     m_Cache = new wxHtmlTagsCacheData;
  81     m_CachePos = 0;
  82
  83     wxChar tagBuffer[256];
  84
  85     const wxString::const_iterator end = source.end();
  86     for ( wxString::const_iterator pos = source.begin(); pos < end; ++pos )
  87     {
  88         if (*pos == wxT('<'))   // tag found:
  89         {
  90             // don't cache comment tags
  91             if ( wxHtmlParser::SkipCommentTag(pos, source.end()) )
  92                 continue;
  93
  94             size_t tg = Cache().size();
  95             Cache().push_back(wxHtmlCacheItem());
  96
  97             wxString::const_iterator stpos = pos++;
  98             Cache()[tg].Key = stpos;
  99
 100             int i;
 101             for ( i = 0;
 102                   pos < end && i < (int)WXSIZEOF(tagBuffer) - 1 &&
 103                   *pos != wxT('>') && !wxIsspace(*pos);
 104                   ++i, ++pos )
 105             {
 106                 tagBuffer[i] = (wxChar)wxToupper(*pos);
 107             }
 108             tagBuffer[i] = wxT('\0');
 109
 110             Cache()[tg].Name = new wxChar[i+1];
 111             memcpy(Cache()[tg].Name, tagBuffer, (i+1)*sizeof(wxChar));
 112
 113             while (pos < end && *pos != wxT('>'))
 114                 ++pos;
 115
 116             if ((stpos+1) < end && *(stpos+1) == wxT('/')) // ending tag:
 117             {
 118                 Cache()[tg].type = wxHtmlCacheItem::Type_EndingTag;
 119                 // find matching begin tag:
 120                 for (i = tg; i >= 0; i--)
 121                 {
 122                     if ((Cache()[i].type == wxHtmlCacheItem::Type_NoMatchingEndingTag) && (wxStrcmp(Cache()[i].Name, tagBuffer+1) == 0))
 123                     {
 124                         Cache()[i].type = wxHtmlCacheItem::Type_Normal;
 125                         Cache()[i].End1 = stpos;
 126                         Cache()[i].End2 = pos + 1;
 127                         break;
 128                     }
 129                 }
 130             }
 131             else
 132             {
 133                 Cache()[tg].type = wxHtmlCacheItem::Type_NoMatchingEndingTag;
 134
 135                 if (wxIsCDATAElement(tagBuffer))
 136                 {
 137                     // store the orig pos in case we are missing the closing
 138                     // tag (see below)
 139                     const wxString::const_iterator old_pos = pos;
 140                     bool foundCloseTag = false;
 141
 142                     // find next matching tag
 143                     int tag_len = wxStrlen(tagBuffer);
 144                     while (pos < end)
 145                     {
 146                         // find the ending tag
 147                         while (pos + 1 < end &&
 148                                (*pos != '<' || *(pos+1) != '/'))
 149                             ++pos;
 150                         if (*pos == '<')
 151                             ++pos;
 152
 153                         // see if it matches
 154                         int match_pos = 0;
 155                         while (pos < end && match_pos < tag_len )
 156                         {
 157                             wxChar c = *pos;
 158                             if ( c == '>' || c == '<' )
 159                                 break;
 160
 161                             // cast to wxChar needed to suppress warning in
 162                             // Unicode build
 163                             if ((wxChar)wxToupper(c) == tagBuffer[match_pos])
 164                             {
 165                                 ++match_pos;
 166                             }
 167                             else if (c == wxT(' ') || c == wxT('\n') ||
 168                                 c == wxT('\r') || c == wxT('\t'))
 169                             {
 170                                 // need to skip over these
 171                             }
 172                             else
 173                             {
 174                                 match_pos = 0;
 175                             }
 176                             ++pos;
 177                         }
 178
 179                         // found a match
 180                         if (match_pos == tag_len)
 181                         {
 182                             pos = pos - tag_len - 3;
 183                             foundCloseTag = true;
 184                             break;
 185                         }
 186                         else // keep looking for the closing tag
 187                         {
 188                             ++pos;
 189                         }
 190                     }
 191                     if (!foundCloseTag)
 192                     {
 193                         // we didn't find closing tag; this means the markup
 194                         // is incorrect and the best thing we can do is to
 195                         // ignore the unclosed tag and continue parsing as if
 196                         // it didn't exist:
 197                         pos = old_pos;
 198                     }
 199                 }
 200             }
 201         }
 202     }
 203
 204     // ok, we're done, now we'll free .Name members of cache - we don't need it anymore:
 205     for ( wxHtmlTagsCacheData::iterator i = Cache().begin();
 206           i != Cache().end(); ++i )
 207     {
 208         wxDELETEA(i->Name);
 209     }
 210 }
 211
 212 wxHtmlTagsCache::~wxHtmlTagsCache()
 213 {
 214     delete m_Cache;
 215 }
 216
 217 void wxHtmlTagsCache::QueryTag(const wxString::const_iterator& at,
 218                                const wxString::const_iterator& inputEnd,
 219                                wxString::const_iterator *end1,
 220                                wxString::const_iterator *end2,
 221                                bool *hasEnding)
 222 {
 223     if (Cache().empty())
 224         return;
 225
 226     if (Cache()[m_CachePos].Key != at)
 227     {
 228         int delta = (at < Cache()[m_CachePos].Key) ? -1 : 1;
 229         do
 230         {
 231             m_CachePos += delta;
 232
 233             if ( m_CachePos < 0 || m_CachePos >= (int)Cache().size() )
 234             {
 235                 if ( m_CachePos < 0 )
 236                     m_CachePos = 0;
 237                 else
 238                     m_CachePos = Cache().size() - 1;
 239                 // something is very wrong with HTML, give up by returning an
 240                 // impossibly large value which is going to be ignored by the
 241                 // caller
 242                 *end1 =
 243                 *end2 = inputEnd;
 244                 *hasEnding = true;
 245                 return;
 246             }
 247         }
 248         while (Cache()[m_CachePos].Key != at);
 249     }
 250
 251     switch ( Cache()[m_CachePos].type )
 252     {
 253         case wxHtmlCacheItem::Type_Normal:
 254             *end1 = Cache()[m_CachePos].End1;
 255             *end2 = Cache()[m_CachePos].End2;
 256             *hasEnding = true;
 257             break;
 258
 259         case wxHtmlCacheItem::Type_EndingTag:
 260             wxFAIL_MSG("QueryTag called for ending tag - can't be");
 261             // but if it does happen, fall through, better than crashing
 262
 263         case wxHtmlCacheItem::Type_NoMatchingEndingTag:
 264             // If input HTML is invalid and there's no closing tag for this
 265             // one, pretend that it runs all the way to the end of input
 266             *end1 = inputEnd;
 267             *end2 = inputEnd;
 268             *hasEnding = false;
 269             break;
 270     }
 271 }
 272
 273
 274
 275
 276 //-----------------------------------------------------------------------------
 277 // wxHtmlTag
 278 //-----------------------------------------------------------------------------
 279
 280 wxHtmlTag::wxHtmlTag(wxHtmlTag *parent,
 281                      const wxString *source,
 282                      const wxString::const_iterator& pos,
 283                      const wxString::const_iterator& end_pos,
 284                      wxHtmlTagsCache *cache,
 285                      wxHtmlEntitiesParser *entParser)
 286 {
 287     /* Setup DOM relations */
 288
 289     m_Next = NULL;
 290     m_FirstChild = m_LastChild = NULL;
 291     m_Parent = parent;
 292     if (parent)
 293     {
 294         m_Prev = m_Parent->m_LastChild;
 295         if (m_Prev == NULL)
 296             m_Parent->m_FirstChild = this;
 297         else
 298             m_Prev->m_Next = this;
 299         m_Parent->m_LastChild = this;
 300     }
 301     else
 302         m_Prev = NULL;
 303
 304     /* Find parameters and their values: */
 305
 306     wxChar c wxDUMMY_INITIALIZE(0);
 307
 308     // fill-in name, params and begin pos:
 309     wxString::const_iterator i(pos+1);
 310
 311     // find tag's name and convert it to uppercase:
 312     while ((i < end_pos) &&
 313            ((c = *(i++)) != wxT(' ') && c != wxT('\r') &&
 314              c != wxT('\n') && c != wxT('\t') &&
 315              c != wxT('>') && c != wxT('/')))
 316     {
 317         if ((c >= wxT('a')) && (c <= wxT('z')))
 318             c -= (wxT('a') - wxT('A'));
 319         m_Name << c;
 320     }
 321
 322     // if the tag has parameters, read them and "normalize" them,
 323     // i.e. convert to uppercase, replace whitespaces by spaces and
 324     // remove whitespaces around '=':
 325     if (*(i-1) != wxT('>'))
 326     {
 327         #define IS_WHITE(c) (c == wxT(' ') || c == wxT('\r') || \
 328                              c == wxT('\n') || c == wxT('\t'))
 329         wxString pname, pvalue;
 330         wxChar quote;
 331         enum
 332         {
 333             ST_BEFORE_NAME = 1,
 334             ST_NAME,
 335             ST_BEFORE_EQ,
 336             ST_BEFORE_VALUE,
 337             ST_VALUE
 338         } state;
 339
 340         quote = 0;
 341         state = ST_BEFORE_NAME;
 342         while (i < end_pos)
 343         {
 344             c = *(i++);
 345
 346             if (c == wxT('>') && !(state == ST_VALUE && quote != 0))
 347             {
 348                 if (state == ST_BEFORE_EQ || state == ST_NAME)
 349                 {
 350                     m_ParamNames.Add(pname);
 351                     m_ParamValues.Add(wxGetEmptyString());
 352                 }
 353                 else if (state == ST_VALUE && quote == 0)
 354                 {
 355                     m_ParamNames.Add(pname);
 356                     if (entParser)
 357                         m_ParamValues.Add(entParser->Parse(pvalue));
 358                     else
 359                         m_ParamValues.Add(pvalue);
 360                 }
 361                 break;
 362             }
 363             switch (state)
 364             {
 365                 case ST_BEFORE_NAME:
 366                     if (!IS_WHITE(c))
 367                     {
 368                         pname = c;
 369                         state = ST_NAME;
 370                     }
 371                     break;
 372                 case ST_NAME:
 373                     if (IS_WHITE(c))
 374                         state = ST_BEFORE_EQ;
 375                     else if (c == wxT('='))
 376                         state = ST_BEFORE_VALUE;
 377                     else
 378                         pname << c;
 379                     break;
 380                 case ST_BEFORE_EQ:
 381                     if (c == wxT('='))
 382                         state = ST_BEFORE_VALUE;
 383                     else if (!IS_WHITE(c))
 384                     {
 385                         m_ParamNames.Add(pname);
 386                         m_ParamValues.Add(wxGetEmptyString());
 387                         pname = c;
 388                         state = ST_NAME;
 389                     }
 390                     break;
 391                 case ST_BEFORE_VALUE:
 392                     if (!IS_WHITE(c))
 393                     {
 394                         if (c == wxT('"') || c == wxT('\''))
 395                             quote = c, pvalue = wxGetEmptyString();
 396                         else
 397                             quote = 0, pvalue = c;
 398                         state = ST_VALUE;
 399                     }
 400                     break;
 401                 case ST_VALUE:
 402                     if ((quote != 0 && c == quote) ||
 403                         (quote == 0 && IS_WHITE(c)))
 404                     {
 405                         m_ParamNames.Add(pname);
 406                         if (quote == 0)
 407                         {
 408                             // VS: backward compatibility, no real reason,
 409                             //     but wxHTML code relies on this... :(
 410                             pvalue.MakeUpper();
 411                         }
 412                         if (entParser)
 413                             m_ParamValues.Add(entParser->Parse(pvalue));
 414                         else
 415                             m_ParamValues.Add(pvalue);
 416                         state = ST_BEFORE_NAME;
 417                     }
 418                     else
 419                         pvalue << c;
 420                     break;
 421             }
 422         }
 423
 424         #undef IS_WHITE
 425     }
 426     m_Begin = i;
 427     cache->QueryTag(pos, source->end(), &m_End1, &m_End2, &m_hasEnding);
 428     if (m_End1 > end_pos) m_End1 = end_pos;
 429     if (m_End2 > end_pos) m_End2 = end_pos;
 430
 431 #if WXWIN_COMPATIBILITY_2_8
 432     m_sourceStart = source->begin();
 433 #endif
 434
 435     // Try to parse any style parameters that can be handled simply by
 436     // converting them to the equivalent HTML 3 attributes: this is a far cry
 437     // from perfect but better than nothing.
 438     static const struct EquivAttr
 439     {
 440         const char *style;
 441         const char *attr;
 442     } equivAttrs[] =
 443     {
 444         { "text-align",         "ALIGN"         },
 445         { "width",              "WIDTH"         },
 446         { "vertical-align",     "VALIGN"        },
 447         { "background",         "BGCOLOR"       },
 448     };
 449
 450     wxHtmlStyleParams styleParams(*this);
 451     for ( unsigned n = 0; n < WXSIZEOF(equivAttrs); n++ )
 452     {
 453         const EquivAttr& ea = equivAttrs[n];
 454         if ( styleParams.HasParam(ea.style) && !HasParam(ea.attr) )
 455         {
 456             m_ParamNames.Add(ea.attr);
 457             m_ParamValues.Add(styleParams.GetParam(ea.style));
 458         }
 459     }
 460 }
 461
 462 wxHtmlTag::~wxHtmlTag()
 463 {
 464     wxHtmlTag *t1, *t2;
 465     t1 = m_FirstChild;
 466     while (t1)
 467     {
 468         t2 = t1->GetNextSibling();
 469         delete t1;
 470         t1 = t2;
 471     }
 472 }
 473
 474 bool wxHtmlTag::HasParam(const wxString& par) const
 475 {
 476     return (m_ParamNames.Index(par, false) != wxNOT_FOUND);
 477 }
 478
 479 wxString wxHtmlTag::GetParam(const wxString& par, bool with_quotes) const
 480 {
 481     int index = m_ParamNames.Index(par, false);
 482     if (index == wxNOT_FOUND)
 483         return wxGetEmptyString();
 484     if (with_quotes)
 485     {
 486         // VS: backward compatibility, seems to be never used by wxHTML...
 487         wxString s;
 488         s << wxT('"') << m_ParamValues[index] << wxT('"');
 489         return s;
 490     }
 491     else
 492         return m_ParamValues[index];
 493 }
 494
 495 int wxHtmlTag::ScanParam(const wxString& par,
 496                          const char *format,
 497                          void *param) const
 498 {
 499     wxString parval = GetParam(par);
 500     return wxSscanf(parval, format, param);
 501 }
 502
 503 int wxHtmlTag::ScanParam(const wxString& par,
 504                          const wchar_t *format,
 505                          void *param) const
 506 {
 507     wxString parval = GetParam(par);
 508     return wxSscanf(parval, format, param);
 509 }
 510
 511 /* static */
 512 bool wxHtmlTag::ParseAsColour(const wxString& str, wxColour *clr)
 513 {
 514     wxCHECK_MSG( clr, false, wxT("invalid colour argument") );
 515
 516     // handle colours defined in HTML 4.0 first:
 517     if (str.length() > 1 && str[0] != wxT('#'))
 518     {
 519         #define HTML_COLOUR(name, r, g, b)              \
 520             if (str.IsSameAs(wxS(name), false))         \
 521                 { clr->Set(r, g, b); return true; }
 522         HTML_COLOUR("black",   0x00,0x00,0x00)
 523         HTML_COLOUR("silver",  0xC0,0xC0,0xC0)
 524         HTML_COLOUR("gray",    0x80,0x80,0x80)
 525         HTML_COLOUR("white",   0xFF,0xFF,0xFF)
 526         HTML_COLOUR("maroon",  0x80,0x00,0x00)
 527         HTML_COLOUR("red",     0xFF,0x00,0x00)
 528         HTML_COLOUR("purple",  0x80,0x00,0x80)
 529         HTML_COLOUR("fuchsia", 0xFF,0x00,0xFF)
 530         HTML_COLOUR("green",   0x00,0x80,0x00)
 531         HTML_COLOUR("lime",    0x00,0xFF,0x00)
 532         HTML_COLOUR("olive",   0x80,0x80,0x00)
 533         HTML_COLOUR("yellow",  0xFF,0xFF,0x00)
 534         HTML_COLOUR("navy",    0x00,0x00,0x80)
 535         HTML_COLOUR("blue",    0x00,0x00,0xFF)
 536         HTML_COLOUR("teal",    0x00,0x80,0x80)
 537         HTML_COLOUR("aqua",    0x00,0xFF,0xFF)
 538         #undef HTML_COLOUR
 539     }
 540
 541     // then try to parse #rrggbb representations or set from other well
 542     // known names (note that this doesn't strictly conform to HTML spec,
 543     // but it doesn't do real harm -- but it *must* be done after the standard
 544     // colors are handled above):
 545     if (clr->Set(str))
 546         return true;
 547
 548     return false;
 549 }
 550
 551 bool wxHtmlTag::GetParamAsColour(const wxString& par, wxColour *clr) const
 552 {
 553     const wxString str = GetParam(par);
 554     return !str.empty() && ParseAsColour(str, clr);
 555 }
 556
 557 bool wxHtmlTag::GetParamAsInt(const wxString& par, int *clr) const
 558 {
 559     if ( !HasParam(par) )
 560         return false;
 561
 562     long i;
 563     if ( !GetParam(par).ToLong(&i) )
 564         return false;
 565
 566     *clr = (int)i;
 567     return true;
 568 }
 569
 570 wxString wxHtmlTag::GetAllParams() const
 571 {
 572     // VS: this function is for backward compatibility only,
 573     //     never used by wxHTML
 574     wxString s;
 575     size_t cnt = m_ParamNames.GetCount();
 576     for (size_t i = 0; i < cnt; i++)
 577     {
 578         s << m_ParamNames[i];
 579         s << wxT('=');
 580         if (m_ParamValues[i].Find(wxT('"')) != wxNOT_FOUND)
 581             s << wxT('\'') << m_ParamValues[i] << wxT('\'');
 582         else
 583             s << wxT('"') << m_ParamValues[i] << wxT('"');
 584     }
 585     return s;
 586 }
 587
 588 wxHtmlTag *wxHtmlTag::GetFirstSibling() const
 589 {
 590     if (m_Parent)
 591         return m_Parent->m_FirstChild;
 592     else
 593     {
 594         wxHtmlTag *cur = (wxHtmlTag*)this;
 595         while (cur->m_Prev)
 596             cur = cur->m_Prev;
 597         return cur;
 598     }
 599 }
 600
 601 wxHtmlTag *wxHtmlTag::GetLastSibling() const
 602 {
 603     if (m_Parent)
 604         return m_Parent->m_LastChild;
 605     else
 606     {
 607         wxHtmlTag *cur = (wxHtmlTag*)this;
 608         while (cur->m_Next)
 609             cur = cur->m_Next;
 610         return cur;
 611     }
 612 }
 613
 614 wxHtmlTag *wxHtmlTag::GetNextTag() const
 615 {
 616     if (m_FirstChild) return m_FirstChild;
 617     if (m_Next) return m_Next;
 618     wxHtmlTag *cur = m_Parent;
 619     if (!cur) return NULL;
 620     while (cur->m_Parent && !cur->m_Next)
 621         cur = cur->m_Parent;
 622     return cur->m_Next;
 623 }
 624
 625 #endif