]>
git.saurik.com Git - wxWidgets.git/blob - src/common/uri.cpp
   1 ///////////////////////////////////////////////////////////////////////////// 
   3 // Purpose:     Implementation of a URI parser 
   4 // Author:      Ryan Norton, 
   5 //              Vadim Zeitlin (UTF-8 URI support, many other changes) 
   8 // Copyright:   (c) 2004 Ryan Norton, 
  10 // Licence:     wxWindows licence 
  11 ///////////////////////////////////////////////////////////////////////////// 
  13 // =========================================================================== 
  15 // =========================================================================== 
  17 // --------------------------------------------------------------------------- 
  19 // --------------------------------------------------------------------------- 
  21 // For compilers that support precompilation, includes "wx.h". 
  22 #include "wx/wxprec.h" 
  34 // --------------------------------------------------------------------------- 
  36 // --------------------------------------------------------------------------- 
  38 IMPLEMENT_CLASS(wxURI
, wxObject
) 
  40 // =========================================================================== 
  41 // wxURI implementation 
  42 // =========================================================================== 
  44 // --------------------------------------------------------------------------- 
  45 // Constructors and cleanup 
  46 // --------------------------------------------------------------------------- 
  49      : m_hostType(wxURI_REGNAME
), 
  54 wxURI::wxURI(const wxString
& uri
) 
  55      : m_hostType(wxURI_REGNAME
), 
  61 bool wxURI::Create(const wxString
& uri
) 
  66     return Parse(uri
.utf8_str()); 
  77     m_fragment 
= wxEmptyString
; 
  79     m_hostType 
= wxURI_REGNAME
; 
  84 // --------------------------------------------------------------------------- 
  85 // Escaped characters handling 
  86 // --------------------------------------------------------------------------- 
  88 // Converts a character into a numeric hexadecimal value, or -1 if the passed 
  89 // in character is not a valid hex character 
  92 int wxURI::CharToHex(char c
) 
  94     if ((c 
>= 'A') && (c 
<= 'Z')) 
  96     if ((c 
>= 'a') && (c 
<= 'z')) 
  98     if ((c 
>= '0') && (c 
<= '9')) 
 104 int wxURI::DecodeEscape(wxString::const_iterator
& i
) 
 106     int hi 
= CharToHex(*++i
); 
 110     int lo 
= CharToHex(*++i
); 
 114     return (hi 
<< 4) | lo
; 
 118 wxString 
wxURI::Unescape(const wxString
& uri
) 
 120     // the unescaped version can't be longer than the original one 
 121     wxCharBuffer 
buf(uri
.length()); 
 122     char *p 
= buf
.data(); 
 124     for ( wxString::const_iterator i 
= uri
.begin(); i 
!= uri
.end(); ++i
, ++p 
) 
 129             int n 
= wxURI::DecodeEscape(i
); 
 133             wxASSERT_MSG( n 
>= 0 && n 
<= 0xff, "unexpected character value" ); 
 135             c 
= wx_static_cast(char, n
); 
 143     // by default assume that the URI is in UTF-8, this is the most common 
 145     wxString s 
= wxString::FromUTF8(buf
); 
 148         // if it isn't, use latin-1 as a fallback -- at least this always 
 150         s 
= wxCSConv(wxFONTENCODING_ISO8859_1
).cMB2WC(buf
); 
 156 void wxURI::AppendNextEscaped(wxString
& s
, const char *& p
) 
 158     // check for an already encoded character: 
 160     // pct-encoded   = "%" HEXDIG HEXDIG 
 161     if ( p
[0] == '%' && IsHex(p
[1]) && IsHex(p
[2]) ) 
 167     else // really needs escaping 
 169         static const char* hexDigits 
= "0123456789abcdef"; 
 174         s 
+= hexDigits
[(c 
>> 4) & 15]; 
 175         s 
+= hexDigits
[c 
& 15]; 
 179 // --------------------------------------------------------------------------- 
 183 // Gets the username and password via the old URL method. 
 184 // --------------------------------------------------------------------------- 
 185 wxString 
wxURI::GetUser() const 
 187       size_t dwPasswordPos 
= m_userinfo
.find(':'); 
 189       if (dwPasswordPos 
== wxString::npos
) 
 192       return m_userinfo(0, dwPasswordPos
); 
 195 wxString 
wxURI::GetPassword() const 
 197       size_t dwPasswordPos 
= m_userinfo
.find(':'); 
 199       if (dwPasswordPos 
== wxString::npos
) 
 202           return m_userinfo(dwPasswordPos
+1, m_userinfo
.length() + 1); 
 205 // combine all URI fields in a single string, applying funcDecode to each 
 206 // component which it may make sense to decode (i.e. "unescape") 
 207 wxString 
wxURI::DoBuildURI(wxString (*funcDecode
)(const wxString
&)) const 
 212         ret 
+= m_scheme 
+ ":"; 
 219             ret 
+= funcDecode(m_userinfo
) + "@"; 
 221         if (m_hostType 
== wxURI_REGNAME
) 
 222             ret 
+= funcDecode(m_server
); 
 230     ret 
+= funcDecode(m_path
); 
 233         ret 
+= "?" + funcDecode(m_query
); 
 236         ret 
+= "#" + funcDecode(m_fragment
); 
 241 // --------------------------------------------------------------------------- 
 243 // --------------------------------------------------------------------------- 
 245 bool wxURI::operator==(const wxURI
& uri
) const 
 249         if(m_scheme 
!= uri
.m_scheme
) 
 252     else if (uri
.HasScheme()) 
 260             if (m_userinfo 
!= uri
.m_userinfo
) 
 263         else if (uri
.HasUserInfo()) 
 266         if (m_server 
!= uri
.m_server 
|| 
 267             m_hostType 
!= uri
.m_hostType
) 
 272             if(m_port 
!= uri
.m_port
) 
 275         else if (uri
.HasPort()) 
 278     else if (uri
.HasServer()) 
 284         if(m_path 
!= uri
.m_path
) 
 287     else if (uri
.HasPath()) 
 292         if (m_query 
!= uri
.m_query
) 
 295     else if (uri
.HasQuery()) 
 300         if (m_fragment 
!= uri
.m_fragment
) 
 303     else if (uri
.HasFragment()) 
 309 // --------------------------------------------------------------------------- 
 312 // if there is no authority or scheme, it is a reference 
 313 // --------------------------------------------------------------------------- 
 315 bool wxURI::IsReference() const 
 317     return !HasScheme() || !HasServer(); 
 320 // --------------------------------------------------------------------------- 
 323 // Master URI parsing method.  Just calls the individual parsing methods 
 325 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 
 326 // URI-reference = URI / relative 
 327 // --------------------------------------------------------------------------- 
 329 bool wxURI::Parse(const char *uri
) 
 331     uri 
= ParseScheme(uri
); 
 333         uri 
= ParseAuthority(uri
); 
 335         uri 
= ParsePath(uri
); 
 337         uri 
= ParseQuery(uri
); 
 339         uri 
= ParseFragment(uri
); 
 341     // we only succeed if we parsed the entire string 
 342     return uri 
&& *uri 
== '\0'; 
 345 const char* wxURI::ParseScheme(const char *uri
) 
 347     const char * const start 
= uri
; 
 349     // assume that we have a scheme if we have the valid start of it 
 354         //scheme        = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 
 355         while (IsAlpha(*uri
) || IsDigit(*uri
) || 
 366             //mark the scheme as valid 
 367             m_fields 
|= wxURI_SCHEME
; 
 369             //move reference point up to input buffer 
 372         else // no valid scheme finally 
 374             uri 
= start
; // rewind 
 378     //else: can't have a scheme, possibly a relative URI 
 383 const char* wxURI::ParseAuthority(const char* uri
) 
 385     // authority     = [ userinfo "@" ] host [ ":" port ] 
 386     if ( uri
[0] == '/' && uri
[1] == '/' ) 
 388         //skip past the two slashes 
 391         // ############# DEVIATION FROM RFC ######################### 
 392         // Don't parse the server component for file URIs 
 393         if(m_scheme 
!= "file") 
 396             uri 
= ParseUserInfo(uri
); 
 397             uri 
= ParseServer(uri
); 
 398             return ParsePort(uri
); 
 405 const char* wxURI::ParseUserInfo(const char* uri
) 
 407     const char * const start 
= uri
; 
 409     // userinfo      = *( unreserved / pct-encoded / sub-delims / ":" ) 
 410     while ( *uri 
&& *uri 
!= '@' && *uri 
!= '/' && *uri 
!= '#' && *uri 
!= '?' ) 
 412         if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || *uri 
== ':' ) 
 413             m_userinfo 
+= *uri
++; 
 415             AppendNextEscaped(m_userinfo
, uri
); 
 421         m_fields 
|= wxURI_USERINFO
; 
 425         uri 
= start
; // rewind 
 432 const char* wxURI::ParseServer(const char* uri
) 
 434     const char * const start 
= uri
; 
 436     // host          = IP-literal / IPv4address / reg-name 
 437     // IP-literal    = "[" ( IPv6address / IPvFuture  ) "]" 
 441         if (ParseIPv6address(uri
) && *uri 
== ']') 
 443             m_hostType 
= wxURI_IPV6ADDRESS
; 
 445             m_server
.assign(start
, uri 
- start 
- 1); 
 450             uri 
= start 
+ 1; // skip the leading '[' again 
 452             if (ParseIPvFuture(uri
) && *uri 
== ']') 
 454                 m_hostType 
= wxURI_IPVFUTURE
; 
 456                 m_server
.assign(start
, uri 
- start 
- 1); 
 459             else // unrecognized IP literal 
 465     else // IPv4 or a reg-name 
 467         if (ParseIPv4address(uri
)) 
 469             m_hostType 
= wxURI_IPV4ADDRESS
; 
 471             m_server
.assign(start
, uri 
- start 
- 1); 
 479     if ( m_hostType 
== wxURI_REGNAME 
) 
 482         // reg-name      = *( unreserved / pct-encoded / sub-delims ) 
 483         while ( *uri 
&& *uri 
!= '/' && *uri 
!= ':' && *uri 
!= '#' && *uri 
!= '?' ) 
 485             if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) ) 
 488                 AppendNextEscaped(m_server
, uri
); 
 492     m_fields 
|= wxURI_SERVER
; 
 498 const char* wxURI::ParsePort(const char* uri
) 
 504         while ( IsDigit(*uri
) ) 
 509         m_fields 
|= wxURI_PORT
; 
 515 const char* wxURI::ParsePath(const char* uri
) 
 517     /// hier-part     = "//" authority path-abempty 
 522     /// relative-part = "//" authority path-abempty 
 527     /// path-abempty  = *( "/" segment ) 
 528     /// path-absolute = "/" [ segment-nz *( "/" segment ) ] 
 529     /// path-noscheme = segment-nz-nc *( "/" segment ) 
 530     /// path-rootless = segment-nz *( "/" segment ) 
 531     /// path-empty    = 0<pchar> 
 534     /// segment-nz    = 1*pchar 
 535     /// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) 
 536     ///               ; non-zero-length segment without any colon ":" 
 538     /// pchar         = unreserved / pct-encoded / sub-delims / ":" / "@" 
 540     if ( IsEndPath(*uri
) ) 
 543     const bool isAbs 
= *uri 
== '/'; 
 547     wxArrayString segments
; 
 551         const bool endPath 
= IsEndPath(*uri
); 
 552         if ( endPath 
|| *uri 
== '/' ) 
 554             // end of a segment, look at what we got 
 555             if ( segment 
== ".." ) 
 557                 if ( !segments
.empty() && *segments
.rbegin() != ".." ) 
 560                     segments
.push_back(".."); 
 562             else if ( segment 
== "." ) 
 564                 // normally we ignore "." but the last one should be taken into 
 565                 // account as "path/." is the same as "path/" and not just "path" 
 567                     segments
.push_back(""); 
 569             else // normal segment 
 571                 segments
.push_back(segment
); 
 582         if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || *uri 
== ':' || *uri 
== '@' ) 
 585             AppendNextEscaped(segment
, uri
); 
 588     m_path 
+= wxJoin(segments
, '/', '\0'); 
 589     m_fields 
|= wxURI_PATH
; 
 595 const char* wxURI::ParseQuery(const char* uri
) 
 597     // query         = *( pchar / "/" / "?" ) 
 601         while ( *uri 
&& *uri 
!= '#' ) 
 603             if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || 
 604                     *uri 
== ':' || *uri 
== '@' || *uri 
== '/' || *uri 
== '?' ) 
 607                 AppendNextEscaped(m_query
, uri
); 
 610         m_fields 
|= wxURI_QUERY
; 
 617 const char* wxURI::ParseFragment(const char* uri
) 
 619     // fragment      = *( pchar / "/" / "?" ) 
 625             if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || 
 626                     *uri 
== ':' || *uri 
== '@' || *uri 
== '/' || *uri 
== '?') 
 627                 m_fragment 
+= *uri
++; 
 629                 AppendNextEscaped(m_fragment
, uri
); 
 632         m_fields 
|= wxURI_FRAGMENT
; 
 638 // --------------------------------------------------------------------------- 
 641 // Builds missing components of this uri from a base uri 
 643 // A version of the algorithm outlined in the RFC is used here 
 644 // (it is shown in comments) 
 646 // Note that an empty URI inherits all components 
 647 // --------------------------------------------------------------------------- 
 650 wxArrayString 
wxURI::SplitInSegments(const wxString
& path
) 
 652     return wxSplit(path
, '/', '\0' /* no escape character */); 
 655 void wxURI::Resolve(const wxURI
& base
, int flags
) 
 657     wxASSERT_MSG(!base
.IsReference(), 
 658                 "wxURI to inherit from must not be a reference!"); 
 660     // If we aren't being strict, enable the older (pre-RFC2396) loophole that 
 661     // allows this uri to inherit other properties from the base uri - even if 
 662     // the scheme is defined 
 663     if ( !(flags 
& wxURI_STRICT
) && 
 664             HasScheme() && base
.HasScheme() && 
 665                 m_scheme 
== base
.m_scheme 
) 
 667         m_fields 
-= wxURI_SCHEME
; 
 671     // Do nothing if this is an absolute wxURI 
 672     //    if defined(R.scheme) then 
 673     //       T.scheme    = R.scheme; 
 674     //       T.authority = R.authority; 
 675     //       T.path      = remove_dot_segments(R.path); 
 676     //       T.query     = R.query; 
 680     //No scheme - inherit 
 681     m_scheme 
= base
.m_scheme
; 
 682     m_fields 
|= wxURI_SCHEME
; 
 684     // All we need to do for relative URIs with an 
 685     // authority component is just inherit the scheme 
 686     //       if defined(R.authority) then 
 687     //          T.authority = R.authority; 
 688     //          T.path      = remove_dot_segments(R.path); 
 689     //          T.query     = R.query; 
 693     //No authority - inherit 
 694     if (base
.HasUserInfo()) 
 696         m_userinfo 
= base
.m_userinfo
; 
 697         m_fields 
|= wxURI_USERINFO
; 
 700     m_server 
= base
.m_server
; 
 701     m_hostType 
= base
.m_hostType
; 
 702     m_fields 
|= wxURI_SERVER
; 
 706         m_port 
= base
.m_port
; 
 707         m_fields 
|= wxURI_PORT
; 
 711     // Simple path inheritance from base 
 714         //             T.path = Base.path; 
 715         m_path 
= base
.m_path
; 
 716         m_fields 
|= wxURI_PATH
; 
 719         //             if defined(R.query) then 
 720         //                T.query = R.query; 
 722         //                T.query = Base.query; 
 726             m_query 
= base
.m_query
; 
 727             m_fields 
|= wxURI_QUERY
; 
 730     else if ( m_path
.empty() || m_path
[0u] != '/' ) 
 732         //             if (R.path starts-with "/") then 
 733         //                T.path = remove_dot_segments(R.path); 
 735         //                T.path = merge(Base.path, R.path); 
 736         //                T.path = remove_dot_segments(T.path); 
 738         //             T.query = R.query; 
 740         // So we don't do anything for absolute paths and implement merge for 
 743         wxArrayString 
our(SplitInSegments(m_path
)), 
 744                       result(SplitInSegments(base
.m_path
)); 
 746         if ( !result
.empty() ) 
 751             // if we have an empty path it means we were constructed from a "." 
 752             // string or something similar (e.g. "././././"), it should count 
 753             // as an (empty) segment 
 757         const wxArrayString::const_iterator end 
= our
.end(); 
 758         for ( wxArrayString::const_iterator i 
= our
.begin(); i 
!= end
; ++i 
) 
 760             if ( i
->empty() || *i 
== "." ) 
 762                 // as in ParsePath(), while normally we ignore the empty 
 763                 // segments, we need to take account of them at the end 
 765                     result
.push_back(""); 
 771                 if ( !result
.empty() ) 
 776                         result
.push_back(""); 
 778                 //else: just ignore, extra ".." don't accumulate 
 782                 if ( result
.empty() ) 
 784                     // ensure that the resulting path will always be absolute 
 785                     result
.push_back(""); 
 788                 result
.push_back(*i
); 
 792         m_path 
= wxJoin(result
, '/', '\0'); 
 795     //T.fragment = R.fragment; 
 798 // --------------------------------------------------------------------------- 
 801 // Parses 1 to 4 hex values.  Returns true if the first character of the input 
 802 // string is a valid hex character.  It is the caller's responsibility to move 
 803 // the input string back to its original position on failure. 
 804 // --------------------------------------------------------------------------- 
 806 bool wxURI::ParseH16(const char*& uri
) 
 812     if(IsHex(*++uri
) && IsHex(*++uri
) && IsHex(*++uri
)) 
 818 // --------------------------------------------------------------------------- 
 821 // Parses a certain version of an IP address and moves the input string past 
 822 // it.  Returns true if the input string contains the proper version of an IP 
 823 // address.  It is the caller's responsibility to move the input string back 
 824 // to its original position on failure. 
 825 // --------------------------------------------------------------------------- 
 827 bool wxURI::ParseIPv4address(const char*& uri
) 
 829     //IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet 
 831     //dec-octet     =      DIGIT                    ; 0-9 
 832     //                / %x31-39 DIGIT               ; 10-99 
 833     //                / "1" 2DIGIT                  ; 100-199 
 834     //                / "2" %x30-34 DIGIT           ; 200-249 
 835     //                / "25" %x30-35                ; 250-255 
 842         //each ip part must be between 0-255 (dupe of version in for loop) 
 843         if( IsDigit(*++uri
) && IsDigit(*++uri
) && 
 844            //100 or less  (note !) 
 845            !( (*(uri
-2) < '2') || 
 848                (*(uri
-1) < '5' || (*(uri
-1) == '5' && *uri 
<= '5')) 
 856         if(IsDigit(*uri
))++uri
; 
 858         //compilers should unroll this loop 
 859         for(; iIPv4 
< 4; ++iIPv4
) 
 861             if (*uri 
!= '.' || !IsDigit(*++uri
)) 
 864             //each ip part must be between 0-255 
 865             if( IsDigit(*++uri
) && IsDigit(*++uri
) && 
 866                //100 or less  (note !) 
 867                !( (*(uri
-2) < '2') || 
 870                    (*(uri
-1) < '5' || (*(uri
-1) == '5' && *uri 
<= '5')) 
 877             if(IsDigit(*uri
))++uri
; 
 883 bool wxURI::ParseIPv6address(const char*& uri
) 
 885     // IPv6address   =                            6( h16 ":" ) ls32 
 886     //               /                       "::" 5( h16 ":" ) ls32 
 887     //               / [               h16 ] "::" 4( h16 ":" ) ls32 
 888     //               / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 
 889     //               / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 
 890     //               / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32 
 891     //               / [ *4( h16 ":" ) h16 ] "::"              ls32 
 892     //               / [ *5( h16 ":" ) h16 ] "::"              h16 
 893     //               / [ *6( h16 ":" ) h16 ] "::" 
 895     size_t numPrefix 
= 0, 
 898     bool bEndHex 
= false; 
 900     for( ; numPrefix 
< 6; ++numPrefix
) 
 915     if(!bEndHex 
&& !ParseH16(uri
)) 
 934         if (*uri 
!= ':' || *(uri
+1) != ':') 
 939             while (*--uri 
!= ':') {} 
 942             const char * const start 
= uri
; 
 944             // ls32          = ( h16 ":" h16 ) / IPv4address 
 945             if (ParseH16(uri
) && *uri 
== ':' && ParseH16(uri
)) 
 950             if (ParseIPv4address(uri
)) 
 962                 maxPostfix 
= 4 - numPrefix
; 
 966     bool bAllowAltEnding 
= maxPostfix 
== 0; 
 968     for(; maxPostfix 
!= 0; --maxPostfix
) 
 970         if(!ParseH16(uri
) || *uri 
!= ':') 
 976         const char * const start 
= uri
; 
 978         // ls32          = ( h16 ":" h16 ) / IPv4address 
 979         if (ParseH16(uri
) && *uri 
== ':' && ParseH16(uri
)) 
 984         if (ParseIPv4address(uri
)) 
 989         if (!bAllowAltEnding
) 
 993     if(numPrefix 
<= 5 && ParseH16(uri
)) 
 999 bool wxURI::ParseIPvFuture(const char*& uri
) 
1001     // IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) 
1002     if (*++uri 
!= 'v' || !IsHex(*++uri
)) 
1005     while (IsHex(*++uri
)) 
1008     if (*uri 
!= '.' || !(IsUnreserved(*++uri
) || IsSubDelim(*uri
) || *uri 
== ':')) 
1011     while(IsUnreserved(*++uri
) || IsSubDelim(*uri
) || *uri 
== ':') {} 
1017 // --------------------------------------------------------------------------- 
1020 // Returns true if the passed in character meets the criteria of the method 
1021 // --------------------------------------------------------------------------- 
1023 // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" 
1024 bool wxURI::IsUnreserved(char c
) 
1026     return IsAlpha(c
) || 
1035 bool wxURI::IsReserved(char c
) 
1037     return IsGenDelim(c
) || IsSubDelim(c
); 
1040 // gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@" 
1041 bool wxURI::IsGenDelim(char c
) 
1052 // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" 
1053 //               / "*" / "+" / "," / ";" / "=" 
1054 bool wxURI::IsSubDelim(char c
) 
1070 bool wxURI::IsHex(char c
) 
1072     return IsDigit(c
) || 
1073            (c 
>= 'a' && c 
<= 'f') || 
1074            (c 
>= 'A' && c 
<= 'F'); 
1077 bool wxURI::IsAlpha(char c
) 
1079     return (c 
>= 'a' && c 
<= 'z') || (c 
>= 'A' && c 
<= 'Z'); 
1082 bool wxURI::IsDigit(char c
) 
1084     return c 
>= '0' && c 
<= '9'; 
1087 bool wxURI::IsEndPath(char c
) 
1089     return c 
== '\0' || c 
== '#' || c 
== '?';