]>
git.saurik.com Git - wxWidgets.git/blob - src/common/uri.cpp
807268f80d19741500e9beb61e92b75e64d15e48
1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Implementation of a URI parser
4 // Author: Ryan Norton,
5 // Vadim Zeitlin (UTF-8 URI support, many other changes)
8 // Copyright: (c) 2004 Ryan Norton,
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
13 // ===========================================================================
15 // ===========================================================================
17 // ---------------------------------------------------------------------------
19 // ---------------------------------------------------------------------------
21 // For compilers that support precompilation, includes "wx.h".
22 #include "wx/wxprec.h"
30 #include "wx/arrstr.h"
35 // ---------------------------------------------------------------------------
37 // ---------------------------------------------------------------------------
39 IMPLEMENT_CLASS(wxURI
, wxObject
)
41 // ===========================================================================
42 // wxURI implementation
43 // ===========================================================================
45 // ---------------------------------------------------------------------------
46 // Constructors and cleanup
47 // ---------------------------------------------------------------------------
50 : m_hostType(wxURI_REGNAME
),
55 wxURI::wxURI(const wxString
& uri
)
56 : m_hostType(wxURI_REGNAME
),
62 bool wxURI::Create(const wxString
& uri
)
67 return Parse(uri
.utf8_str());
78 m_fragment
= wxEmptyString
;
80 m_hostType
= wxURI_REGNAME
;
85 // ---------------------------------------------------------------------------
86 // Escaped characters handling
87 // ---------------------------------------------------------------------------
89 // Converts a character into a numeric hexadecimal value, or -1 if the passed
90 // in character is not a valid hex character
93 int wxURI::CharToHex(char c
)
95 if ((c
>= 'A') && (c
<= 'Z'))
97 if ((c
>= 'a') && (c
<= 'z'))
99 if ((c
>= '0') && (c
<= '9'))
105 int wxURI::DecodeEscape(wxString::const_iterator
& i
)
107 int hi
= CharToHex(*++i
);
111 int lo
= CharToHex(*++i
);
115 return (hi
<< 4) | lo
;
119 wxString
wxURI::Unescape(const wxString
& uri
)
121 // the unescaped version can't be longer than the original one
122 wxCharBuffer
buf(uri
.length());
123 char *p
= buf
.data();
125 for ( wxString::const_iterator i
= uri
.begin(); i
!= uri
.end(); ++i
, ++p
)
130 int n
= wxURI::DecodeEscape(i
);
134 wxASSERT_MSG( n
>= 0 && n
<= 0xff, "unexpected character value" );
136 c
= wx_static_cast(char, n
);
144 // by default assume that the URI is in UTF-8, this is the most common
146 wxString s
= wxString::FromUTF8(buf
);
149 // if it isn't, use latin-1 as a fallback -- at least this always
151 s
= wxCSConv(wxFONTENCODING_ISO8859_1
).cMB2WC(buf
);
157 void wxURI::AppendNextEscaped(wxString
& s
, const char *& p
)
159 // check for an already encoded character:
161 // pct-encoded = "%" HEXDIG HEXDIG
162 if ( p
[0] == '%' && IsHex(p
[1]) && IsHex(p
[2]) )
168 else // really needs escaping
170 static const char* hexDigits
= "0123456789abcdef";
175 s
+= hexDigits
[(c
>> 4) & 15];
176 s
+= hexDigits
[c
& 15];
180 // ---------------------------------------------------------------------------
184 // Gets the username and password via the old URL method.
185 // ---------------------------------------------------------------------------
186 wxString
wxURI::GetUser() const
188 size_t dwPasswordPos
= m_userinfo
.find(':');
190 if (dwPasswordPos
== wxString::npos
)
193 return m_userinfo(0, dwPasswordPos
);
196 wxString
wxURI::GetPassword() const
198 size_t dwPasswordPos
= m_userinfo
.find(':');
200 if (dwPasswordPos
== wxString::npos
)
203 return m_userinfo(dwPasswordPos
+1, m_userinfo
.length() + 1);
206 // combine all URI fields in a single string, applying funcDecode to each
207 // component which it may make sense to decode (i.e. "unescape")
208 wxString
wxURI::DoBuildURI(wxString (*funcDecode
)(const wxString
&)) const
213 ret
+= m_scheme
+ ":";
220 ret
+= funcDecode(m_userinfo
) + "@";
222 if (m_hostType
== wxURI_REGNAME
)
223 ret
+= funcDecode(m_server
);
231 ret
+= funcDecode(m_path
);
234 ret
+= "?" + funcDecode(m_query
);
237 ret
+= "#" + funcDecode(m_fragment
);
242 // ---------------------------------------------------------------------------
244 // ---------------------------------------------------------------------------
246 bool wxURI::operator==(const wxURI
& uri
) const
250 if(m_scheme
!= uri
.m_scheme
)
253 else if (uri
.HasScheme())
261 if (m_userinfo
!= uri
.m_userinfo
)
264 else if (uri
.HasUserInfo())
267 if (m_server
!= uri
.m_server
||
268 m_hostType
!= uri
.m_hostType
)
273 if(m_port
!= uri
.m_port
)
276 else if (uri
.HasPort())
279 else if (uri
.HasServer())
285 if(m_path
!= uri
.m_path
)
288 else if (uri
.HasPath())
293 if (m_query
!= uri
.m_query
)
296 else if (uri
.HasQuery())
301 if (m_fragment
!= uri
.m_fragment
)
304 else if (uri
.HasFragment())
310 // ---------------------------------------------------------------------------
313 // if there is no authority or scheme, it is a reference
314 // ---------------------------------------------------------------------------
316 bool wxURI::IsReference() const
318 return !HasScheme() || !HasServer();
321 // ---------------------------------------------------------------------------
324 // Master URI parsing method. Just calls the individual parsing methods
326 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
327 // URI-reference = URI / relative
328 // ---------------------------------------------------------------------------
330 bool wxURI::Parse(const char *uri
)
332 uri
= ParseScheme(uri
);
334 uri
= ParseAuthority(uri
);
336 uri
= ParsePath(uri
);
338 uri
= ParseQuery(uri
);
340 uri
= ParseFragment(uri
);
342 // we only succeed if we parsed the entire string
343 return uri
&& *uri
== '\0';
346 const char* wxURI::ParseScheme(const char *uri
)
348 const char * const start
= uri
;
350 // assume that we have a scheme if we have the valid start of it
355 //scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
356 while (IsAlpha(*uri
) || IsDigit(*uri
) ||
367 //mark the scheme as valid
368 m_fields
|= wxURI_SCHEME
;
370 //move reference point up to input buffer
373 else // no valid scheme finally
375 uri
= start
; // rewind
379 //else: can't have a scheme, possibly a relative URI
384 const char* wxURI::ParseAuthority(const char* uri
)
386 // authority = [ userinfo "@" ] host [ ":" port ]
387 if ( uri
[0] == '/' && uri
[1] == '/' )
389 //skip past the two slashes
392 // ############# DEVIATION FROM RFC #########################
393 // Don't parse the server component for file URIs
394 if(m_scheme
!= "file")
397 uri
= ParseUserInfo(uri
);
398 uri
= ParseServer(uri
);
399 return ParsePort(uri
);
406 const char* wxURI::ParseUserInfo(const char* uri
)
408 const char * const start
= uri
;
410 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
411 while ( *uri
&& *uri
!= '@' && *uri
!= '/' && *uri
!= '#' && *uri
!= '?' )
413 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || *uri
== ':' )
414 m_userinfo
+= *uri
++;
416 AppendNextEscaped(m_userinfo
, uri
);
422 m_fields
|= wxURI_USERINFO
;
426 uri
= start
; // rewind
433 const char* wxURI::ParseServer(const char* uri
)
435 const char * const start
= uri
;
437 // host = IP-literal / IPv4address / reg-name
438 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
442 if (ParseIPv6address(uri
) && *uri
== ']')
444 m_hostType
= wxURI_IPV6ADDRESS
;
446 m_server
.assign(start
, uri
- start
- 1);
451 uri
= start
+ 1; // skip the leading '[' again
453 if (ParseIPvFuture(uri
) && *uri
== ']')
455 m_hostType
= wxURI_IPVFUTURE
;
457 m_server
.assign(start
, uri
- start
- 1);
460 else // unrecognized IP literal
466 else // IPv4 or a reg-name
468 if (ParseIPv4address(uri
))
470 m_hostType
= wxURI_IPV4ADDRESS
;
472 m_server
.assign(start
, uri
- start
- 1);
480 if ( m_hostType
== wxURI_REGNAME
)
483 // reg-name = *( unreserved / pct-encoded / sub-delims )
484 while ( *uri
&& *uri
!= '/' && *uri
!= ':' && *uri
!= '#' && *uri
!= '?' )
486 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) )
489 AppendNextEscaped(m_server
, uri
);
493 m_fields
|= wxURI_SERVER
;
499 const char* wxURI::ParsePort(const char* uri
)
505 while ( IsDigit(*uri
) )
510 m_fields
|= wxURI_PORT
;
516 const char* wxURI::ParsePath(const char* uri
)
518 /// hier-part = "//" authority path-abempty
523 /// relative-part = "//" authority path-abempty
528 /// path-abempty = *( "/" segment )
529 /// path-absolute = "/" [ segment-nz *( "/" segment ) ]
530 /// path-noscheme = segment-nz-nc *( "/" segment )
531 /// path-rootless = segment-nz *( "/" segment )
532 /// path-empty = 0<pchar>
535 /// segment-nz = 1*pchar
536 /// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
537 /// ; non-zero-length segment without any colon ":"
539 /// pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
541 if ( IsEndPath(*uri
) )
544 const bool isAbs
= *uri
== '/';
548 wxArrayString segments
;
552 const bool endPath
= IsEndPath(*uri
);
553 if ( endPath
|| *uri
== '/' )
555 // end of a segment, look at what we got
556 if ( segment
== ".." )
558 if ( !segments
.empty() && *segments
.rbegin() != ".." )
561 segments
.push_back("..");
563 else if ( segment
== "." )
565 // normally we ignore "." but the last one should be taken into
566 // account as "path/." is the same as "path/" and not just "path"
568 segments
.push_back("");
570 else // normal segment
572 segments
.push_back(segment
);
583 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || *uri
== ':' || *uri
== '@' )
586 AppendNextEscaped(segment
, uri
);
589 m_path
+= wxJoin(segments
, '/', '\0');
590 m_fields
|= wxURI_PATH
;
596 const char* wxURI::ParseQuery(const char* uri
)
598 // query = *( pchar / "/" / "?" )
602 while ( *uri
&& *uri
!= '#' )
604 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) ||
605 *uri
== ':' || *uri
== '@' || *uri
== '/' || *uri
== '?' )
608 AppendNextEscaped(m_query
, uri
);
611 m_fields
|= wxURI_QUERY
;
618 const char* wxURI::ParseFragment(const char* uri
)
620 // fragment = *( pchar / "/" / "?" )
626 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) ||
627 *uri
== ':' || *uri
== '@' || *uri
== '/' || *uri
== '?')
628 m_fragment
+= *uri
++;
630 AppendNextEscaped(m_fragment
, uri
);
633 m_fields
|= wxURI_FRAGMENT
;
639 // ---------------------------------------------------------------------------
642 // Builds missing components of this uri from a base uri
644 // A version of the algorithm outlined in the RFC is used here
645 // (it is shown in comments)
647 // Note that an empty URI inherits all components
648 // ---------------------------------------------------------------------------
651 wxArrayString
wxURI::SplitInSegments(const wxString
& path
)
653 return wxSplit(path
, '/', '\0' /* no escape character */);
656 void wxURI::Resolve(const wxURI
& base
, int flags
)
658 wxASSERT_MSG(!base
.IsReference(),
659 "wxURI to inherit from must not be a reference!");
661 // If we aren't being strict, enable the older (pre-RFC2396) loophole that
662 // allows this uri to inherit other properties from the base uri - even if
663 // the scheme is defined
664 if ( !(flags
& wxURI_STRICT
) &&
665 HasScheme() && base
.HasScheme() &&
666 m_scheme
== base
.m_scheme
)
668 m_fields
-= wxURI_SCHEME
;
672 // Do nothing if this is an absolute wxURI
673 // if defined(R.scheme) then
674 // T.scheme = R.scheme;
675 // T.authority = R.authority;
676 // T.path = remove_dot_segments(R.path);
677 // T.query = R.query;
681 //No scheme - inherit
682 m_scheme
= base
.m_scheme
;
683 m_fields
|= wxURI_SCHEME
;
685 // All we need to do for relative URIs with an
686 // authority component is just inherit the scheme
687 // if defined(R.authority) then
688 // T.authority = R.authority;
689 // T.path = remove_dot_segments(R.path);
690 // T.query = R.query;
694 //No authority - inherit
695 if (base
.HasUserInfo())
697 m_userinfo
= base
.m_userinfo
;
698 m_fields
|= wxURI_USERINFO
;
701 m_server
= base
.m_server
;
702 m_hostType
= base
.m_hostType
;
703 m_fields
|= wxURI_SERVER
;
707 m_port
= base
.m_port
;
708 m_fields
|= wxURI_PORT
;
712 // Simple path inheritance from base
715 // T.path = Base.path;
716 m_path
= base
.m_path
;
717 m_fields
|= wxURI_PATH
;
720 // if defined(R.query) then
721 // T.query = R.query;
723 // T.query = Base.query;
727 m_query
= base
.m_query
;
728 m_fields
|= wxURI_QUERY
;
731 else if ( m_path
.empty() || m_path
[0u] != '/' )
733 // if (R.path starts-with "/") then
734 // T.path = remove_dot_segments(R.path);
736 // T.path = merge(Base.path, R.path);
737 // T.path = remove_dot_segments(T.path);
739 // T.query = R.query;
741 // So we don't do anything for absolute paths and implement merge for
744 wxArrayString
our(SplitInSegments(m_path
)),
745 result(SplitInSegments(base
.m_path
));
747 if ( !result
.empty() )
752 // if we have an empty path it means we were constructed from a "."
753 // string or something similar (e.g. "././././"), it should count
754 // as (empty) segment
758 const wxArrayString::const_iterator end
= our
.end();
759 for ( wxArrayString::const_iterator i
= our
.begin(); i
!= end
; ++i
)
761 if ( i
->empty() || *i
== "." )
763 // as in ParsePath(), while normally we ignore the empty
764 // segments, we need to take account of them at the end
766 result
.push_back("");
772 if ( !result
.empty() )
777 result
.push_back("");
779 //else: just ignore, extra ".." don't accumulate
783 if ( result
.empty() )
785 // ensure that the resulting path will always be absolute
786 result
.push_back("");
789 result
.push_back(*i
);
793 m_path
= wxJoin(result
, '/', '\0');
796 //T.fragment = R.fragment;
799 // ---------------------------------------------------------------------------
802 // Parses 1 to 4 hex values. Returns true if the first character of the input
803 // string is a valid hex character. It is the caller's responsibility to move
804 // the input string back to its original position on failure.
805 // ---------------------------------------------------------------------------
807 bool wxURI::ParseH16(const char*& uri
)
813 if(IsHex(*++uri
) && IsHex(*++uri
) && IsHex(*++uri
))
819 // ---------------------------------------------------------------------------
822 // Parses a certain version of an IP address and moves the input string past
823 // it. Returns true if the input string contains the proper version of an IP
824 // address. It is the caller's responsibility to move the input string back
825 // to its original position on failure.
826 // ---------------------------------------------------------------------------
828 bool wxURI::ParseIPv4address(const char*& uri
)
830 //IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
832 //dec-octet = DIGIT ; 0-9
833 // / %x31-39 DIGIT ; 10-99
834 // / "1" 2DIGIT ; 100-199
835 // / "2" %x30-34 DIGIT ; 200-249
836 // / "25" %x30-35 ; 250-255
843 //each ip part must be between 0-255 (dupe of version in for loop)
844 if( IsDigit(*++uri
) && IsDigit(*++uri
) &&
845 //100 or less (note !)
846 !( (*(uri
-2) < '2') ||
849 (*(uri
-1) < '5' || (*(uri
-1) == '5' && *uri
<= '5'))
857 if(IsDigit(*uri
))++uri
;
859 //compilers should unroll this loop
860 for(; iIPv4
< 4; ++iIPv4
)
862 if (*uri
!= '.' || !IsDigit(*++uri
))
865 //each ip part must be between 0-255
866 if( IsDigit(*++uri
) && IsDigit(*++uri
) &&
867 //100 or less (note !)
868 !( (*(uri
-2) < '2') ||
871 (*(uri
-1) < '5' || (*(uri
-1) == '5' && *uri
<= '5'))
878 if(IsDigit(*uri
))++uri
;
884 bool wxURI::ParseIPv6address(const char*& uri
)
886 // IPv6address = 6( h16 ":" ) ls32
887 // / "::" 5( h16 ":" ) ls32
888 // / [ h16 ] "::" 4( h16 ":" ) ls32
889 // / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
890 // / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
891 // / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
892 // / [ *4( h16 ":" ) h16 ] "::" ls32
893 // / [ *5( h16 ":" ) h16 ] "::" h16
894 // / [ *6( h16 ":" ) h16 ] "::"
896 size_t numPrefix
= 0,
899 bool bEndHex
= false;
901 for( ; numPrefix
< 6; ++numPrefix
)
916 if(!bEndHex
&& !ParseH16(uri
))
935 if (*uri
!= ':' || *(uri
+1) != ':')
940 while (*--uri
!= ':') {}
943 const char * const start
= uri
;
945 // ls32 = ( h16 ":" h16 ) / IPv4address
946 if (ParseH16(uri
) && *uri
== ':' && ParseH16(uri
))
951 if (ParseIPv4address(uri
))
963 maxPostfix
= 4 - numPrefix
;
967 bool bAllowAltEnding
= maxPostfix
== 0;
969 for(; maxPostfix
!= 0; --maxPostfix
)
971 if(!ParseH16(uri
) || *uri
!= ':')
977 const char * const start
= uri
;
979 // ls32 = ( h16 ":" h16 ) / IPv4address
980 if (ParseH16(uri
) && *uri
== ':' && ParseH16(uri
))
985 if (ParseIPv4address(uri
))
990 if (!bAllowAltEnding
)
994 if(numPrefix
<= 5 && ParseH16(uri
))
1000 bool wxURI::ParseIPvFuture(const char*& uri
)
1002 // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
1003 if (*++uri
!= 'v' || !IsHex(*++uri
))
1006 while (IsHex(*++uri
))
1009 if (*uri
!= '.' || !(IsUnreserved(*++uri
) || IsSubDelim(*uri
) || *uri
== ':'))
1012 while(IsUnreserved(*++uri
) || IsSubDelim(*uri
) || *uri
== ':') {}
1018 // ---------------------------------------------------------------------------
1021 // Returns true if the passed in character meets the criteria of the method
1022 // ---------------------------------------------------------------------------
1024 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
1025 bool wxURI::IsUnreserved(char c
)
1027 return IsAlpha(c
) ||
1036 bool wxURI::IsReserved(char c
)
1038 return IsGenDelim(c
) || IsSubDelim(c
);
1041 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
1042 bool wxURI::IsGenDelim(char c
)
1053 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
1054 // / "*" / "+" / "," / ";" / "="
1055 bool wxURI::IsSubDelim(char c
)
1071 bool wxURI::IsHex(char c
)
1073 return IsDigit(c
) ||
1074 (c
>= 'a' && c
<= 'f') ||
1075 (c
>= 'A' && c
<= 'F');
1078 bool wxURI::IsAlpha(char c
)
1080 return (c
>= 'a' && c
<= 'z') || (c
>= 'A' && c
<= 'Z');
1083 bool wxURI::IsDigit(char c
)
1085 return c
>= '0' && c
<= '9';
1088 bool wxURI::IsEndPath(char c
)
1090 return c
== '\0' || c
== '#' || c
== '?';