]>
git.saurik.com Git - wxWidgets.git/blob - src/common/uri.cpp
1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Implementation of a URI parser
4 // Author: Ryan Norton,
5 // Vadim Zeitlin (UTF-8 URI support, many other changes)
8 // Copyright: (c) 2004 Ryan Norton,
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
13 // ===========================================================================
15 // ===========================================================================
17 // ---------------------------------------------------------------------------
19 // ---------------------------------------------------------------------------
21 // For compilers that support precompilation, includes "wx.h".
22 #include "wx/wxprec.h"
34 // ---------------------------------------------------------------------------
36 // ---------------------------------------------------------------------------
38 IMPLEMENT_CLASS(wxURI
, wxObject
)
40 // ===========================================================================
41 // wxURI implementation
42 // ===========================================================================
44 // ---------------------------------------------------------------------------
45 // Constructors and cleanup
46 // ---------------------------------------------------------------------------
49 : m_hostType(wxURI_REGNAME
),
54 wxURI::wxURI(const wxString
& uri
)
55 : m_hostType(wxURI_REGNAME
),
61 bool wxURI::Create(const wxString
& uri
)
66 return Parse(uri
.utf8_str());
77 m_fragment
= wxEmptyString
;
79 m_hostType
= wxURI_REGNAME
;
84 // ---------------------------------------------------------------------------
85 // Escaped characters handling
86 // ---------------------------------------------------------------------------
88 // Converts a character into a numeric hexadecimal value, or -1 if the passed
89 // in character is not a valid hex character
92 int wxURI::CharToHex(char c
)
94 if ((c
>= 'A') && (c
<= 'Z'))
96 if ((c
>= 'a') && (c
<= 'z'))
98 if ((c
>= '0') && (c
<= '9'))
104 int wxURI::DecodeEscape(wxString::const_iterator
& i
)
106 int hi
= CharToHex(*++i
);
110 int lo
= CharToHex(*++i
);
114 return (hi
<< 4) | lo
;
118 wxString
wxURI::Unescape(const wxString
& uri
)
120 // the unescaped version can't be longer than the original one
121 wxCharBuffer
buf(uri
.length());
122 char *p
= buf
.data();
124 for ( wxString::const_iterator i
= uri
.begin(); i
!= uri
.end(); ++i
, ++p
)
129 int n
= wxURI::DecodeEscape(i
);
133 wxASSERT_MSG( n
>= 0 && n
<= 0xff, "unexpected character value" );
135 c
= wx_static_cast(char, n
);
143 // by default assume that the URI is in UTF-8, this is the most common
145 wxString s
= wxString::FromUTF8(buf
);
148 // if it isn't, use latin-1 as a fallback -- at least this always
150 s
= wxCSConv(wxFONTENCODING_ISO8859_1
).cMB2WC(buf
);
156 void wxURI::AppendNextEscaped(wxString
& s
, const char *& p
)
158 // check for an already encoded character:
160 // pct-encoded = "%" HEXDIG HEXDIG
161 if ( p
[0] == '%' && IsHex(p
[1]) && IsHex(p
[2]) )
167 else // really needs escaping
169 static const char* hexDigits
= "0123456789abcdef";
174 s
+= hexDigits
[(c
>> 4) & 15];
175 s
+= hexDigits
[c
& 15];
179 // ---------------------------------------------------------------------------
183 // Gets the username and password via the old URL method.
184 // ---------------------------------------------------------------------------
185 wxString
wxURI::GetUser() const
187 size_t dwPasswordPos
= m_userinfo
.find(':');
189 if (dwPasswordPos
== wxString::npos
)
192 return m_userinfo(0, dwPasswordPos
);
195 wxString
wxURI::GetPassword() const
197 size_t dwPasswordPos
= m_userinfo
.find(':');
199 if (dwPasswordPos
== wxString::npos
)
202 return m_userinfo(dwPasswordPos
+1, m_userinfo
.length() + 1);
205 // combine all URI fields in a single string, applying funcDecode to each
206 // component which it may make sense to decode (i.e. "unescape")
207 wxString
wxURI::DoBuildURI(wxString (*funcDecode
)(const wxString
&)) const
212 ret
+= m_scheme
+ ":";
219 ret
+= funcDecode(m_userinfo
) + "@";
221 if (m_hostType
== wxURI_REGNAME
)
222 ret
+= funcDecode(m_server
);
230 ret
+= funcDecode(m_path
);
233 ret
+= "?" + funcDecode(m_query
);
236 ret
+= "#" + funcDecode(m_fragment
);
241 // ---------------------------------------------------------------------------
243 // ---------------------------------------------------------------------------
245 bool wxURI::operator==(const wxURI
& uri
) const
249 if(m_scheme
!= uri
.m_scheme
)
252 else if (uri
.HasScheme())
260 if (m_userinfo
!= uri
.m_userinfo
)
263 else if (uri
.HasUserInfo())
266 if (m_server
!= uri
.m_server
||
267 m_hostType
!= uri
.m_hostType
)
272 if(m_port
!= uri
.m_port
)
275 else if (uri
.HasPort())
278 else if (uri
.HasServer())
284 if(m_path
!= uri
.m_path
)
287 else if (uri
.HasPath())
292 if (m_query
!= uri
.m_query
)
295 else if (uri
.HasQuery())
300 if (m_fragment
!= uri
.m_fragment
)
303 else if (uri
.HasFragment())
309 // ---------------------------------------------------------------------------
312 // if there is no authority or scheme, it is a reference
313 // ---------------------------------------------------------------------------
315 bool wxURI::IsReference() const
317 return !HasScheme() || !HasServer();
320 // ---------------------------------------------------------------------------
323 // Master URI parsing method. Just calls the individual parsing methods
325 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
326 // URI-reference = URI / relative
327 // ---------------------------------------------------------------------------
329 bool wxURI::Parse(const char *uri
)
331 uri
= ParseScheme(uri
);
333 uri
= ParseAuthority(uri
);
335 uri
= ParsePath(uri
);
337 uri
= ParseQuery(uri
);
339 uri
= ParseFragment(uri
);
341 // we only succeed if we parsed the entire string
342 return uri
&& *uri
== '\0';
345 const char* wxURI::ParseScheme(const char *uri
)
347 const char * const start
= uri
;
349 // assume that we have a scheme if we have the valid start of it
354 //scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
355 while (IsAlpha(*uri
) || IsDigit(*uri
) ||
366 //mark the scheme as valid
367 m_fields
|= wxURI_SCHEME
;
369 //move reference point up to input buffer
372 else // no valid scheme finally
374 uri
= start
; // rewind
378 //else: can't have a scheme, possibly a relative URI
383 const char* wxURI::ParseAuthority(const char* uri
)
385 // authority = [ userinfo "@" ] host [ ":" port ]
386 if ( uri
[0] == '/' && uri
[1] == '/' )
388 //skip past the two slashes
391 // ############# DEVIATION FROM RFC #########################
392 // Don't parse the server component for file URIs
393 if(m_scheme
!= "file")
396 uri
= ParseUserInfo(uri
);
397 uri
= ParseServer(uri
);
398 return ParsePort(uri
);
405 const char* wxURI::ParseUserInfo(const char* uri
)
407 const char * const start
= uri
;
409 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
410 while ( *uri
&& *uri
!= '@' && *uri
!= '/' && *uri
!= '#' && *uri
!= '?' )
412 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || *uri
== ':' )
413 m_userinfo
+= *uri
++;
415 AppendNextEscaped(m_userinfo
, uri
);
421 m_fields
|= wxURI_USERINFO
;
425 uri
= start
; // rewind
432 const char* wxURI::ParseServer(const char* uri
)
434 const char * const start
= uri
;
436 // host = IP-literal / IPv4address / reg-name
437 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
441 if (ParseIPv6address(uri
) && *uri
== ']')
443 m_hostType
= wxURI_IPV6ADDRESS
;
445 m_server
.assign(start
+ 1, uri
- start
- 1);
450 uri
= start
+ 1; // skip the leading '[' again
452 if (ParseIPvFuture(uri
) && *uri
== ']')
454 m_hostType
= wxURI_IPVFUTURE
;
456 m_server
.assign(start
+ 1, uri
- start
- 1);
459 else // unrecognized IP literal
465 else // IPv4 or a reg-name
467 if (ParseIPv4address(uri
))
469 m_hostType
= wxURI_IPV4ADDRESS
;
471 m_server
.assign(start
, uri
- start
);
479 if ( m_hostType
== wxURI_REGNAME
)
482 // reg-name = *( unreserved / pct-encoded / sub-delims )
483 while ( *uri
&& *uri
!= '/' && *uri
!= ':' && *uri
!= '#' && *uri
!= '?' )
485 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) )
488 AppendNextEscaped(m_server
, uri
);
492 m_fields
|= wxURI_SERVER
;
498 const char* wxURI::ParsePort(const char* uri
)
504 while ( IsDigit(*uri
) )
509 m_fields
|= wxURI_PORT
;
515 const char* wxURI::ParsePath(const char* uri
)
517 /// hier-part = "//" authority path-abempty
522 /// relative-part = "//" authority path-abempty
527 /// path-abempty = *( "/" segment )
528 /// path-absolute = "/" [ segment-nz *( "/" segment ) ]
529 /// path-noscheme = segment-nz-nc *( "/" segment )
530 /// path-rootless = segment-nz *( "/" segment )
531 /// path-empty = 0<pchar>
534 /// segment-nz = 1*pchar
535 /// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
536 /// ; non-zero-length segment without any colon ":"
538 /// pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
540 if ( IsEndPath(*uri
) )
543 const bool isAbs
= *uri
== '/';
547 wxArrayString segments
;
551 const bool endPath
= IsEndPath(*uri
);
552 if ( endPath
|| *uri
== '/' )
554 // end of a segment, look at what we got
555 if ( segment
== ".." )
557 if ( !segments
.empty() && *segments
.rbegin() != ".." )
560 segments
.push_back("..");
562 else if ( segment
== "." )
564 // normally we ignore "." but the last one should be taken into
565 // account as "path/." is the same as "path/" and not just "path"
567 segments
.push_back("");
569 else // normal segment
571 segments
.push_back(segment
);
582 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || *uri
== ':' || *uri
== '@' )
585 AppendNextEscaped(segment
, uri
);
588 m_path
+= wxJoin(segments
, '/', '\0');
589 m_fields
|= wxURI_PATH
;
595 const char* wxURI::ParseQuery(const char* uri
)
597 // query = *( pchar / "/" / "?" )
601 while ( *uri
&& *uri
!= '#' )
603 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) ||
604 *uri
== ':' || *uri
== '@' || *uri
== '/' || *uri
== '?' )
607 AppendNextEscaped(m_query
, uri
);
610 m_fields
|= wxURI_QUERY
;
617 const char* wxURI::ParseFragment(const char* uri
)
619 // fragment = *( pchar / "/" / "?" )
625 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) ||
626 *uri
== ':' || *uri
== '@' || *uri
== '/' || *uri
== '?')
627 m_fragment
+= *uri
++;
629 AppendNextEscaped(m_fragment
, uri
);
632 m_fields
|= wxURI_FRAGMENT
;
638 // ---------------------------------------------------------------------------
641 // Builds missing components of this uri from a base uri
643 // A version of the algorithm outlined in the RFC is used here
644 // (it is shown in comments)
646 // Note that an empty URI inherits all components
647 // ---------------------------------------------------------------------------
650 wxArrayString
wxURI::SplitInSegments(const wxString
& path
)
652 return wxSplit(path
, '/', '\0' /* no escape character */);
655 void wxURI::Resolve(const wxURI
& base
, int flags
)
657 wxASSERT_MSG(!base
.IsReference(),
658 "wxURI to inherit from must not be a reference!");
660 // If we aren't being strict, enable the older (pre-RFC2396) loophole that
661 // allows this uri to inherit other properties from the base uri - even if
662 // the scheme is defined
663 if ( !(flags
& wxURI_STRICT
) &&
664 HasScheme() && base
.HasScheme() &&
665 m_scheme
== base
.m_scheme
)
667 m_fields
-= wxURI_SCHEME
;
671 // Do nothing if this is an absolute wxURI
672 // if defined(R.scheme) then
673 // T.scheme = R.scheme;
674 // T.authority = R.authority;
675 // T.path = remove_dot_segments(R.path);
676 // T.query = R.query;
680 //No scheme - inherit
681 m_scheme
= base
.m_scheme
;
682 m_fields
|= wxURI_SCHEME
;
684 // All we need to do for relative URIs with an
685 // authority component is just inherit the scheme
686 // if defined(R.authority) then
687 // T.authority = R.authority;
688 // T.path = remove_dot_segments(R.path);
689 // T.query = R.query;
693 //No authority - inherit
694 if (base
.HasUserInfo())
696 m_userinfo
= base
.m_userinfo
;
697 m_fields
|= wxURI_USERINFO
;
700 m_server
= base
.m_server
;
701 m_hostType
= base
.m_hostType
;
702 m_fields
|= wxURI_SERVER
;
706 m_port
= base
.m_port
;
707 m_fields
|= wxURI_PORT
;
711 // Simple path inheritance from base
714 // T.path = Base.path;
715 m_path
= base
.m_path
;
716 m_fields
|= wxURI_PATH
;
719 // if defined(R.query) then
720 // T.query = R.query;
722 // T.query = Base.query;
726 m_query
= base
.m_query
;
727 m_fields
|= wxURI_QUERY
;
730 else if ( m_path
.empty() || m_path
[0u] != '/' )
732 // if (R.path starts-with "/") then
733 // T.path = remove_dot_segments(R.path);
735 // T.path = merge(Base.path, R.path);
736 // T.path = remove_dot_segments(T.path);
738 // T.query = R.query;
740 // So we don't do anything for absolute paths and implement merge for
743 wxArrayString
our(SplitInSegments(m_path
)),
744 result(SplitInSegments(base
.m_path
));
746 if ( !result
.empty() )
751 // if we have an empty path it means we were constructed from a "."
752 // string or something similar (e.g. "././././"), it should count
753 // as (empty) segment
757 const wxArrayString::const_iterator end
= our
.end();
758 for ( wxArrayString::const_iterator i
= our
.begin(); i
!= end
; ++i
)
760 if ( i
->empty() || *i
== "." )
762 // as in ParsePath(), while normally we ignore the empty
763 // segments, we need to take account of them at the end
765 result
.push_back("");
771 if ( !result
.empty() )
776 result
.push_back("");
778 //else: just ignore, extra ".." don't accumulate
782 if ( result
.empty() )
784 // ensure that the resulting path will always be absolute
785 result
.push_back("");
788 result
.push_back(*i
);
792 m_path
= wxJoin(result
, '/', '\0');
795 //T.fragment = R.fragment;
798 // ---------------------------------------------------------------------------
801 // Parses 1 to 4 hex values. Returns true if the first character of the input
802 // string is a valid hex character. It is the caller's responsibility to move
803 // the input string back to its original position on failure.
804 // ---------------------------------------------------------------------------
806 bool wxURI::ParseH16(const char*& uri
)
812 if(IsHex(*++uri
) && IsHex(*++uri
) && IsHex(*++uri
))
818 // ---------------------------------------------------------------------------
821 // Parses a certain version of an IP address and moves the input string past
822 // it. Returns true if the input string contains the proper version of an ip
823 // address. It is the caller's responsibility to move the input string back
824 // to its original position on failure.
825 // ---------------------------------------------------------------------------
827 bool wxURI::ParseIPv4address(const char*& uri
)
829 //IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
831 //dec-octet = DIGIT ; 0-9
832 // / %x31-39 DIGIT ; 10-99
833 // / "1" 2DIGIT ; 100-199
834 // / "2" %x30-34 DIGIT ; 200-249
835 // / "25" %x30-35 ; 250-255
842 //each ip part must be between 0-255 (dupe of version in for loop)
843 if( IsDigit(*++uri
) && IsDigit(*++uri
) &&
844 //100 or less (note !)
845 !( (*(uri
-2) < '2') ||
848 (*(uri
-1) < '5' || (*(uri
-1) == '5' && *uri
<= '5'))
856 if(IsDigit(*uri
))++uri
;
858 //compilers should unroll this loop
859 for(; iIPv4
< 4; ++iIPv4
)
861 if (*uri
!= '.' || !IsDigit(*++uri
))
864 //each ip part must be between 0-255
865 if( IsDigit(*++uri
) && IsDigit(*++uri
) &&
866 //100 or less (note !)
867 !( (*(uri
-2) < '2') ||
870 (*(uri
-1) < '5' || (*(uri
-1) == '5' && *uri
<= '5'))
877 if(IsDigit(*uri
))++uri
;
883 bool wxURI::ParseIPv6address(const char*& uri
)
885 // IPv6address = 6( h16 ":" ) ls32
886 // / "::" 5( h16 ":" ) ls32
887 // / [ h16 ] "::" 4( h16 ":" ) ls32
888 // / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
889 // / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
890 // / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
891 // / [ *4( h16 ":" ) h16 ] "::" ls32
892 // / [ *5( h16 ":" ) h16 ] "::" h16
893 // / [ *6( h16 ":" ) h16 ] "::"
895 size_t numPrefix
= 0,
898 bool bEndHex
= false;
900 for( ; numPrefix
< 6; ++numPrefix
)
915 if(!bEndHex
&& !ParseH16(uri
))
934 if (*uri
!= ':' || *(uri
+1) != ':')
939 while (*--uri
!= ':') {}
942 const char * const start
= uri
;
944 // ls32 = ( h16 ":" h16 ) / IPv4address
945 if (ParseH16(uri
) && *uri
== ':' && ParseH16(uri
))
950 if (ParseIPv4address(uri
))
962 maxPostfix
= 4 - numPrefix
;
966 bool bAllowAltEnding
= maxPostfix
== 0;
968 for(; maxPostfix
!= 0; --maxPostfix
)
970 if(!ParseH16(uri
) || *uri
!= ':')
976 const char * const start
= uri
;
978 // ls32 = ( h16 ":" h16 ) / IPv4address
979 if (ParseH16(uri
) && *uri
== ':' && ParseH16(uri
))
984 if (ParseIPv4address(uri
))
989 if (!bAllowAltEnding
)
993 if(numPrefix
<= 5 && ParseH16(uri
))
999 bool wxURI::ParseIPvFuture(const char*& uri
)
1001 // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
1002 if (*++uri
!= 'v' || !IsHex(*++uri
))
1005 while (IsHex(*++uri
))
1008 if (*uri
!= '.' || !(IsUnreserved(*++uri
) || IsSubDelim(*uri
) || *uri
== ':'))
1011 while(IsUnreserved(*++uri
) || IsSubDelim(*uri
) || *uri
== ':') {}
1017 // ---------------------------------------------------------------------------
1020 // Returns true if the passed in character meets the criteria of the method
1021 // ---------------------------------------------------------------------------
1023 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
1024 bool wxURI::IsUnreserved(char c
)
1026 return IsAlpha(c
) ||
1035 bool wxURI::IsReserved(char c
)
1037 return IsGenDelim(c
) || IsSubDelim(c
);
1040 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
1041 bool wxURI::IsGenDelim(char c
)
1052 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
1053 // / "*" / "+" / "," / ";" / "="
1054 bool wxURI::IsSubDelim(char c
)
1070 bool wxURI::IsHex(char c
)
1072 return IsDigit(c
) ||
1073 (c
>= 'a' && c
<= 'f') ||
1074 (c
>= 'A' && c
<= 'F');
1077 bool wxURI::IsAlpha(char c
)
1079 return (c
>= 'a' && c
<= 'z') || (c
>= 'A' && c
<= 'Z');
1082 bool wxURI::IsDigit(char c
)
1084 return c
>= '0' && c
<= '9';
1087 bool wxURI::IsEndPath(char c
)
1089 return c
== '\0' || c
== '#' || c
== '?';