]>
git.saurik.com Git - wxWidgets.git/blob - src/common/uri.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/uri.cpp
3 // Purpose: Implementation of a URI parser
4 // Author: Ryan Norton,
5 // Vadim Zeitlin (UTF-8 URI support, many other changes)
7 // Copyright: (c) 2004 Ryan Norton,
9 // Licence: wxWindows licence
10 /////////////////////////////////////////////////////////////////////////////
12 // ===========================================================================
14 // ===========================================================================
16 // ---------------------------------------------------------------------------
18 // ---------------------------------------------------------------------------
20 // For compilers that support precompilation, includes "wx.h".
21 #include "wx/wxprec.h"
33 // ---------------------------------------------------------------------------
35 // ---------------------------------------------------------------------------
37 IMPLEMENT_CLASS(wxURI
, wxObject
)
39 // ===========================================================================
40 // wxURI implementation
41 // ===========================================================================
43 // ---------------------------------------------------------------------------
44 // Constructors and cleanup
45 // ---------------------------------------------------------------------------
48 : m_hostType(wxURI_REGNAME
),
53 wxURI::wxURI(const wxString
& uri
)
54 : m_hostType(wxURI_REGNAME
),
60 bool wxURI::Create(const wxString
& uri
)
65 return Parse(uri
.utf8_str());
76 m_fragment
= wxEmptyString
;
78 m_hostType
= wxURI_REGNAME
;
83 // ---------------------------------------------------------------------------
84 // Escaped characters handling
85 // ---------------------------------------------------------------------------
87 // Converts a character into a numeric hexadecimal value, or -1 if the passed
88 // in character is not a valid hex character
91 int wxURI::CharToHex(char c
)
93 if ((c
>= 'A') && (c
<= 'Z'))
95 if ((c
>= 'a') && (c
<= 'z'))
97 if ((c
>= '0') && (c
<= '9'))
103 int wxURI::DecodeEscape(wxString::const_iterator
& i
)
105 int hi
= CharToHex(*++i
);
109 int lo
= CharToHex(*++i
);
113 return (hi
<< 4) | lo
;
117 wxString
wxURI::Unescape(const wxString
& uri
)
119 // the unescaped version can't be longer than the original one
120 wxCharBuffer
buf(uri
.length());
121 char *p
= buf
.data();
123 for ( wxString::const_iterator i
= uri
.begin(); i
!= uri
.end(); ++i
, ++p
)
128 int n
= wxURI::DecodeEscape(i
);
132 wxASSERT_MSG( n
>= 0 && n
<= 0xff, "unexpected character value" );
134 c
= static_cast<char>(n
);
142 // by default assume that the URI is in UTF-8, this is the most common
144 wxString s
= wxString::FromUTF8(buf
);
147 // if it isn't, use latin-1 as a fallback -- at least this always
149 s
= wxCSConv(wxFONTENCODING_ISO8859_1
).cMB2WC(buf
);
155 void wxURI::AppendNextEscaped(wxString
& s
, const char *& p
)
157 // check for an already encoded character:
159 // pct-encoded = "%" HEXDIG HEXDIG
160 if ( p
[0] == '%' && IsHex(p
[1]) && IsHex(p
[2]) )
166 else // really needs escaping
168 static const char* hexDigits
= "0123456789abcdef";
173 s
+= hexDigits
[(c
>> 4) & 15];
174 s
+= hexDigits
[c
& 15];
178 // ---------------------------------------------------------------------------
182 // Gets the username and password via the old URL method.
183 // ---------------------------------------------------------------------------
184 wxString
wxURI::GetUser() const
186 // if there is no colon at all, find() returns npos and this method returns
187 // the entire string which is correct as it means that password was omitted
188 return m_userinfo(0, m_userinfo
.find(':'));
191 wxString
wxURI::GetPassword() const
193 size_t posColon
= m_userinfo
.find(':');
195 if ( posColon
== wxString::npos
)
198 return m_userinfo(posColon
+ 1, wxString::npos
);
201 // combine all URI fields in a single string, applying funcDecode to each
202 // component which it may make sense to decode (i.e. "unescape")
203 wxString
wxURI::DoBuildURI(wxString (*funcDecode
)(const wxString
&)) const
208 ret
+= m_scheme
+ ":";
215 ret
+= funcDecode(m_userinfo
) + "@";
217 if (m_hostType
== wxURI_REGNAME
)
218 ret
+= funcDecode(m_server
);
226 ret
+= funcDecode(m_path
);
229 ret
+= "?" + funcDecode(m_query
);
232 ret
+= "#" + funcDecode(m_fragment
);
237 // ---------------------------------------------------------------------------
239 // ---------------------------------------------------------------------------
241 bool wxURI::operator==(const wxURI
& uri
) const
245 if(m_scheme
!= uri
.m_scheme
)
248 else if (uri
.HasScheme())
256 if (m_userinfo
!= uri
.m_userinfo
)
259 else if (uri
.HasUserInfo())
262 if (m_server
!= uri
.m_server
||
263 m_hostType
!= uri
.m_hostType
)
268 if(m_port
!= uri
.m_port
)
271 else if (uri
.HasPort())
274 else if (uri
.HasServer())
280 if(m_path
!= uri
.m_path
)
283 else if (uri
.HasPath())
288 if (m_query
!= uri
.m_query
)
291 else if (uri
.HasQuery())
296 if (m_fragment
!= uri
.m_fragment
)
299 else if (uri
.HasFragment())
305 // ---------------------------------------------------------------------------
308 // if there is no authority or scheme, it is a reference
309 // ---------------------------------------------------------------------------
311 bool wxURI::IsReference() const
313 return !HasScheme() || !HasServer();
316 // ---------------------------------------------------------------------------
319 // Master URI parsing method. Just calls the individual parsing methods
321 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
322 // URI-reference = URI / relative
323 // ---------------------------------------------------------------------------
325 bool wxURI::Parse(const char *uri
)
327 uri
= ParseScheme(uri
);
329 uri
= ParseAuthority(uri
);
331 uri
= ParsePath(uri
);
333 uri
= ParseQuery(uri
);
335 uri
= ParseFragment(uri
);
337 // we only succeed if we parsed the entire string
338 return uri
&& *uri
== '\0';
341 const char* wxURI::ParseScheme(const char *uri
)
343 const char * const start
= uri
;
345 // assume that we have a scheme if we have the valid start of it
350 //scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
351 while (IsAlpha(*uri
) || IsDigit(*uri
) ||
362 //mark the scheme as valid
363 m_fields
|= wxURI_SCHEME
;
365 //move reference point up to input buffer
368 else // no valid scheme finally
370 uri
= start
; // rewind
374 //else: can't have a scheme, possibly a relative URI
379 const char* wxURI::ParseAuthority(const char* uri
)
381 // authority = [ userinfo "@" ] host [ ":" port ]
382 if ( uri
[0] == '/' && uri
[1] == '/' )
384 //skip past the two slashes
387 // ############# DEVIATION FROM RFC #########################
388 // Don't parse the server component for file URIs
389 if(m_scheme
!= "file")
392 uri
= ParseUserInfo(uri
);
393 uri
= ParseServer(uri
);
394 return ParsePort(uri
);
401 const char* wxURI::ParseUserInfo(const char* uri
)
403 const char * const start
= uri
;
405 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
406 while ( *uri
&& *uri
!= '@' && *uri
!= '/' && *uri
!= '#' && *uri
!= '?' )
408 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || *uri
== ':' )
409 m_userinfo
+= *uri
++;
411 AppendNextEscaped(m_userinfo
, uri
);
417 m_fields
|= wxURI_USERINFO
;
421 uri
= start
; // rewind
428 const char* wxURI::ParseServer(const char* uri
)
430 const char * const start
= uri
;
432 // host = IP-literal / IPv4address / reg-name
433 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
437 if (ParseIPv6address(uri
) && *uri
== ']')
439 m_hostType
= wxURI_IPV6ADDRESS
;
441 m_server
.assign(start
+ 1, uri
- start
- 1);
446 uri
= start
+ 1; // skip the leading '[' again
448 if (ParseIPvFuture(uri
) && *uri
== ']')
450 m_hostType
= wxURI_IPVFUTURE
;
452 m_server
.assign(start
+ 1, uri
- start
- 1);
455 else // unrecognized IP literal
461 else // IPv4 or a reg-name
463 if (ParseIPv4address(uri
))
465 m_hostType
= wxURI_IPV4ADDRESS
;
467 m_server
.assign(start
, uri
- start
);
475 if ( m_hostType
== wxURI_REGNAME
)
478 // reg-name = *( unreserved / pct-encoded / sub-delims )
479 while ( *uri
&& *uri
!= '/' && *uri
!= ':' && *uri
!= '#' && *uri
!= '?' )
481 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) )
484 AppendNextEscaped(m_server
, uri
);
488 m_fields
|= wxURI_SERVER
;
494 const char* wxURI::ParsePort(const char* uri
)
500 while ( IsDigit(*uri
) )
505 m_fields
|= wxURI_PORT
;
511 const char* wxURI::ParsePath(const char* uri
)
513 /// hier-part = "//" authority path-abempty
518 /// relative-part = "//" authority path-abempty
523 /// path-abempty = *( "/" segment )
524 /// path-absolute = "/" [ segment-nz *( "/" segment ) ]
525 /// path-noscheme = segment-nz-nc *( "/" segment )
526 /// path-rootless = segment-nz *( "/" segment )
527 /// path-empty = 0<pchar>
530 /// segment-nz = 1*pchar
531 /// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
532 /// ; non-zero-length segment without any colon ":"
534 /// pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
536 if ( IsEndPath(*uri
) )
539 const bool isAbs
= *uri
== '/';
543 wxArrayString segments
;
547 const bool endPath
= IsEndPath(*uri
);
548 if ( endPath
|| *uri
== '/' )
550 // end of a segment, look at what we got
551 if ( segment
== ".." )
553 if ( !segments
.empty() && *segments
.rbegin() != ".." )
556 segments
.push_back("..");
558 else if ( segment
== "." )
560 // normally we ignore "." but the last one should be taken into
561 // account as "path/." is the same as "path/" and not just "path"
563 segments
.push_back("");
565 else // normal segment
567 segments
.push_back(segment
);
578 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) || *uri
== ':' || *uri
== '@' )
581 AppendNextEscaped(segment
, uri
);
584 m_path
+= wxJoin(segments
, '/', '\0');
585 m_fields
|= wxURI_PATH
;
591 const char* wxURI::ParseQuery(const char* uri
)
593 // query = *( pchar / "/" / "?" )
597 while ( *uri
&& *uri
!= '#' )
599 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) ||
600 *uri
== ':' || *uri
== '@' || *uri
== '/' || *uri
== '?' )
603 AppendNextEscaped(m_query
, uri
);
606 m_fields
|= wxURI_QUERY
;
613 const char* wxURI::ParseFragment(const char* uri
)
615 // fragment = *( pchar / "/" / "?" )
621 if ( IsUnreserved(*uri
) || IsSubDelim(*uri
) ||
622 *uri
== ':' || *uri
== '@' || *uri
== '/' || *uri
== '?')
623 m_fragment
+= *uri
++;
625 AppendNextEscaped(m_fragment
, uri
);
628 m_fields
|= wxURI_FRAGMENT
;
634 // ---------------------------------------------------------------------------
637 // Builds missing components of this uri from a base uri
639 // A version of the algorithm outlined in the RFC is used here
640 // (it is shown in comments)
642 // Note that an empty URI inherits all components
643 // ---------------------------------------------------------------------------
646 wxArrayString
wxURI::SplitInSegments(const wxString
& path
)
648 return wxSplit(path
, '/', '\0' /* no escape character */);
651 void wxURI::Resolve(const wxURI
& base
, int flags
)
653 wxASSERT_MSG(!base
.IsReference(),
654 "wxURI to inherit from must not be a reference!");
656 // If we aren't being strict, enable the older (pre-RFC2396) loophole that
657 // allows this uri to inherit other properties from the base uri - even if
658 // the scheme is defined
659 if ( !(flags
& wxURI_STRICT
) &&
660 HasScheme() && base
.HasScheme() &&
661 m_scheme
== base
.m_scheme
)
663 m_fields
-= wxURI_SCHEME
;
667 // Do nothing if this is an absolute wxURI
668 // if defined(R.scheme) then
669 // T.scheme = R.scheme;
670 // T.authority = R.authority;
671 // T.path = remove_dot_segments(R.path);
672 // T.query = R.query;
676 //No scheme - inherit
677 m_scheme
= base
.m_scheme
;
678 m_fields
|= wxURI_SCHEME
;
680 // All we need to do for relative URIs with an
681 // authority component is just inherit the scheme
682 // if defined(R.authority) then
683 // T.authority = R.authority;
684 // T.path = remove_dot_segments(R.path);
685 // T.query = R.query;
689 //No authority - inherit
690 if (base
.HasUserInfo())
692 m_userinfo
= base
.m_userinfo
;
693 m_fields
|= wxURI_USERINFO
;
696 m_server
= base
.m_server
;
697 m_hostType
= base
.m_hostType
;
698 m_fields
|= wxURI_SERVER
;
702 m_port
= base
.m_port
;
703 m_fields
|= wxURI_PORT
;
707 // Simple path inheritance from base
710 // T.path = Base.path;
711 m_path
= base
.m_path
;
712 m_fields
|= wxURI_PATH
;
715 // if defined(R.query) then
716 // T.query = R.query;
718 // T.query = Base.query;
722 m_query
= base
.m_query
;
723 m_fields
|= wxURI_QUERY
;
726 else if ( m_path
.empty() || m_path
[0u] != '/' )
728 // if (R.path starts-with "/") then
729 // T.path = remove_dot_segments(R.path);
731 // T.path = merge(Base.path, R.path);
732 // T.path = remove_dot_segments(T.path);
734 // T.query = R.query;
736 // So we don't do anything for absolute paths and implement merge for
739 wxArrayString
our(SplitInSegments(m_path
)),
740 result(SplitInSegments(base
.m_path
));
742 if ( !result
.empty() )
747 // if we have an empty path it means we were constructed from a "."
748 // string or something similar (e.g. "././././"), it should count
749 // as (empty) segment
753 const wxArrayString::const_iterator end
= our
.end();
754 for ( wxArrayString::const_iterator i
= our
.begin(); i
!= end
; ++i
)
756 if ( i
->empty() || *i
== "." )
758 // as in ParsePath(), while normally we ignore the empty
759 // segments, we need to take account of them at the end
761 result
.push_back("");
767 if ( !result
.empty() )
772 result
.push_back("");
774 //else: just ignore, extra ".." don't accumulate
778 if ( result
.empty() )
780 // ensure that the resulting path will always be absolute
781 result
.push_back("");
784 result
.push_back(*i
);
788 m_path
= wxJoin(result
, '/', '\0');
791 //T.fragment = R.fragment;
794 // ---------------------------------------------------------------------------
797 // Parses 1 to 4 hex values. Returns true if the first character of the input
798 // string is a valid hex character. It is the caller's responsibility to move
799 // the input string back to its original position on failure.
800 // ---------------------------------------------------------------------------
802 bool wxURI::ParseH16(const char*& uri
)
808 if(IsHex(*++uri
) && IsHex(*++uri
) && IsHex(*++uri
))
814 // ---------------------------------------------------------------------------
817 // Parses a certain version of an IP address and moves the input string past
818 // it. Returns true if the input string contains the proper version of an IP
819 // address. It is the caller's responsibility to move the input string back
820 // to its original position on failure.
821 // ---------------------------------------------------------------------------
823 bool wxURI::ParseIPv4address(const char*& uri
)
825 //IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
827 //dec-octet = DIGIT ; 0-9
828 // / %x31-39 DIGIT ; 10-99
829 // / "1" 2DIGIT ; 100-199
830 // / "2" %x30-34 DIGIT ; 200-249
831 // / "25" %x30-35 ; 250-255
838 //each ip part must be between 0-255 (dupe of version in for loop)
839 if( IsDigit(*++uri
) && IsDigit(*++uri
) &&
840 //100 or less (note !)
841 !( (*(uri
-2) < '2') ||
844 (*(uri
-1) < '5' || (*(uri
-1) == '5' && *uri
<= '5'))
852 if(IsDigit(*uri
))++uri
;
854 //compilers should unroll this loop
855 for(; iIPv4
< 4; ++iIPv4
)
857 if (*uri
!= '.' || !IsDigit(*++uri
))
860 //each ip part must be between 0-255
861 if( IsDigit(*++uri
) && IsDigit(*++uri
) &&
862 //100 or less (note !)
863 !( (*(uri
-2) < '2') ||
866 (*(uri
-1) < '5' || (*(uri
-1) == '5' && *uri
<= '5'))
873 if(IsDigit(*uri
))++uri
;
879 bool wxURI::ParseIPv6address(const char*& uri
)
881 // IPv6address = 6( h16 ":" ) ls32
882 // / "::" 5( h16 ":" ) ls32
883 // / [ h16 ] "::" 4( h16 ":" ) ls32
884 // / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
885 // / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
886 // / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
887 // / [ *4( h16 ":" ) h16 ] "::" ls32
888 // / [ *5( h16 ":" ) h16 ] "::" h16
889 // / [ *6( h16 ":" ) h16 ] "::"
891 size_t numPrefix
= 0,
894 bool bEndHex
= false;
896 for( ; numPrefix
< 6; ++numPrefix
)
911 if(!bEndHex
&& !ParseH16(uri
))
930 if (*uri
!= ':' || *(uri
+1) != ':')
935 while (*--uri
!= ':') {}
938 const char * const start
= uri
;
940 // ls32 = ( h16 ":" h16 ) / IPv4address
941 if (ParseH16(uri
) && *uri
== ':' && ParseH16(uri
))
946 if (ParseIPv4address(uri
))
958 maxPostfix
= 4 - numPrefix
;
962 bool bAllowAltEnding
= maxPostfix
== 0;
964 for(; maxPostfix
!= 0; --maxPostfix
)
966 if(!ParseH16(uri
) || *uri
!= ':')
972 const char * const start
= uri
;
974 // ls32 = ( h16 ":" h16 ) / IPv4address
975 if (ParseH16(uri
) && *uri
== ':' && ParseH16(uri
))
980 if (ParseIPv4address(uri
))
985 if (!bAllowAltEnding
)
989 if(numPrefix
<= 5 && ParseH16(uri
))
995 bool wxURI::ParseIPvFuture(const char*& uri
)
997 // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
998 if (*++uri
!= 'v' || !IsHex(*++uri
))
1001 while (IsHex(*++uri
))
1004 if (*uri
!= '.' || !(IsUnreserved(*++uri
) || IsSubDelim(*uri
) || *uri
== ':'))
1007 while(IsUnreserved(*++uri
) || IsSubDelim(*uri
) || *uri
== ':') {}
1013 // ---------------------------------------------------------------------------
1016 // Returns true if the passed-in character meets the criteria of the method
1017 // ---------------------------------------------------------------------------
1019 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
1020 bool wxURI::IsUnreserved(char c
)
1022 return IsAlpha(c
) ||
1031 bool wxURI::IsReserved(char c
)
1033 return IsGenDelim(c
) || IsSubDelim(c
);
1036 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
1037 bool wxURI::IsGenDelim(char c
)
1048 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
1049 // / "*" / "+" / "," / ";" / "="
1050 bool wxURI::IsSubDelim(char c
)
1066 bool wxURI::IsHex(char c
)
1068 return IsDigit(c
) ||
1069 (c
>= 'a' && c
<= 'f') ||
1070 (c
>= 'A' && c
<= 'F');
1073 bool wxURI::IsAlpha(char c
)
1075 return (c
>= 'a' && c
<= 'z') || (c
>= 'A' && c
<= 'Z');
1078 bool wxURI::IsDigit(char c
)
1080 return c
>= '0' && c
<= '9';
1083 bool wxURI::IsEndPath(char c
)
1085 return c
== '\0' || c
== '#' || c
== '?';