]>
git.saurik.com Git - apple/javascriptcore.git/blob - wtf/url/src/URLParser.h
4d5ca51fd1d5a1a74593607698c87e07f53925a8
1 /* Based on nsURLParsers.cc from Mozilla
2 * -------------------------------------
3 * Copyright (C) 1998 Netscape Communications Corporation.
6 * Darin Fisher (original author)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * Alternatively, the contents of this file may be used under the terms
23 * of either the Mozilla Public License Version 1.1, found at
24 * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
25 * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
26 * (the "GPL"), in which case the provisions of the MPL or the GPL are
27 * applicable instead of those above. If you wish to allow use of your
28 * version of this file only under the terms of one of those two
29 * licenses (the MPL or the GPL) and not to allow others to use your
30 * version of this file under the LGPL, indicate your decision by
31 * deleting the provisions above and replace them with the notice and
32 * other provisions required by the MPL or the GPL, as the case may be.
33 * If you do not delete the provisions above, a recipient may use your
34 * version of this file under any of the LGPL, the MPL or the GPL.
40 #include "URLComponent.h"
41 #include "URLSegments.h"
45 template<typename CHAR
>
53 // This handles everything that may be an authority terminator, including
54 // backslash. For special backslash handling see parseAfterScheme.
55 static bool isPossibleAuthorityTerminator(CHAR ch
)
57 return isURLSlash(ch
) || ch
== '?' || ch
== '#' || ch
== ';';
60 // Given an already-identified auth section, breaks it into its constituent
61 // parts. The port number will be parsed and the resulting integer will be
62 // filled into the given *port variable, or -1 if there is no port number
64 static void parseAuthority(const CHAR
* spec
, const URLComponent
& auth
, URLComponent
& username
, URLComponent
& password
, URLComponent
& host
, URLComponent
& port
)
66 // FIXME: add ASSERT(auth.isValid()); // We should always get an authority.
75 // Search backwards for @, which is the separator between the user info
76 // and the server info. RFC 3986 forbids @ from occurring in auth, but
77 // someone might include it in a password unescaped.
78 int i
= auth
.begin() + auth
.length() - 1;
79 while (i
> auth
.begin() && spec
[i
] != '@')
83 // Found user info: <user-info>@<server-info>
84 parseUserInfo(spec
, URLComponent(auth
.begin(), i
- auth
.begin()), username
, password
);
85 parseServerInfo(spec
, URLComponent::fromRange(i
+ 1, auth
.begin() + auth
.length()), host
, port
);
87 // No user info, everything is server info.
90 parseServerInfo(spec
, auth
, host
, port
);
94 static bool extractScheme(const CHAR
* spec
, int specLength
, URLComponent
& scheme
)
96 // Skip leading whitespace and control characters.
98 while (begin
< specLength
&& shouldTrimFromURL(spec
[begin
]))
100 if (begin
== specLength
)
101 return false; // Input is empty or all whitespace.
103 // Find the first colon character.
104 for (int i
= begin
; i
< specLength
; i
++) {
105 if (spec
[i
] == ':') {
106 scheme
= URLComponent::fromRange(begin
, i
);
110 return false; // No colon found: no scheme
113 // Fills in all members of the URLSegments structure (except for the
114 // scheme) for standard URLs.
116 // |spec| is the full spec being parsed, of length |specLength|.
117 // |afterScheme| is the index of the character immediately following the
118 // scheme (after the colon), where we'll begin parsing.
119 static void parseAfterScheme(const CHAR
* spec
, int specLength
, int afterScheme
, URLSegments
& parsed
)
121 int numberOfSlashes
= consecutiveSlashes(spec
, afterScheme
, specLength
);
122 int afterSlashes
= afterScheme
+ numberOfSlashes
;
124 // First split into two main parts, the authority (username, password,
125 // host, and port) and the full path (path, query, and reference).
126 URLComponent authority
;
127 URLComponent fullPath
;
129 // Found "//<some data>", looks like an authority section. Treat
130 // everything from there to the next slash (or end of spec) to be the
131 // authority. Note that we ignore the number of slashes and treat it as
133 int authEnd
= nextAuthorityTerminator(spec
, afterSlashes
, specLength
);
134 authority
= URLComponent(afterSlashes
, authEnd
- afterSlashes
);
136 if (authEnd
== specLength
) // No beginning of path found.
137 fullPath
= URLComponent();
138 else // Everything starting from the slash to the end is the path.
139 fullPath
= URLComponent(authEnd
, specLength
- authEnd
);
141 // Now parse those two sub-parts.
142 parseAuthority(spec
, authority
, parsed
.username
, parsed
.password
, parsed
.host
, parsed
.port
);
143 parsePath(spec
, fullPath
, parsed
.path
, parsed
.query
, parsed
.fragment
);
146 // The main parsing function for standard URLs. Standard URLs have a scheme,
148 static void parseStandardURL(const CHAR
* spec
, int specLength
, URLSegments
& parsed
)
150 // FIXME: add ASSERT(specLength >= 0);
152 // Strip leading & trailing spaces and control characters.
154 trimURL(spec
, begin
, specLength
);
157 if (extractScheme(spec
, specLength
, parsed
.scheme
))
158 afterScheme
= parsed
.scheme
.end() + 1; // Skip past the colon.
160 // Say there's no scheme when there is no colon. We could also say
161 // that everything is the scheme. Both would produce an invalid
162 // URL, but this way seems less wrong in more cases.
163 parsed
.scheme
.reset();
166 parseAfterScheme(spec
, specLength
, afterScheme
, parsed
);
169 static void parsePath(const CHAR
* spec
, const URLComponent
& path
, URLComponent
& filepath
, URLComponent
& query
, URLComponent
& fragment
)
171 // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<fragment>
173 // Special case when there is no path.
174 if (!path
.isValid()) {
180 // FIXME: add ASSERT(path.length() > 0); // We should never have 0 length paths.
182 // Search for first occurrence of either ? or #.
183 int pathEnd
= path
.begin() + path
.length();
185 int querySeparator
= -1; // Index of the '?'
186 int refSeparator
= -1; // Index of the '#'
187 for (int i
= path
.begin(); i
< pathEnd
; i
++) {
190 if (querySeparator
< 0)
195 i
= pathEnd
; // Break out of the loop.
202 // Markers pointing to the character after each of these corresponding
203 // components. The code below works from the end back to the beginning,
204 // and will update these indices as it finds components that exist.
205 int fileEnd
, queryEnd
;
207 // Fragment: from the # to the end of the path.
208 if (refSeparator
>= 0) {
209 fileEnd
= refSeparator
;
210 queryEnd
= refSeparator
;
211 fragment
= URLComponent::fromRange(refSeparator
+ 1, pathEnd
);
218 // Query: everything from the ? to the next boundary (either the end of
219 // the path or the start of the fragment).
220 if (querySeparator
>= 0) {
221 fileEnd
= querySeparator
;
222 query
= URLComponent::fromRange(querySeparator
+ 1, queryEnd
);
226 // File path: treat an empty file path as no file path.
227 if (fileEnd
!= path
.begin())
228 filepath
= URLComponent::fromRange(path
.begin(), fileEnd
);
233 // Initializes a path URL which is merely a scheme followed by a path.
234 // Examples include "about:foo" and "javascript:alert('bar');"
235 static void parsePathURL(const CHAR
* spec
, int specLength
, URLSegments
& parsed
)
237 // Get the non-path and non-scheme parts of the URL out of the way, we
239 parsed
.username
.reset();
240 parsed
.password
.reset();
243 parsed
.query
.reset();
244 parsed
.fragment
.reset();
246 // Strip leading & trailing spaces and control characters.
247 // FIXME: Perhaps this is unnecessary?
249 trimURL(spec
, begin
, specLength
);
251 // Handle empty specs or ones that contain only whitespace or control
253 if (begin
== specLength
) {
254 parsed
.scheme
.reset();
259 // Extract the scheme, with the path being everything following. We also
260 // handle the case where there is no scheme.
261 if (extractScheme(&spec
[begin
], specLength
- begin
, parsed
.scheme
)) {
262 // Offset the results since we gave extractScheme a substring.
263 parsed
.scheme
.setBegin(parsed
.scheme
.begin() + begin
);
265 // For compatibility with the standard URL parser, we treat no path
266 // as -1, rather than having a length of 0 (we normally wouldn't
267 // care so much for these non-standard URLs).
268 if (parsed
.scheme
.end() == specLength
- 1)
271 parsed
.path
= URLComponent::fromRange(parsed
.scheme
.end() + 1, specLength
);
273 // No scheme found, just path.
274 parsed
.scheme
.reset();
275 parsed
.path
= URLComponent::fromRange(begin
, specLength
);
279 static void parseMailtoURL(const CHAR
* spec
, int specLength
, URLSegments
& parsed
)
281 // FIXME: add ASSERT(specLength >= 0);
283 // Get the non-path and non-scheme parts of the URL out of the way, we
285 parsed
.username
.reset();
286 parsed
.password
.reset();
289 parsed
.fragment
.reset();
290 parsed
.query
.reset(); // May use this; reset for convenience.
292 // Strip leading & trailing spaces and control characters.
294 trimURL(spec
, begin
, specLength
);
296 // Handle empty specs or ones that contain only whitespace or control
298 if (begin
== specLength
) {
299 parsed
.scheme
.reset();
307 // Extract the scheme, with the path being everything following. We also
308 // handle the case where there is no scheme.
309 if (extractScheme(&spec
[begin
], specLength
- begin
, parsed
.scheme
)) {
310 // Offset the results since we gave extractScheme a substring.
311 parsed
.scheme
.setBegin(parsed
.scheme
.begin() + begin
);
313 if (parsed
.scheme
.end() != specLength
- 1) {
314 pathBegin
= parsed
.scheme
.end() + 1;
315 pathEnd
= specLength
;
318 // No scheme found, just path.
319 parsed
.scheme
.reset();
321 pathEnd
= specLength
;
324 // Split [pathBegin, pathEnd) into a path + query.
325 for (int i
= pathBegin
; i
< pathEnd
; ++i
) {
326 if (spec
[i
] == '?') {
327 parsed
.query
= URLComponent::fromRange(i
+ 1, pathEnd
);
333 // For compatibility with the standard URL parser, treat no path as
334 // -1, rather than having a length of 0
335 if (pathBegin
== pathEnd
)
338 parsed
.path
= URLComponent::fromRange(pathBegin
, pathEnd
);
341 static int parsePort(const CHAR
* spec
, const URLComponent
& component
)
343 // Easy success case when there is no port.
344 const int maxDigits
= 5;
345 if (component
.isEmptyOrInvalid())
346 return UnspecifiedPort
;
348 URLComponent
nonZeroDigits(component
.end(), 0);
349 for (int i
= 0; i
< component
.length(); ++i
) {
350 if (spec
[component
.begin() + i
] != '0') {
351 nonZeroDigits
= URLComponent::fromRange(component
.begin() + i
, component
.end());
355 if (!nonZeroDigits
.length())
356 return 0; // All digits were 0.
358 if (nonZeroDigits
.length() > maxDigits
)
362 for (int i
= 0; i
< nonZeroDigits
.length(); ++i
) {
363 CHAR ch
= spec
[nonZeroDigits
.begin() + i
];
364 if (!isPortDigit(ch
))
367 port
+= static_cast<char>(ch
) - '0';
374 static void extractFileName(const CHAR
* spec
, const URLComponent
& path
, URLComponent
& fileName
)
376 // Handle empty paths: they have no file names.
377 if (path
.isEmptyOrInvalid()) {
382 // Search backwards for a parameter, which is a normally unused field
383 // in a URL delimited by a semicolon. We parse the parameter as part of
384 // the path, but here, we don't want to count it. The last semicolon is
386 int fileEnd
= path
.end();
387 for (int i
= path
.end() - 1; i
> path
.begin(); --i
) {
388 if (spec
[i
] == ';') {
394 // Now search backwards from the filename end to the previous slash
395 // to find the beginning of the filename.
396 for (int i
= fileEnd
- 1; i
>= path
.begin(); --i
) {
397 if (isURLSlash(spec
[i
])) {
398 // File name is everything following this character to the end
399 fileName
= URLComponent::fromRange(i
+ 1, fileEnd
);
404 // No slash found, this means the input was degenerate (generally paths
405 // will start with a slash). Let's call everything the file name.
406 fileName
= URLComponent::fromRange(path
.begin(), fileEnd
);
409 static bool extractQueryKeyValue(const CHAR
* spec
, URLComponent
& query
, URLComponent
& key
, URLComponent
& value
)
411 if (query
.isEmptyOrInvalid())
414 int start
= query
.begin();
416 int end
= query
.end();
418 // We assume the beginning of the input is the beginning of the "key"
419 // and we skip to the end of it.
420 key
.setBegin(current
);
421 while (current
< end
&& spec
[current
] != '&' && spec
[current
] != '=')
423 key
.setLength(current
- key
.begin());
425 // Skip the separator after the key (if any).
426 if (current
< end
&& spec
[current
] == '=')
429 // Find the value part.
430 value
.setBegin(current
);
431 while (current
< end
&& spec
[current
] != '&')
433 value
.setLength(current
- value
.begin());
435 // Finally skip the next separator if any
436 if (current
< end
&& spec
[current
] == '&')
439 // Save the new query
440 query
= URLComponent::fromRange(current
, end
);
444 // FIXME: This should be protected or private.
446 // We treat slashes and backslashes the same for IE compatibility.
447 static inline bool isURLSlash(CHAR ch
)
449 return ch
== '/' || ch
== '\\';
452 // Returns true if we should trim this character from the URL because it is
453 // a space or a control character.
454 static inline bool shouldTrimFromURL(CHAR ch
)
459 // Given an already-initialized begin index and end index (the index after
460 // the last CHAR in spec), this shrinks the range to eliminate
461 // "should-be-trimmed" characters.
462 static inline void trimURL(const CHAR
* spec
, int& begin
, int& end
)
464 // Strip leading whitespace and control characters.
465 while (begin
< end
&& shouldTrimFromURL(spec
[begin
]))
468 // Strip trailing whitespace and control characters. We need the end > begin
469 // test for when the input string is all blanks; we don't want to back
471 while (end
> begin
&& shouldTrimFromURL(spec
[end
- 1]))
475 // Counts the number of consecutive slashes starting at the given offset
476 // in the given string of the given length.
477 static inline int consecutiveSlashes(const CHAR
*string
, int beginOffset
, int stringLength
)
480 while (beginOffset
+ count
< stringLength
&& isURLSlash(string
[beginOffset
+ count
]))
486 // URLParser cannot be constructed.
489 // Returns true if the given character is a valid digit to use in a port.
490 static inline bool isPortDigit(CHAR ch
)
492 return ch
>= '0' && ch
<= '9';
495 // Returns the offset of the next authority terminator in the input starting
496 // from startOffset. If no terminator is found, the return value will be equal
498 static int nextAuthorityTerminator(const CHAR
* spec
, int startOffset
, int specLength
)
500 for (int i
= startOffset
; i
< specLength
; i
++) {
501 if (isPossibleAuthorityTerminator(spec
[i
]))
504 return specLength
; // Not found.
507 static void parseUserInfo(const CHAR
* spec
, const URLComponent
& user
, URLComponent
& username
, URLComponent
& password
)
509 // Find the first colon in the user section, which separates the
510 // username and password.
512 while (colonOffset
< user
.length() && spec
[user
.begin() + colonOffset
] != ':')
515 if (colonOffset
< user
.length()) {
516 // Found separator: <username>:<password>
517 username
= URLComponent(user
.begin(), colonOffset
);
518 password
= URLComponent::fromRange(user
.begin() + colonOffset
+ 1, user
.begin() + user
.length());
520 // No separator, treat everything as the username
522 password
= URLComponent();
526 static void parseServerInfo(const CHAR
* spec
, const URLComponent
& serverInfo
, URLComponent
& host
, URLComponent
& port
)
528 if (!serverInfo
.length()) {
529 // No server info, host name is empty.
535 // If the host starts with a left-bracket, assume the entire host is an
536 // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal.
537 // This assumption will be overridden if we find a right-bracket.
539 // Our IPv6 address canonicalization code requires both brackets to
540 // exist, but the ability to locate an incomplete address can still be
542 int ipv6Terminator
= spec
[serverInfo
.begin()] == '[' ? serverInfo
.end() : -1;
545 // Find the last right-bracket, and the last colon.
546 for (int i
= serverInfo
.begin(); i
< serverInfo
.end(); i
++) {
559 if (colon
> ipv6Terminator
) {
560 // Found a port number: <hostname>:<port>
561 host
= URLComponent::fromRange(serverInfo
.begin(), colon
);
564 port
= URLComponent::fromRange(colon
+ 1, serverInfo
.end());
566 // No port: <hostname>
575 #endif // URLParser_h