]> git.saurik.com Git - apple/javascriptcore.git/blob - wtf/url/src/URLParser.h
4d5ca51fd1d5a1a74593607698c87e07f53925a8
[apple/javascriptcore.git] / wtf / url / src / URLParser.h
1 /* Based on nsURLParsers.cc from Mozilla
2 * -------------------------------------
3 * Copyright (C) 1998 Netscape Communications Corporation.
4 *
5 * Other contributors:
6 * Darin Fisher (original author)
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * Alternatively, the contents of this file may be used under the terms
23 * of either the Mozilla Public License Version 1.1, found at
24 * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
25 * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
26 * (the "GPL"), in which case the provisions of the MPL or the GPL are
27 * applicable instead of those above. If you wish to allow use of your
28 * version of this file only under the terms of one of those two
29 * licenses (the MPL or the GPL) and not to allow others to use your
30 * version of this file under the LGPL, indicate your decision by
31 * deleting the provisions above and replace them with the notice and
32 * other provisions required by the MPL or the GPL, as the case may be.
33 * If you do not delete the provisions above, a recipient may use your
34 * version of this file under any of the LGPL, the MPL or the GPL.
35 */
36
37 #ifndef URLParser_h
38 #define URLParser_h
39
40 #include "URLComponent.h"
41 #include "URLSegments.h"
42
43 namespace WTF {
44
45 template<typename CHAR>
46 class URLParser {
47 public:
// Sentinel results of parsePort() for port components that do not yield a
// number. Both are negative, so they never collide with a parsed port.
enum SpecialPort {
    UnspecifiedPort = -1, // The port component was empty or invalid.
    InvalidPort = -2 // The port text was not a number in [0, 65535].
};
52
53 // This handles everything that may be an authority terminator, including
54 // backslash. For special backslash handling see parseAfterScheme.
55 static bool isPossibleAuthorityTerminator(CHAR ch)
56 {
57 return isURLSlash(ch) || ch == '?' || ch == '#' || ch == ';';
58 }
59
60 // Given an already-identified auth section, breaks it into its constituent
61 // parts. The port number will be parsed and the resulting integer will be
62 // filled into the given *port variable, or -1 if there is no port number
63 // or it is invalid.
64 static void parseAuthority(const CHAR* spec, const URLComponent& auth, URLComponent& username, URLComponent& password, URLComponent& host, URLComponent& port)
65 {
66 // FIXME: add ASSERT(auth.isValid()); // We should always get an authority.
67 if (!auth.length()) {
68 username.reset();
69 password.reset();
70 host.reset();
71 port.reset();
72 return;
73 }
74
75 // Search backwards for @, which is the separator between the user info
76 // and the server info. RFC 3986 forbids @ from occuring in auth, but
77 // someone might include it in a password unescaped.
78 int i = auth.begin() + auth.length() - 1;
79 while (i > auth.begin() && spec[i] != '@')
80 --i;
81
82 if (spec[i] == '@') {
83 // Found user info: <user-info>@<server-info>
84 parseUserInfo(spec, URLComponent(auth.begin(), i - auth.begin()), username, password);
85 parseServerInfo(spec, URLComponent::fromRange(i + 1, auth.begin() + auth.length()), host, port);
86 } else {
87 // No user info, everything is server info.
88 username.reset();
89 password.reset();
90 parseServerInfo(spec, auth, host, port);
91 }
92 }
93
94 static bool extractScheme(const CHAR* spec, int specLength, URLComponent& scheme)
95 {
96 // Skip leading whitespace and control characters.
97 int begin = 0;
98 while (begin < specLength && shouldTrimFromURL(spec[begin]))
99 begin++;
100 if (begin == specLength)
101 return false; // Input is empty or all whitespace.
102
103 // Find the first colon character.
104 for (int i = begin; i < specLength; i++) {
105 if (spec[i] == ':') {
106 scheme = URLComponent::fromRange(begin, i);
107 return true;
108 }
109 }
110 return false; // No colon found: no scheme
111 }
112
113 // Fills in all members of the URLSegments structure (except for the
114 // scheme) for standard URLs.
115 //
116 // |spec| is the full spec being parsed, of length |specLength|.
117 // |afterScheme| is the character immediately following the scheme (after
118 // the colon) where we'll begin parsing.
119 static void parseAfterScheme(const CHAR* spec, int specLength, int afterScheme, URLSegments& parsed)
120 {
121 int numberOfSlashes = consecutiveSlashes(spec, afterScheme, specLength);
122 int afterSlashes = afterScheme + numberOfSlashes;
123
124 // First split into two main parts, the authority (username, password,
125 // host, and port) and the full path (path, query, and reference).
126 URLComponent authority;
127 URLComponent fullPath;
128
129 // Found "//<some data>", looks like an authority section. Treat
130 // everything from there to the next slash (or end of spec) to be the
131 // authority. Note that we ignore the number of slashes and treat it as
132 // the authority.
133 int authEnd = nextAuthorityTerminator(spec, afterSlashes, specLength);
134 authority = URLComponent(afterSlashes, authEnd - afterSlashes);
135
136 if (authEnd == specLength) // No beginning of path found.
137 fullPath = URLComponent();
138 else // Everything starting from the slash to the end is the path.
139 fullPath = URLComponent(authEnd, specLength - authEnd);
140
141 // Now parse those two sub-parts.
142 parseAuthority(spec, authority, parsed.username, parsed.password, parsed.host, parsed.port);
143 parsePath(spec, fullPath, parsed.path, parsed.query, parsed.fragment);
144 }
145
146 // The main parsing function for standard URLs. Standard URLs have a scheme,
147 // host, path, etc.
148 static void parseStandardURL(const CHAR* spec, int specLength, URLSegments& parsed)
149 {
150 // FIXME: add ASSERT(specLength >= 0);
151
152 // Strip leading & trailing spaces and control characters.
153 int begin = 0;
154 trimURL(spec, begin, specLength);
155
156 int afterScheme;
157 if (extractScheme(spec, specLength, parsed.scheme))
158 afterScheme = parsed.scheme.end() + 1; // Skip past the colon.
159 else {
160 // Say there's no scheme when there is a colon. We could also say
161 // that everything is the scheme. Both would produce an invalid
162 // URL, but this way seems less wrong in more cases.
163 parsed.scheme.reset();
164 afterScheme = begin;
165 }
166 parseAfterScheme(spec, specLength, afterScheme, parsed);
167 }
168
169 static void parsePath(const CHAR* spec, const URLComponent& path, URLComponent& filepath, URLComponent& query, URLComponent& fragment)
170 {
171 // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<fragment>
172
173 // Special case when there is no path.
174 if (!path.isValid()) {
175 filepath.reset();
176 query.reset();
177 fragment.reset();
178 return;
179 }
180 // FIXME: add ASSERT(path.length() > 0); // We should never have 0 length paths.
181
182 // Search for first occurrence of either ? or #.
183 int pathEnd = path.begin() + path.length();
184
185 int querySeparator = -1; // Index of the '?'
186 int refSeparator = -1; // Index of the '#'
187 for (int i = path.begin(); i < pathEnd; i++) {
188 switch (spec[i]) {
189 case '?':
190 if (querySeparator < 0)
191 querySeparator = i;
192 break;
193 case '#':
194 refSeparator = i;
195 i = pathEnd; // Break out of the loop.
196 break;
197 default:
198 break;
199 }
200 }
201
202 // Markers pointing to the character after each of these corresponding
203 // components. The code below works from the end back to the beginning,
204 // and will update these indices as it finds components that exist.
205 int fileEnd, queryEnd;
206
207 // Fragment: from the # to the end of the path.
208 if (refSeparator >= 0) {
209 fileEnd = refSeparator;
210 queryEnd = refSeparator;
211 fragment = URLComponent::fromRange(refSeparator + 1, pathEnd);
212 } else {
213 fileEnd = pathEnd;
214 queryEnd = pathEnd;
215 fragment.reset();
216 }
217
218 // Query fragment: everything from the ? to the next boundary (either
219 // the end of the path or the fragment fragment).
220 if (querySeparator >= 0) {
221 fileEnd = querySeparator;
222 query = URLComponent::fromRange(querySeparator + 1, queryEnd);
223 } else
224 query.reset();
225
226 // File path: treat an empty file path as no file path.
227 if (fileEnd != path.begin())
228 filepath = URLComponent::fromRange(path.begin(), fileEnd);
229 else
230 filepath.reset();
231 }
232
233 // Initializes a path URL which is merely a scheme followed by a path.
234 // Examples include "about:foo" and "javascript:alert('bar');"
235 static void parsePathURL(const CHAR* spec, int specLength, URLSegments& parsed)
236 {
237 // Get the non-path and non-scheme parts of the URL out of the way, we
238 // never use them.
239 parsed.username.reset();
240 parsed.password.reset();
241 parsed.host.reset();
242 parsed.port.reset();
243 parsed.query.reset();
244 parsed.fragment.reset();
245
246 // Strip leading & trailing spaces and control characters.
247 // FIXME: Perhaps this is unnecessary?
248 int begin = 0;
249 trimURL(spec, begin, specLength);
250
251 // Handle empty specs or ones that contain only whitespace or control
252 // chars.
253 if (begin == specLength) {
254 parsed.scheme.reset();
255 parsed.path.reset();
256 return;
257 }
258
259 // Extract the scheme, with the path being everything following. We also
260 // handle the case where there is no scheme.
261 if (extractScheme(&spec[begin], specLength - begin, parsed.scheme)) {
262 // Offset the results since we gave extractScheme a substring.
263 parsed.scheme.setBegin(parsed.scheme.begin() + begin);
264
265 // For compatibility with the standard URL parser, we treat no path
266 // as -1, rather than having a length of 0 (we normally wouldn't
267 // care so much for these non-standard URLs).
268 if (parsed.scheme.end() == specLength - 1)
269 parsed.path.reset();
270 else
271 parsed.path = URLComponent::fromRange(parsed.scheme.end() + 1, specLength);
272 } else {
273 // No scheme found, just path.
274 parsed.scheme.reset();
275 parsed.path = URLComponent::fromRange(begin, specLength);
276 }
277 }
278
279 static void parseMailtoURL(const CHAR* spec, int specLength, URLSegments& parsed)
280 {
281 // FIXME: add ASSERT(specLength >= 0);
282
283 // Get the non-path and non-scheme parts of the URL out of the way, we
284 // never use them.
285 parsed.username.reset();
286 parsed.password.reset();
287 parsed.host.reset();
288 parsed.port.reset();
289 parsed.fragment.reset();
290 parsed.query.reset(); // May use this; reset for convenience.
291
292 // Strip leading & trailing spaces and control characters.
293 int begin = 0;
294 trimURL(spec, begin, specLength);
295
296 // Handle empty specs or ones that contain only whitespace or control
297 // chars.
298 if (begin == specLength) {
299 parsed.scheme.reset();
300 parsed.path.reset();
301 return;
302 }
303
304 int pathBegin = -1;
305 int pathEnd = -1;
306
307 // Extract the scheme, with the path being everything following. We also
308 // handle the case where there is no scheme.
309 if (extractScheme(&spec[begin], specLength - begin, parsed.scheme)) {
310 // Offset the results since we gave extractScheme a substring.
311 parsed.scheme.setBegin(parsed.scheme.begin() + begin);
312
313 if (parsed.scheme.end() != specLength - 1) {
314 pathBegin = parsed.scheme.end() + 1;
315 pathEnd = specLength;
316 }
317 } else {
318 // No scheme found, just path.
319 parsed.scheme.reset();
320 pathBegin = begin;
321 pathEnd = specLength;
322 }
323
324 // Split [pathBegin, pathEnd) into a path + query.
325 for (int i = pathBegin; i < pathEnd; ++i) {
326 if (spec[i] == '?') {
327 parsed.query = URLComponent::fromRange(i + 1, pathEnd);
328 pathEnd = i;
329 break;
330 }
331 }
332
333 // For compatibility with the standard URL parser, treat no path as
334 // -1, rather than having a length of 0
335 if (pathBegin == pathEnd)
336 parsed.path.reset();
337 else
338 parsed.path = URLComponent::fromRange(pathBegin, pathEnd);
339 }
340
341 static int parsePort(const CHAR* spec, const URLComponent& component)
342 {
343 // Easy success case when there is no port.
344 const int maxDigits = 5;
345 if (component.isEmptyOrInvalid())
346 return UnspecifiedPort;
347
348 URLComponent nonZeroDigits(component.end(), 0);
349 for (int i = 0; i < component.length(); ++i) {
350 if (spec[component.begin() + i] != '0') {
351 nonZeroDigits = URLComponent::fromRange(component.begin() + i, component.end());
352 break;
353 }
354 }
355 if (!nonZeroDigits.length())
356 return 0; // All digits were 0.
357
358 if (nonZeroDigits.length() > maxDigits)
359 return InvalidPort;
360
361 int port = 0;
362 for (int i = 0; i < nonZeroDigits.length(); ++i) {
363 CHAR ch = spec[nonZeroDigits.begin() + i];
364 if (!isPortDigit(ch))
365 return InvalidPort;
366 port *= 10;
367 port += static_cast<char>(ch) - '0';
368 }
369 if (port > 65535)
370 return InvalidPort;
371 return port;
372 }
373
374 static void extractFileName(const CHAR* spec, const URLComponent& path, URLComponent& fileName)
375 {
376 // Handle empty paths: they have no file names.
377 if (path.isEmptyOrInvalid()) {
378 fileName.reset();
379 return;
380 }
381
382 // Search backwards for a parameter, which is a normally unused field
383 // in a URL delimited by a semicolon. We parse the parameter as part of
384 // the path, but here, we don't want to count it. The last semicolon is
385 // the parameter.
386 int fileEnd = path.end();
387 for (int i = path.end() - 1; i > path.begin(); --i) {
388 if (spec[i] == ';') {
389 fileEnd = i;
390 break;
391 }
392 }
393
394 // Now search backwards from the filename end to the previous slash
395 // to find the beginning of the filename.
396 for (int i = fileEnd - 1; i >= path.begin(); --i) {
397 if (isURLSlash(spec[i])) {
398 // File name is everything following this character to the end
399 fileName = URLComponent::fromRange(i + 1, fileEnd);
400 return;
401 }
402 }
403
404 // No slash found, this means the input was degenerate (generally paths
405 // will start with a slash). Let's call everything the file name.
406 fileName = URLComponent::fromRange(path.begin(), fileEnd);
407 }
408
409 static bool extractQueryKeyValue(const CHAR* spec, URLComponent& query, URLComponent& key, URLComponent& value)
410 {
411 if (query.isEmptyOrInvalid())
412 return false;
413
414 int start = query.begin();
415 int current = start;
416 int end = query.end();
417
418 // We assume the beginning of the input is the beginning of the "key"
419 // and we skip to the end of it.
420 key.setBegin(current);
421 while (current < end && spec[current] != '&' && spec[current] != '=')
422 ++current;
423 key.setLength(current - key.begin());
424
425 // Skip the separator after the key (if any).
426 if (current < end && spec[current] == '=')
427 ++current;
428
429 // Find the value part.
430 value.setBegin(current);
431 while (current < end && spec[current] != '&')
432 ++current;
433 value.setLength(current - value.begin());
434
435 // Finally skip the next separator if any
436 if (current < end && spec[current] == '&')
437 ++current;
438
439 // Save the new query
440 query = URLComponent::fromRange(current, end);
441 return true;
442 }
443
444 // FIXME: This should be protected or private.
445 public:
446 // We treat slashes and backslashes the same for IE compatibility.
447 static inline bool isURLSlash(CHAR ch)
448 {
449 return ch == '/' || ch == '\\';
450 }
451
452 // Returns true if we should trim this character from the URL because it is
453 // a space or a control character.
454 static inline bool shouldTrimFromURL(CHAR ch)
455 {
456 return ch <= ' ';
457 }
458
459 // Given an already-initialized begin index and end index (the index after
460 // the last CHAR in spec), this shrinks the range to eliminate
461 // "should-be-trimmed" characters.
462 static inline void trimURL(const CHAR* spec, int& begin, int& end)
463 {
464 // Strip leading whitespace and control characters.
465 while (begin < end && shouldTrimFromURL(spec[begin]))
466 ++begin;
467
468 // Strip trailing whitespace and control characters. We need the >i
469 // test for when the input string is all blanks; we don't want to back
470 // past the input.
471 while (end > begin && shouldTrimFromURL(spec[end - 1]))
472 --end;
473 }
474
475 // Counts the number of consecutive slashes starting at the given offset
476 // in the given string of the given length.
477 static inline int consecutiveSlashes(const CHAR *string, int beginOffset, int stringLength)
478 {
479 int count = 0;
480 while (beginOffset + count < stringLength && isURLSlash(string[beginOffset + count]))
481 ++count;
482 return count;
483 }
484
485 private:
486 // URLParser cannot be constructed.
487 URLParser();
488
489 // Returns true if the given character is a valid digit to use in a port.
490 static inline bool isPortDigit(CHAR ch)
491 {
492 return ch >= '0' && ch <= '9';
493 }
494
495 // Returns the offset of the next authority terminator in the input starting
496 // from startOffset. If no terminator is found, the return value will be equal
497 // to specLength.
498 static int nextAuthorityTerminator(const CHAR* spec, int startOffset, int specLength)
499 {
500 for (int i = startOffset; i < specLength; i++) {
501 if (isPossibleAuthorityTerminator(spec[i]))
502 return i;
503 }
504 return specLength; // Not found.
505 }
506
507 static void parseUserInfo(const CHAR* spec, const URLComponent& user, URLComponent& username, URLComponent& password)
508 {
509 // Find the first colon in the user section, which separates the
510 // username and password.
511 int colonOffset = 0;
512 while (colonOffset < user.length() && spec[user.begin() + colonOffset] != ':')
513 ++colonOffset;
514
515 if (colonOffset < user.length()) {
516 // Found separator: <username>:<password>
517 username = URLComponent(user.begin(), colonOffset);
518 password = URLComponent::fromRange(user.begin() + colonOffset + 1, user.begin() + user.length());
519 } else {
520 // No separator, treat everything as the username
521 username = user;
522 password = URLComponent();
523 }
524 }
525
526 static void parseServerInfo(const CHAR* spec, const URLComponent& serverInfo, URLComponent& host, URLComponent& port)
527 {
528 if (!serverInfo.length()) {
529 // No server info, host name is empty.
530 host.reset();
531 port.reset();
532 return;
533 }
534
535 // If the host starts with a left-bracket, assume the entire host is an
536 // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal.
537 // This assumption will be overridden if we find a right-bracket.
538 //
539 // Our IPv6 address canonicalization code requires both brackets to
540 // exist, but the ability to locate an incomplete address can still be
541 // useful.
542 int ipv6Terminator = spec[serverInfo.begin()] == '[' ? serverInfo.end() : -1;
543 int colon = -1;
544
545 // Find the last right-bracket, and the last colon.
546 for (int i = serverInfo.begin(); i < serverInfo.end(); i++) {
547 switch (spec[i]) {
548 case ']':
549 ipv6Terminator = i;
550 break;
551 case ':':
552 colon = i;
553 break;
554 default:
555 break;
556 }
557 }
558
559 if (colon > ipv6Terminator) {
560 // Found a port number: <hostname>:<port>
561 host = URLComponent::fromRange(serverInfo.begin(), colon);
562 if (!host.length())
563 host.reset();
564 port = URLComponent::fromRange(colon + 1, serverInfo.end());
565 } else {
566 // No port: <hostname>
567 host = serverInfo;
568 port.reset();
569 }
570 }
571 };
572
573 } // namespace WTF
574
575 #endif // URLParser_h