2 * Copyright (c) 2014 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
25 Copyright (c) 2012-2013, Apple Inc. All rights reserved.
26 Responsibility: Jim Luther
34 CFURL's URL string parser needs to be able to parse either an array of char or an array of UniChar.
36 The code in CFURL.c used to use this macro "#define STRING_CHAR(x) (useCString ? cstring[(x)] : ustring[(x)])" to determine which array to get a character from for every character looked at in the URL string. That macro added one or more compare and branch instructions to the parser's execution for *every* character in the URL string. Those extra compares and branches added up to 10% of the time (for long URL strings) it takes to create a URL object.
38 To ensure the exact same parser code is run over a char or a UniChar string, the source code was moved to this .h file and is included multiple times by CFURL.c as needed. "STRING_CHAR(x)" was replaced by "characterArray[x]", and characterArray is defined as either a "const char *" or a "const UniChar *" for the two sets of function headers that are either parsing an array of char or an array of UniChar.
40 Any changes made to the parser are made in this file so that both char and the UniChar strings are parsed exactly the same way.
45 static void _parseComponentsCString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const char *characterArray)
46 static void _parseComponentsUString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const UniChar *characterArray)
48 #ifdef CFURL_INCLUDE_PARSE_COMPONENTS // defined when we want this block of code included
// Shared body of _parseComponentsCString / _parseComponentsUString.
//
// Scans characterArray (cfStringLength elements) and records where each URL
// component lies.  Component ranges are stored in ranges[] at the index given
// by the component's flag bit (scheme 0, user 1, password 2, host 3, port 4,
// parameters 6, query 7, fragment 8 in the code below), and are finally copied
// into a packed array allocated from `alloc` and returned through *range.
//
// Parameters used below:
//   alloc            allocator passed to CFAllocatorAllocate for *range
//   baseURL          consulted (when non-NULL) via CFURLHasDirectoryPath
//   theFlags         in: initial flag bits; also read by the final packing loop
//   range            out: receives the allocated CFRange array
//   cfStringLength   number of characters in characterArray
//   characterArray   the URL string as char or UniChar elements
//
// NOTE(review): the embedded original line numbers in this listing skip values
// (e.g. 49-50, 76-77, 144-147), so some statements -- including the
// declarations of `ranges`, `idx2`, `pathRg` and `isDir` -- are not visible
// here.  Confirm any change against the complete file.
51 /* index gives the URL part involved; to calculate the correct range index, use the number of the bit of the equivalent flag (i.e. the host flag is HAS_HOST, which is 0x8. so the range index for the host is 3.) Note that this is true in this function ONLY, since the ranges stored in (*range) are actually packed, skipping those URL components that don't exist. This is why the indices are hard-coded in this function. */
53 CFIndex idx
, base_idx
= 0;
54 CFIndex string_length
;
55 UInt32 flags
= *theFlags
;
57 uint8_t numRanges
= 0;
59 string_length
= cfStringLength
;
61 // Algorithm is as described in RFC 1808
62 // 1: parse the fragment; remainder after left-most "#" is fragment
63 for (idx
= base_idx
; idx
< string_length
; idx
++) {
64 if ('#' == characterArray
[idx
]) {
65 flags
|= HAS_FRAGMENT
;
66 ranges
[8].location
= idx
+ 1;
67 ranges
[8].length
= string_length
- (idx
+ 1);
69 string_length
= idx
; // remove fragment from parse string
73 // 2: parse the scheme
74 for (idx
= base_idx
; idx
< string_length
; idx
++) {
75 UniChar ch
= characterArray
[idx
];
// The scheme range starts at base_idx and is idx characters long.
78 ranges
[0].location
= base_idx
;
79 ranges
[0].length
= idx
;
82 // optimization for ftp urls
83 if (idx
== 3 && characterArray
[0] == 'f' && characterArray
[1] == 't' && characterArray
[2] == 'p') {
84 _setSchemeTypeInFlags(&flags
, kHasFtpScheme
);
87 // optimization for http urls
88 if (characterArray
[0] == 'h' && characterArray
[1] == 't' && characterArray
[2] == 't' && characterArray
[3] == 'p') {
89 _setSchemeTypeInFlags(&flags
, kHasHttpScheme
);
91 // optimization for file urls
92 if (characterArray
[0] == 'f' && characterArray
[1] == 'i' && characterArray
[2] == 'l' && characterArray
[3] == 'e') {
93 _setSchemeTypeInFlags(&flags
, kHasFileScheme
);
95 // optimization for data urls
96 if (characterArray
[0] == 'd' && characterArray
[1] == 'a' && characterArray
[2] == 't' && characterArray
[3] == 'a') {
97 _setSchemeTypeInFlags(&flags
, kHasDataScheme
);
100 // optimization for https urls
// The 's' of "https" is the fifth character, i.e. index 4.  Testing index 3
// against both 'p' and 's' can never be true and would leave this branch
// unreachable.
101 else if (idx
== 5 && characterArray
[0] == 'h' && characterArray
[1] == 't' && characterArray
[2] == 't' && characterArray
[3] == 'p' && characterArray
[4] == 's') {
102 _setSchemeTypeInFlags(&flags
, kHasHttpsScheme
);
105 } else if (!scheme_valid(ch
)) {
106 break; // invalid scheme character -- no scheme
110 // Make sure we have an RFC-1808 compliant URL - that's either something without a scheme, or scheme:/(stuff) or scheme://(stuff)
111 // Strictly speaking, RFC 1808 & 2396 bar "scheme:" (with nothing following the colon); however, common usage
112 // expects this to be treated identically to "scheme://" - REW, 12/08/03
113 if (!(flags
& HAS_SCHEME
)) {
115 } else if (base_idx
== string_length
) {
117 } else if (characterArray
[base_idx
] != '/') {
124 // Clear the fragment flag if it's been set
125 if (flags
& HAS_FRAGMENT
) {
126 flags
&= (~HAS_FRAGMENT
);
127 string_length
= cfStringLength
;
// Hand back a single-element array holding only the scheme range (ranges[0]).
130 (*range
) = (CFRange
*)CFAllocatorAllocate(alloc
, sizeof(CFRange
), 0);
131 (*range
)->location
= ranges
[0].location
;
132 (*range
)->length
= ranges
[0].length
;
136 // URL is 1808-compliant
137 flags
|= IS_DECOMPOSABLE
;
139 // 3: parse the network location and login
140 if (2 <= (string_length
- base_idx
) && '/' == characterArray
[base_idx
] && '/' == characterArray
[base_idx
+1]) {
// base is the first character after the leading "//".  `extent` is declared
// here; its assignment is not visible in this listing.
141 CFIndex base
= 2 + base_idx
, extent
;
142 for (idx
= base
; idx
< string_length
; idx
++) {
143 if ('/' == characterArray
[idx
] || '?' == characterArray
[idx
]) {
149 // net_loc parts extend from base to extent (but not including), which might be to end of string
150 // net location is "<user>:<password>@<host>:<port>"
151 if (extent
!= base
) {
152 for (idx
= base
; idx
< extent
; idx
++) {
153 if ('@' == characterArray
[idx
]) { // there is a user
157 ranges
[1].location
= base
; // base of the user
158 for (idx2
= base
; idx2
< idx
; idx2
++) {
159 if (':' == characterArray
[idx2
]) { // found a password separator
160 flags
|= HAS_PASSWORD
;
162 ranges
[2].location
= idx2
+1; // base of the password
163 ranges
[2].length
= idx
-(idx2
+1); // password extent
164 ranges
[1].length
= idx2
- base
; // user extent
168 if (!(flags
& HAS_PASSWORD
)) {
169 // user extends to the '@'
170 ranges
[1].length
= idx
- base
; // user extent
178 ranges
[3].location
= base
; // base of host
180 // base has been advanced past the user and password if they existed
181 for (idx
= base
; idx
< extent
; idx
++) {
182 // IPV6 support (RFC 2732) DCJ June/10/2002
183 if ('[' == characterArray
[idx
]) { // starting IPV6 explicit address
184 // Find the ']' terminator of the IPv6 address, leave idx pointing to ']' or end
185 for ( ; idx
< extent
; ++ idx
) {
186 if ( ']' == characterArray
[idx
]) {
187 flags
|= IS_IPV6_ENCODED
;
192 // there is a port if we see a colon. Only the last one is the port, though.
193 else if ( ':' == characterArray
[idx
]) {
196 ranges
[4].location
= idx
+1; // base of port
197 ranges
[4].length
= extent
- (idx
+1); // port extent
198 ranges
[3].length
= idx
- base
; // host extent
202 if (!(flags
& HAS_PORT
)) {
203 ranges
[3].length
= extent
- base
; // host extent
209 // 4: parse the query; remainder after left-most "?" is query
210 for (idx
= base_idx
; idx
< string_length
; idx
++) {
211 if ('?' == characterArray
[idx
]) {
214 ranges
[7].location
= idx
+ 1;
215 ranges
[7].length
= string_length
- (idx
+1);
216 string_length
= idx
; // remove query from parse string
221 // 5: parse the parameters; remainder after left-most ";" is parameters
222 for (idx
= base_idx
; idx
< string_length
; idx
++) {
223 if (';' == characterArray
[idx
]) {
224 flags
|= HAS_PARAMETERS
;
226 ranges
[6].location
= idx
+ 1;
227 ranges
[6].length
= string_length
- (idx
+1);
228 string_length
= idx
; // remove parameters from parse string
233 // 6: parse the path; it's whatever's left between string_length & base_idx
234 if (string_length
- base_idx
!= 0 || (flags
& NET_LOCATION_MASK
))
236 // If we have a net location, we are 1808-compliant, and an empty path substring implies a path of "/"
242 pathRg
.location
= base_idx
;
243 pathRg
.length
= string_length
- base_idx
;
246 if (pathRg
.length
> 0) {
// Scan the path for a '%' so the POSIX/URL path equivalence flag below can be
// withheld when the path contains one.
247 Boolean sawPercent
= FALSE
;
248 for (idx
= pathRg
.location
; idx
< string_length
; idx
++) {
249 if ('%' == characterArray
[idx
]) {
254 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI
// A path beginning with "/.file/" (at least 7 characters) is flagged as
// carrying a file ID.
255 if (pathRg
.length
> 6 && characterArray
[pathRg
.location
] == '/' && characterArray
[pathRg
.location
+ 1] == '.' && characterArray
[pathRg
.location
+ 2] == 'f' && characterArray
[pathRg
.location
+ 3] == 'i' && characterArray
[pathRg
.location
+ 4] == 'l' && characterArray
[pathRg
.location
+ 5] == 'e' && characterArray
[pathRg
.location
+ 6] == '/') {
256 flags
|= PATH_HAS_FILE_ID
;
257 } else if (!sawPercent
) {
258 flags
|= POSIX_AND_URL_PATHS_MATCH
;
260 #elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_WINDOWS
262 flags
|= POSIX_AND_URL_PATHS_MATCH
;
// Directory detection inspects the trailing characters of the path: the last
// one, then (for a trailing '.') the second-to-last, then the third-to-last.
// NOTE(review): most of the isDir assignments for these branches are not
// visible in this listing.
266 ch
= characterArray
[pathRg
.location
+ pathRg
.length
- 1];
269 } else if (ch
== '.') {
270 if (pathRg
.length
== 1) {
273 ch
= characterArray
[pathRg
.location
+ pathRg
.length
- 2];
276 } else if (ch
!= '.') {
278 } else if (pathRg
.length
== 2) {
281 isDir
= (characterArray
[pathRg
.location
+ pathRg
.length
- 3] == '/');
// Fall back to the base URL's directory-ness when one is supplied, else false.
288 isDir
= (baseURL
!= NULL
) ? CFURLHasDirectoryPath(baseURL
) : false;
291 flags
|= IS_DIRECTORY
;
// Pack the results: allocate numRanges entries, then walk the nine component
// bits (1<<0 .. 1<<8) -- reusing `flags` as the bit mask -- and copy
// ranges[idx] for every bit that is set in *theFlags.
296 (*range
) = (CFRange
*)CFAllocatorAllocate(alloc
, sizeof(CFRange
)*numRanges
, 0);
298 for (idx
= 0, flags
= 1; flags
!= (1<<9); flags
= (flags
<<1), idx
++) {
299 if ((*theFlags
) & flags
) {
300 (*range
)[numRanges
] = ranges
[idx
];
305 #endif // CFURL_INCLUDE_PARSE_COMPONENTS
308 static Boolean scanCharactersCString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const char *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
309 static Boolean scanCharactersUString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const UniChar *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
311 #ifdef CFURL_INCLUDE_SCAN_CHARACTERS // defined when we want this block of code included
// Shared body of scanCharactersCString / scanCharactersUString.
//
// Walks characterArray over [base, end) and decides, per character, whether it
// must be percent-escaped.  When a character needs escaping, the pending run
// starting at *mark is appended to *escapedString (created on first use),
// followed by the percent-escaped form of the character.
//
// Returns sawIllegalChar.
//
// NOTE(review): the embedded original line numbers in this listing skip values
// (e.g. 312-313, 320-321, 328-331, 338-339), so the declaration of `idx`,
// several shouldEscape assignments and some branch lines are not visible here.
// Confirm any change against the complete file.
314 Boolean sawIllegalChar
= false;
315 for (idx
= base
; idx
< end
; idx
++) {
316 Boolean shouldEscape
;
317 UniChar ch
= characterArray
[idx
];
318 if (isURLLegalCharacter(ch
)) {
// '/', '?' and '@' get special treatment inside user and password components.
// (The assignment taken on that branch is not visible in this listing --
// presumably they are escaped; confirm.)
319 if ((componentFlag
== HAS_USER
|| componentFlag
== HAS_PASSWORD
) && (ch
== '/' || ch
== '?' || ch
== '@')) {
322 shouldEscape
= false;
// A '%' followed by two hex digits (both strictly before `end`) is left as is.
324 } else if (ch
== '%' && idx
+ 2 < end
&& isHexDigit(characterArray
[idx
+ 1]) && isHexDigit(characterArray
[idx
+2])) {
325 shouldEscape
= false;
// In a host component, a leading '[' and a trailing ']' are left as is.
326 } else if (componentFlag
== HAS_HOST
&& ((idx
== base
&& ch
== '[') || (idx
== end
-1 && ch
== ']'))) {
327 shouldEscape
= false;
332 sawIllegalChar
= true;
// Record the component in the caller's flags when both are supplied.
333 if (componentFlag
&& flags
) {
334 *flags
|= componentFlag
;
// Create the output string the first time it is needed.
336 if (!*escapedString
) {
337 *escapedString
= CFStringCreateMutable(alloc
, 0);
// Append the pending run [*mark, idx).  Two forms follow: one wraps the bytes
// as an ISO Latin-1 CFString, the other appends UniChars directly.
// NOTE(review): the line selecting between them is not visible in this
// listing -- presumably it tests useCString; confirm.
340 CFStringRef tempString
= CFStringCreateWithBytes(alloc
, (uint8_t *)&(characterArray
[*mark
]), idx
- *mark
, kCFStringEncodingISOLatin1
, false);
341 CFStringAppend(*escapedString
, tempString
);
342 CFRelease(tempString
);
344 CFStringAppendCharacters(*escapedString
, (const UniChar
*)&(characterArray
[*mark
]), idx
- *mark
);
347 _appendPercentEscapesForCharacter(ch
, encoding
, *escapedString
); // This can never fail because anURL->_string was constructed from the encoding passed in
350 return sawIllegalChar
;
352 #endif // CFURL_INCLUDE_SCAN_CHARACTERS