2 * Copyright (c) 2015 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
25 Copyright (c) 2012-2014, Apple Inc. All rights reserved.
26 Responsibility: Jim Luther/Chris Linn
34 CFURL's URL string parser needs to be able to parse either an array of char or an array of UniChar.
36 The code in CFURL.c used to use this macro "#define STRING_CHAR(x) (useCString ? cstring[(x)] : ustring[(x)])" to determine which array to get a character from for every character looked at in the URL string. That macro added one or more compare and branch instructions to the parser's execution for *every* character in the URL string. Those extra compares and branches added up to 10% of the time (for long URL strings) it takes to create a URL object.
38 To ensure the exact same parser code is run over a char or a UniChar string, the source code was moved to this .h file and is included multiple times by CFURL.c as needed. "STRING_CHAR(x)" was replaced by "characterArray[x]", and characterArray is defined as either a "const char *" or a "const UniChar *" for the two sets of function headers that are either parsing an array of char or an array of UniChar.
40 Any changes made to the parser are made in this file so that both char and the UniChar strings are parsed exactly the same way.
45 static void _parseComponentsCString(CFAllocatorRef alloc, CFURLRef baseURL, CFIndex cfStringLength, const char *characterArray, UInt32 *theFlags, CFRange *packedRanges, uint8_t *numberOfRanges)
47 static void _parseComponentsUString(CFAllocatorRef alloc, CFURLRef baseURL, CFIndex cfStringLength, const UniChar *characterArray, UInt32 *theFlags, CFRange *packedRanges, uint8_t *numberOfRanges)
#ifdef CFURL_INCLUDE_PARSE_COMPONENTS // defined when we want this block of code included
/*
 Shared body of _parseComponentsCString() and _parseComponentsUString(). The including file
 writes the function header and then includes this block, so characterArray is either a
 "const char *" or a "const UniChar *".

 Inputs:
   baseURL         - consulted only when the path substring is empty, to decide whether the URL is a directory
   cfStringLength  - number of characters in characterArray
   characterArray  - the characters of the URL string
   theFlags        - in/out; read on entry, updated with the component/property flags found here
 Outputs:
   packedRanges    - ranges of the components that exist, in flag-bit order, absent components skipped
   numberOfRanges  - number of ranges reported for packedRanges
 alloc is not used by this body.

 If the string has a scheme that is not followed by '/', only the scheme range is reported
 (numberOfRanges is 1) and IS_DECOMPOSABLE is left unset.
 */
{
    /* index gives the URL part involved; to calculate the correct range index, use the number of the bit of the equivalent flag (i.e. the host flag is HAS_HOST, which is 0x8, so the range index for the host is 3.) Note that this is true in this function ONLY, since the ranges stored in packedRanges are actually packed, skipping those URL components that don't exist. This is why the indices are hard-coded in this function. */
    enum {
        scheme_index = 0,
        user_index = 1,
        password_index = 2,
        host_index = 3,
        port_index = 4,
        path_index = 5,
        parameters_index = 6,
        query_index = 7,
        fragment_index = 8,
    };
    CFRange unpackedRanges[MAX_COMPONENTS];

    CFIndex idx, base_idx = 0;
    CFIndex string_length;
    UInt32 flags = *theFlags;
    Boolean isCompliant;
    uint8_t numRanges = 0;

    string_length = cfStringLength;

    // Algorithm is as described in RFC 1808
    // 1: parse the fragment; remainder after left-most "#" is fragment
    for (idx = base_idx; idx < string_length; idx++) {
        if ('#' == characterArray[idx]) {
            flags |= HAS_FRAGMENT;
            unpackedRanges[fragment_index].location = idx + 1;
            unpackedRanges[fragment_index].length = string_length - (idx + 1);
            numRanges++;
            string_length = idx; // remove fragment from parse string
            break;
        }
    }
    // 2: parse the scheme
    for (idx = base_idx; idx < string_length; idx++) {
        UniChar ch = characterArray[idx];
        if (':' == ch) {
            flags |= HAS_SCHEME;
            unpackedRanges[scheme_index].location = base_idx;
            unpackedRanges[scheme_index].length = idx;
            numRanges++;
            base_idx = idx + 1;
            // Record well-known schemes in the flags. idx is the scheme length here,
            // so every characterArray index tested below is in bounds.
            if (idx == 3 && characterArray[0] == 'f' && characterArray[1] == 't' && characterArray[2] == 'p') {
                // optimization for ftp urls
                _setSchemeTypeInFlags(&flags, kHasFtpScheme);
            }
            else if (idx == 4) {
                // optimization for http urls
                if (characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p') {
                    _setSchemeTypeInFlags(&flags, kHasHttpScheme);
                }
                // optimization for file urls
                if (characterArray[0] == 'f' && characterArray[1] == 'i' && characterArray[2] == 'l' && characterArray[3] == 'e') {
                    _setSchemeTypeInFlags(&flags, kHasFileScheme);
                }
                // optimization for data urls
                if (characterArray[0] == 'd' && characterArray[1] == 'a' && characterArray[2] == 't' && characterArray[3] == 'a') {
                    _setSchemeTypeInFlags(&flags, kHasDataScheme);
                }
            }
            else if (idx == 5 && characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p' && characterArray[4] == 's') {
                // optimization for https urls
                // (the fifth character, index 4, must be 's'; testing index 3 against both 'p' and 's' could never match)
                _setSchemeTypeInFlags(&flags, kHasHttpsScheme);
            }
            break;
        } else if (!scheme_valid(ch)) {
            break; // invalid scheme character -- no scheme
        }
    }

    // Make sure we have an RFC-1808 compliant URL - that's either something without a scheme, or scheme:/(stuff) or scheme://(stuff)
    // Strictly speaking, RFC 1808 & 2396 bar "scheme:" (with nothing following the colon); however, common usage
    // expects this to be treated identically to "scheme://" - REW, 12/08/03
    if (!(flags & HAS_SCHEME)) {
        isCompliant = true;
    } else if (base_idx == string_length) {
        isCompliant = false;
    } else if (characterArray[base_idx] != '/') {
        isCompliant = false;
    } else {
        isCompliant = true;
    }

    if (!isCompliant) {
        // Clear the fragment flag if it's been set
        if (flags & HAS_FRAGMENT) {
            flags &= (~HAS_FRAGMENT);
            string_length = cfStringLength;
        }
        // Only the scheme is reported for a non-decomposable URL.
        (*theFlags) = flags;
        packedRanges[scheme_index].location = unpackedRanges[scheme_index].location;
        packedRanges[scheme_index].length = unpackedRanges[scheme_index].length;
        (*numberOfRanges) = 1;
        return;
    }
    // URL is 1808-compliant
    flags |= IS_DECOMPOSABLE;

    // 3: parse the network location and login
    if (2 <= (string_length - base_idx) && '/' == characterArray[base_idx] && '/' == characterArray[base_idx+1]) {
        CFIndex base = 2 + base_idx, extent;
        for (idx = base; idx < string_length; idx++) {
            if ('/' == characterArray[idx] || '?' == characterArray[idx]) {
                break;
            }
        }
        extent = idx;

        // net_loc parts extend from base to extent (but not including), which might be to end of string
        // net location is "<user>:<password>@<host>:<port>"
        if (extent != base) {
            for (idx = base; idx < extent; idx++) {
                if ('@' == characterArray[idx]) { // there is a user
                    CFIndex idx2;
                    flags |= HAS_USER;
                    numRanges++;
                    unpackedRanges[user_index].location = base; // base of the user
                    for (idx2 = base; idx2 < idx; idx2++) {
                        if (':' == characterArray[idx2]) { // found a password separator
                            flags |= HAS_PASSWORD;
                            numRanges++;
                            unpackedRanges[password_index].location = idx2+1; // base of the password
                            unpackedRanges[password_index].length = idx-(idx2+1); // password extent
                            unpackedRanges[user_index].length = idx2 - base; // user extent
                            break;
                        }
                    }
                    if (!(flags & HAS_PASSWORD)) {
                        // user extends to the '@'
                        unpackedRanges[user_index].length = idx - base; // user extent
                    }
                    base = idx + 1;
                    break;
                }
            }
            flags |= HAS_HOST;
            numRanges++;
            unpackedRanges[host_index].location = base; // base of host

            // base has been advanced past the user and password if they existed
            for (idx = base; idx < extent; idx++) {
                // IPV6 support (RFC 2732) DCJ June/10/2002
                if ('[' == characterArray[idx]) { // starting IPV6 explicit address
                    // Find the ']' terminator of the IPv6 address, leave idx pointing to ']' or end
                    for ( ; idx < extent; ++idx) {
                        if (']' == characterArray[idx]) {
                            flags |= IS_IPV6_ENCODED;
                            break;
                        }
                    }
                }
                // there is a port if we see a colon. Only the last one is the port, though.
                else if (':' == characterArray[idx]) {
                    flags |= HAS_PORT;
                    numRanges++;
                    unpackedRanges[port_index].location = idx+1; // base of port
                    unpackedRanges[port_index].length = extent - (idx+1); // port extent
                    unpackedRanges[host_index].length = idx - base; // host extent
                    break;
                }
            }
            if (!(flags & HAS_PORT)) {
                unpackedRanges[host_index].length = extent - base; // host extent
            }
        }
        base_idx = extent;
    }

    // 4: parse the query; remainder after left-most "?" is query
    for (idx = base_idx; idx < string_length; idx++) {
        if ('?' == characterArray[idx]) {
            flags |= HAS_QUERY;
            numRanges++;
            unpackedRanges[query_index].location = idx + 1;
            unpackedRanges[query_index].length = string_length - (idx+1);
            string_length = idx; // remove query from parse string
            break;
        }
    }

    // 5: parse the parameters; remainder after left-most ";" is parameters
    for (idx = base_idx; idx < string_length; idx++) {
        if (';' == characterArray[idx]) {
            flags |= HAS_PARAMETERS;
            numRanges++;
            unpackedRanges[parameters_index].location = idx + 1;
            unpackedRanges[parameters_index].length = string_length - (idx+1);
            string_length = idx; // remove parameters from parse string
            break;
        }
    }

    // 6: parse the path; it's whatever's left between string_length & base_idx
    if (string_length - base_idx != 0 || (flags & NET_LOCATION_MASK))
    {
        // If we have a net location, we are 1808-compliant, and an empty path substring implies a path of "/"
        UniChar ch;
        Boolean isDir;
        CFRange pathRg;
        flags |= HAS_PATH;
        numRanges++;
        pathRg.location = base_idx;
        pathRg.length = string_length - base_idx;
        unpackedRanges[path_index] = pathRg;

        if (pathRg.length > 0) {
            Boolean sawPercent = FALSE;
            for (idx = pathRg.location; idx < string_length; idx++) {
                if ('%' == characterArray[idx]) {
                    sawPercent = TRUE;
                    break;
                }
            }
#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI
            // A path beginning with "/.file/" is flagged as carrying a file ID.
            if (pathRg.length > 6 && characterArray[pathRg.location] == '/' && characterArray[pathRg.location + 1] == '.' && characterArray[pathRg.location + 2] == 'f' && characterArray[pathRg.location + 3] == 'i' && characterArray[pathRg.location + 4] == 'l' && characterArray[pathRg.location + 5] == 'e' && characterArray[pathRg.location + 6] == '/') {
                flags |= PATH_HAS_FILE_ID;
            } else if (!sawPercent) {
                flags |= POSIX_AND_URL_PATHS_MATCH;
            }
#elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_WINDOWS
            if (!sawPercent) {
                flags |= POSIX_AND_URL_PATHS_MATCH;
            }
#endif

            // The path names a directory when it ends in "/", or when its last segment is "." or "..".
            ch = characterArray[pathRg.location + pathRg.length - 1];
            if (ch == '/') {
                isDir = true;
            } else if (ch == '.') {
                if (pathRg.length == 1) {
                    isDir = true;
                } else {
                    ch = characterArray[pathRg.location + pathRg.length - 2];
                    if (ch == '/') {
                        isDir = true;
                    } else if (ch != '.') {
                        isDir = false;
                    } else if (pathRg.length == 2) {
                        isDir = true;
                    } else {
                        isDir = (characterArray[pathRg.location + pathRg.length - 3] == '/');
                    }
                }
            } else {
                isDir = false;
            }
        } else {
            isDir = (baseURL != NULL) ? CFURLHasDirectoryPath(baseURL) : false;
        }
        if (isDir) {
            flags |= IS_DIRECTORY;
        }
    }

    (*theFlags) = flags;
    *numberOfRanges = numRanges;

    // Pack the ranges: walk the nine component flag bits in order and copy the range of each
    // component that is present, so packedRanges has no gaps.
    numRanges = 0;
    for (idx = 0, flags = 1; flags != (1<<9); flags = (flags<<1), idx++) {
        if ((*theFlags) & flags) {
            packedRanges[numRanges] = unpackedRanges[idx];
            numRanges++;
        }
    }
}

#endif // CFURL_INCLUDE_PARSE_COMPONENTS
320 static Boolean scanCharactersCString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const char *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
321 static Boolean scanCharactersUString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const UniChar *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
#ifdef CFURL_INCLUDE_SCAN_CHARACTERS // defined when we want this block of code included
/*
 Shared body of scanCharactersCString() and scanCharactersUString(); the including file supplies
 the function header, so characterArray is either a "const char *" or a "const UniChar *".

 Scans characterArray[base, end) for characters that must be percent-escaped. For each one found:
   - componentFlag is OR'ed into *flags (when both componentFlag and flags are non-zero),
   - *escapedString is created if it does not exist yet,
   - the characters from *mark up to the offending one are appended unchanged,
   - the offending character is appended as percent escapes in the given encoding,
   - *mark is moved just past the offending character.
 Returns true if at least one character had to be escaped.
 */
{
    Boolean foundCharToEscape = false;
    CFIndex pos;

    for (pos = base; pos < end; pos++) {
        const UniChar current = characterArray[pos];
        Boolean keepAsIs;

        if (isURLLegalCharacter(current)) {
            // '/', '?' and '@' are legal URL characters, but not inside a user name or password.
            const Boolean inUserInfo = (componentFlag == HAS_USER || componentFlag == HAS_PASSWORD);
            keepAsIs = !(inUserInfo && (current == '/' || current == '?' || current == '@'));
        } else {
            // An existing "%XX" escape is left alone.
            const Boolean isPercentEscape = (current == '%' && pos + 2 < end && isHexDigit(characterArray[pos + 1]) && isHexDigit(characterArray[pos + 2]));
            // A host may be wrapped in '[' ... ']'.
            const Boolean isHostBracket = (componentFlag == HAS_HOST && ((pos == base && current == '[') || (pos == end - 1 && current == ']')));
            keepAsIs = (isPercentEscape || isHostBracket);
        }

        if (keepAsIs) {
            continue;
        }

        foundCharToEscape = true;
        if (componentFlag && flags) {
            *flags |= componentFlag;
        }
        if (!*escapedString) {
            *escapedString = CFStringCreateMutable(alloc, 0);
        }

        // Flush the run of unescaped characters that precedes this one.
        if (useCString) {
            // 8-bit characters: wrap the bytes as an ISO Latin-1 string before appending.
            CFStringRef pendingRun = CFStringCreateWithBytes(alloc, (uint8_t *)&(characterArray[*mark]), pos - *mark, kCFStringEncodingISOLatin1, false);
            CFStringAppend(*escapedString, pendingRun);
            CFRelease(pendingRun);
        } else {
            CFStringAppendCharacters(*escapedString, (const UniChar *)&(characterArray[*mark]), pos - *mark);
        }
        *mark = pos + 1;

        _appendPercentEscapesForCharacter(current, encoding, *escapedString); // This can never fail because anURL->_string was constructed from the encoding passed in
    }

    return foundCharToEscape;
}
#endif // CFURL_INCLUDE_SCAN_CHARACTERS