]> git.saurik.com Git - apple/cf.git/blob - CFURL.inc.h
93d73c55b18f93d34cf8f889047c5c5f758826a1
[apple/cf.git] / CFURL.inc.h
1 /*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /* CFURL.inc.h
25 Copyright (c) 2012-2013, Apple Inc. All rights reserved.
26 Responsibility: Jim Luther
27 */
28
29
30 /*
31
32 What's this file for?
33
34 CFURL's URL string parser needs to be able to parse either an array of char or an array of UniChar.
35
36 The code in CFURL.c used to use this macro "#define STRING_CHAR(x) (useCString ? cstring[(x)] : ustring[(x)])" to determine which array to get a character from for every character looked at in the URL string. That macro added one or more compare and branch instructions to the parser's execution for *every* character in the URL string. Those extra compares and branches added up to 10% of the time (for long URL strings) it takes to create a URL object.
37
38 To ensure the exact same parser code is run over a char or a UniChar string, the source code was moved to this .h file and is included multiple times by CFURL.c as needed. "STRING_CHAR(x)" was replaced by "characterArray[x]", and characterArray is defined as either a "const char *" or a "const UniChar *" for the two sets of function headers that are either parsing an array of char or an array of UniChar.
39
40 Any changes made to the parser are made in this file so that both char and the UniChar strings are parsed exactly the same way.
41
42 */
43
44 /*
45 static void _parseComponentsCString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const char *characterArray)
46 static void _parseComponentsUString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const UniChar *characterArray)
47 */
#ifdef CFURL_INCLUDE_PARSE_COMPONENTS   // defined when we want this block of code included
/*
    Shared function body for _parseComponentsCString() and _parseComponentsUString().

    Splits the URL string held in characterArray (cfStringLength characters) into its
    components, following the parsing steps of RFC 1808.

    Parameters (supplied by the including function header):
        alloc            allocator used for the CFRange array returned through *range
        baseURL          only consulted (when non-NULL) to decide IS_DIRECTORY for an empty path
        theFlags         in/out; read on entry, rewritten with the component/scheme flags found
        range            out; receives a newly allocated, packed array of component ranges
        cfStringLength   number of characters in characterArray
        characterArray   the characters to parse (const char * or const UniChar *)

    On return, *range holds one CFRange per component flag set in the low nine bits of
    *theFlags, in ascending flag-bit order. If the string has a scheme that is not followed
    by '/', the URL is treated as non-decomposable: only the scheme range is returned and
    IS_DECOMPOSABLE is not set.
*/
{
    CFRange ranges[9];
    /* index gives the URL part involved; to calculate the correct range index, use the number of the bit of the equivalent flag (i.e. the host flag is HAS_HOST, which is 0x8. so the range index for the host is 3.)  Note that this is true in this function ONLY, since the ranges stored in (*range) are actually packed, skipping those URL components that don't exist.  This is why the indices are hard-coded in this function. */

    CFIndex idx, base_idx = 0;
    CFIndex string_length;
    UInt32 flags = *theFlags;
    Boolean isCompliant;
    uint8_t numRanges = 0;  // number of components found so far; sizes the final allocation

    string_length = cfStringLength;

    // Algorithm is as described in RFC 1808
    // 1: parse the fragment; remainder after left-most "#" is fragment
    for (idx = base_idx; idx < string_length; idx++) {
        if ('#' == characterArray[idx]) {
            flags |= HAS_FRAGMENT;
            ranges[8].location = idx + 1;
            ranges[8].length = string_length - (idx + 1);
            numRanges ++;
            string_length = idx;    // remove fragment from parse string
            break;
        }
    }
    // 2: parse the scheme
    for (idx = base_idx; idx < string_length; idx++) {
        UniChar ch = characterArray[idx];
        if (':' == ch) {
            flags |= HAS_SCHEME;
            ranges[0].location = base_idx;
            ranges[0].length = idx;
            numRanges ++;
            base_idx = idx + 1;
            // idx is the scheme length here (the scheme starts at index 0), so it selects which fast path to try
            // optimization for ftp urls
            if (idx == 3 && characterArray[0] == 'f' && characterArray[1] == 't' && characterArray[2] == 'p') {
                _setSchemeTypeInFlags(&flags, kHasFtpScheme);
            }
            else if (idx == 4) {
                // optimization for http urls
                if (characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p') {
                    _setSchemeTypeInFlags(&flags, kHasHttpScheme);
                }
                // optimization for file urls
                if (characterArray[0] == 'f' && characterArray[1] == 'i' && characterArray[2] == 'l' && characterArray[3] == 'e') {
                    _setSchemeTypeInFlags(&flags, kHasFileScheme);
                }
                // optimization for data urls
                if (characterArray[0] == 'd' && characterArray[1] == 'a' && characterArray[2] == 't' && characterArray[3] == 'a') {
                    _setSchemeTypeInFlags(&flags, kHasDataScheme);
                }
            }
            // optimization for https urls; the trailing 's' is the fifth character (index 4)
            else if (idx == 5 && characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p' && characterArray[4] == 's') {
                _setSchemeTypeInFlags(&flags, kHasHttpsScheme);
            }
            break;
        } else if (!scheme_valid(ch)) {
            break;  // invalid scheme character -- no scheme
        }
    }

    // Make sure we have an RFC-1808 compliant URL - that's either something without a scheme, or scheme:/(stuff) or scheme://(stuff)
    // Strictly speaking, RFC 1808 & 2396 bar "scheme:" (with nothing following the colon); however, common usage
    // expects this to be treated identically to "scheme://" - REW, 12/08/03
    if (!(flags & HAS_SCHEME)) {
        isCompliant = true;
    } else if (base_idx == string_length) {
        isCompliant = false;
    } else if (characterArray[base_idx] != '/') {
        isCompliant = false;
    } else {
        isCompliant = true;
    }

    if (!isCompliant) {
        // Clear the fragment flag if it's been set
        if (flags & HAS_FRAGMENT) {
            flags &= (~HAS_FRAGMENT);
            string_length = cfStringLength;
        }
        // Non-compliant URLs always have a scheme (see above), so ranges[0] is valid; it is the only range returned
        (*theFlags) = flags;
        (*range) = (CFRange *)CFAllocatorAllocate(alloc, sizeof(CFRange), 0);
        (*range)->location = ranges[0].location;
        (*range)->length = ranges[0].length;

        return;
    }
    // URL is 1808-compliant
    flags |= IS_DECOMPOSABLE;

    // 3: parse the network location and login
    if (2 <= (string_length - base_idx) && '/' == characterArray[base_idx] && '/' == characterArray[base_idx+1]) {
        CFIndex base = 2 + base_idx, extent;
        for (idx = base; idx < string_length; idx++) {
            if ('/' == characterArray[idx] || '?' == characterArray[idx]) {
                break;
            }
        }
        extent = idx;

        // net_loc parts extend from base to extent (but not including), which might be to end of string
        // net location is "<user>:<password>@<host>:<port>"
        if (extent != base) {
            for (idx = base; idx < extent; idx++) {
                if ('@' == characterArray[idx]) {   // there is a user
                    CFIndex idx2;
                    flags |= HAS_USER;
                    numRanges ++;
                    ranges[1].location = base;  // base of the user
                    for (idx2 = base; idx2 < idx; idx2++) {
                        if (':' == characterArray[idx2]) {  // found a password separator
                            flags |= HAS_PASSWORD;
                            numRanges ++;
                            ranges[2].location = idx2+1;        // base of the password
                            ranges[2].length = idx-(idx2+1);    // password extent
                            ranges[1].length = idx2 - base;     // user extent
                            break;
                        }
                    }
                    if (!(flags & HAS_PASSWORD)) {
                        // user extends to the '@'
                        ranges[1].length = idx - base;  // user extent
                    }
                    base = idx + 1;
                    break;
                }
            }
            flags |= HAS_HOST;
            numRanges ++;
            ranges[3].location = base;  // base of host

            // base has been advanced past the user and password if they existed
            for (idx = base; idx < extent; idx++) {
                // IPV6 support (RFC 2732) DCJ June/10/2002
                if ('[' == characterArray[idx]) {   // starting IPV6 explicit address
                    // Find the ']' terminator of the IPv6 address, leave idx pointing to ']' or end
                    for ( ; idx < extent; ++ idx ) {
                        if ( ']' == characterArray[idx]) {
                            flags |= IS_IPV6_ENCODED;
                            break;
                        }
                    }
                }
                // there is a port if we see a colon.  Only the last one is the port, though.
                else if ( ':' == characterArray[idx]) {
                    flags |= HAS_PORT;
                    numRanges ++;
                    ranges[4].location = idx+1; // base of port
                    ranges[4].length = extent - (idx+1);    // port extent
                    ranges[3].length = idx - base;  // host extent
                    break;
                }
            }
            if (!(flags & HAS_PORT)) {
                ranges[3].length = extent - base;   // host extent
            }
        }
        base_idx = extent;
    }

    // 4: parse the query; remainder after left-most "?" is query
    for (idx = base_idx; idx < string_length; idx++) {
        if ('?' == characterArray[idx]) {
            flags |= HAS_QUERY;
            numRanges ++;
            ranges[7].location = idx + 1;
            ranges[7].length = string_length - (idx+1);
            string_length = idx;    // remove query from parse string
            break;
        }
    }

    // 5: parse the parameters; remainder after left-most ";" is parameters
    for (idx = base_idx; idx < string_length; idx++) {
        if (';' == characterArray[idx]) {
            flags |= HAS_PARAMETERS;
            numRanges ++;
            ranges[6].location = idx + 1;
            ranges[6].length = string_length - (idx+1);
            string_length = idx;    // remove parameters from parse string
            break;
        }
    }

    // 6: parse the path; it's whatever's left between string_length & base_idx
    if (string_length - base_idx != 0 || (flags & NET_LOCATION_MASK))
    {
        // If we have a net location, we are 1808-compliant, and an empty path substring implies a path of "/"
        UniChar ch;
        Boolean isDir;
        CFRange pathRg;
        flags |= HAS_PATH;
        numRanges ++;
        pathRg.location = base_idx;
        pathRg.length = string_length - base_idx;
        ranges[5] = pathRg;

        if (pathRg.length > 0) {
            Boolean sawPercent = FALSE;
            for (idx = pathRg.location; idx < string_length; idx++) {
                if ('%' == characterArray[idx]) {
                    sawPercent = TRUE;
                    break;
                }
            }
#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI
            // A path beginning with "/.file/" is flagged as a file ID path
            if (pathRg.length > 6 && characterArray[pathRg.location] == '/' && characterArray[pathRg.location + 1] == '.' && characterArray[pathRg.location + 2] == 'f' && characterArray[pathRg.location + 3] == 'i' && characterArray[pathRg.location + 4] == 'l' && characterArray[pathRg.location + 5] == 'e' && characterArray[pathRg.location + 6] == '/') {
                flags |= PATH_HAS_FILE_ID;
            } else if (!sawPercent) {
                flags |= POSIX_AND_URL_PATHS_MATCH;
            }
#elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_WINDOWS
            if (!sawPercent) {
                flags |= POSIX_AND_URL_PATHS_MATCH;
            }
#endif

            // The path is a directory if it ends in "/", or if its last segment is "." or ".."
            ch = characterArray[pathRg.location + pathRg.length - 1];
            if (ch == '/') {
                isDir = true;
            } else if (ch == '.') {
                if (pathRg.length == 1) {
                    isDir = true;
                } else {
                    ch = characterArray[pathRg.location + pathRg.length - 2];
                    if (ch == '/') {
                        isDir = true;
                    } else if (ch != '.') {
                        isDir = false;
                    } else if (pathRg.length == 2) {
                        isDir = true;
                    } else {
                        isDir = (characterArray[pathRg.location + pathRg.length - 3] == '/');
                    }
                }
            } else {
                isDir = false;
            }
        } else {
            // Empty path: use the base URL's answer when there is one
            isDir = (baseURL != NULL) ? CFURLHasDirectoryPath(baseURL) : false;
        }
        if (isDir) {
            flags |= IS_DIRECTORY;
        }
    }

    (*theFlags) = flags;
    (*range) = (CFRange *)CFAllocatorAllocate(alloc, sizeof(CFRange)*numRanges, 0);
    // Pack the ranges: walk the nine component flag bits in order (flags is reused here as the bit mask)
    // and copy only the ranges whose component was found.
    numRanges = 0;
    for (idx = 0, flags = 1; flags != (1<<9); flags = (flags<<1), idx ++) {
        if ((*theFlags) & flags) {
            (*range)[numRanges] = ranges[idx];
            numRanges ++;
        }
    }
}
#endif  // CFURL_INCLUDE_PARSE_COMPONENTS
306
307 /*
308 static Boolean scanCharactersCString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const char *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
309 static Boolean scanCharactersUString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const UniChar *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
310 */
#ifdef CFURL_INCLUDE_SCAN_CHARACTERS    // defined when we want this block of code included
/*
    Shared function body for scanCharactersCString() and scanCharactersUString().

    Walks characterArray over [base, end) and percent-escapes every character that needs it.
    For each such character:
        - componentFlag is OR'ed into *flags (when both componentFlag and flags are non-zero),
        - *escapedString is created if it does not exist yet,
        - the unescaped run from *mark up to the character is appended to *escapedString,
        - *mark is moved just past the character,
        - the character's percent escapes (for the given encoding) are appended.

    Returns true if at least one character was escaped, false otherwise.
*/
{
    Boolean escapedAny = false;
    CFIndex pos = base;

    while (pos < end) {
        UniChar current = characterArray[pos];
        Boolean needsEscape = true;     // assume the worst; the cases below clear it

        if (isURLLegalCharacter(current)) {
            // Legal URL characters are kept, except the delimiters that would be ambiguous inside a user or password
            Boolean inUserOrPassword = (componentFlag == HAS_USER || componentFlag == HAS_PASSWORD);
            needsEscape = inUserOrPassword && (current == '/' || current == '?' || current == '@');
        } else if (current == '%' && pos + 2 < end && isHexDigit(characterArray[pos + 1]) && isHexDigit(characterArray[pos + 2])) {
            // '%' followed by two hex digits is kept as is
            needsEscape = false;
        } else if (componentFlag == HAS_HOST && ((pos == base && current == '[') || (pos == end - 1 && current == ']'))) {
            // A leading '[' or trailing ']' of the host is kept as is
            needsEscape = false;
        }

        if (needsEscape) {
            escapedAny = true;
            if (flags && componentFlag) {
                *flags |= componentFlag;
            }
            if (!(*escapedString)) {
                *escapedString = CFStringCreateMutable(alloc, 0);
            }
            // Flush the pending run of characters that did not need escaping
            if (useCString) {
                CFStringRef pendingRun = CFStringCreateWithBytes(alloc, (uint8_t *)&(characterArray[*mark]), pos - *mark, kCFStringEncodingISOLatin1, false);
                CFStringAppend(*escapedString, pendingRun);
                CFRelease(pendingRun);
            } else {
                CFStringAppendCharacters(*escapedString, (const UniChar *)&(characterArray[*mark]), pos - *mark);
            }
            *mark = pos + 1;
            _appendPercentEscapesForCharacter(current, encoding, *escapedString); // This can never fail because anURL->_string was constructed from the encoding passed in
        }

        pos++;
    }

    return escapedAny;
}
#endif  // CFURL_INCLUDE_SCAN_CHARACTERS