]> git.saurik.com Git - apple/cf.git/blob - CFURL.inc.h
CF-1152.14.tar.gz
[apple/cf.git] / CFURL.inc.h
1 /*
2 * Copyright (c) 2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /* CFURL.inc.h
25 Copyright (c) 2012-2014, Apple Inc. All rights reserved.
26 Responsibility: Jim Luther/Chris Linn
27 */
28
29
30 /*
31
32 What's this file for?
33
34 CFURL's URL string parser needs to be able to parse either an array of char or an array of UniChar.
35
36 The code in CFURL.c used to use this macro "#define STRING_CHAR(x) (useCString ? cstring[(x)] : ustring[(x)])" to determine which array to get a character from for every character looked at in the URL string. That macro added one or more compare and branch instructions to the parser's execution for *every* character in the URL string. Those extra compares and branches added up to 10% of the time (for long URL strings) it takes to create a URL object.
37
38 To ensure the exact same parser code is run over a char or a UniChar string, the source code was moved to this .h file and is included multiple times by CFURL.c as needed. "STRING_CHAR(x)" was replaced by "characterArray[x]", and characterArray is defined as either a "const char *" or a "const UniChar *" for the two sets of function headers that are either parsing an array of char or an array of UniChar.
39
40 Any changes made to the parser are made in this file so that both char and the UniChar strings are parsed exactly the same way.
41
42 */
43
44 /*
45 static void _parseComponentsCString(CFAllocatorRef alloc, CFURLRef baseURL, CFIndex cfStringLength, const char *characterArray, UInt32 *theFlags, CFRange *packedRanges, uint8_t *numberOfRanges)
46 or
47 static void _parseComponentsUString(CFAllocatorRef alloc, CFURLRef baseURL, CFIndex cfStringLength, const UniChar *characterArray, UInt32 *theFlags, CFRange *packedRanges, uint8_t *numberOfRanges)
48 */
#ifdef CFURL_INCLUDE_PARSE_COMPONENTS   // defined when we want this block of code included
/*
 Shared function body for _parseComponentsCString() / _parseComponentsUString().

 Splits the cfStringLength characters of characterArray into URL components
 (scheme, user, password, host, port, path, parameters, query, fragment).

 Values read by this body:
   baseURL         - only consulted (through CFURLHasDirectoryPath) when the parsed path is empty
   cfStringLength  - number of characters available in characterArray
   characterArray  - the URL characters (char or UniChar, depending on the including function)
   theFlags        - starting flag bits

 Values written by this body:
   *theFlags       - starting bits plus the HAS_xxx / IS_xxx / POSIX_AND_URL_PATHS_MATCH /
                     PATH_HAS_FILE_ID bits and the scheme type discovered while parsing
   packedRanges    - the ranges of the components that exist, in component order, with no gaps
   *numberOfRanges - how many ranges are reported in packedRanges

 When the string has a scheme that is not followed by '/', the URL is treated as
 not decomposable: only the scheme range is reported and *numberOfRanges is 1.

 (alloc is not referenced by this body.)
*/
{
    /* index gives the URL part involved; to calculate the correct range index, use the number of the bit of the equivalent flag (i.e. the host flag is HAS_HOST, which is 0x8. so the range index for the host is 3.)  Note that this is true in this function ONLY, since the ranges stored in (*range) are actually packed, skipping those URL components that don't exist.  This is why the indices are hard-coded in this function. */
    enum {
        scheme_index        = 0,
        user_index          = 1,
        password_index      = 2,
        host_index          = 3,
        port_index          = 4,
        path_index          = 5,
        parameters_index    = 6,
        query_index         = 7,
        fragment_index      = 8,
    };
    CFRange unpackedRanges[MAX_COMPONENTS];

    CFIndex idx, base_idx = 0;
    CFIndex string_length;
    UInt32 flags = *theFlags;
    Boolean isCompliant;
    uint8_t numRanges = 0;

    string_length = cfStringLength;

    // Algorithm is as described in RFC 1808
    // 1: parse the fragment; remainder after left-most "#" is fragment
    for (idx = base_idx; idx < string_length; idx++) {
        if ('#' == characterArray[idx]) {
            flags |= HAS_FRAGMENT;
            unpackedRanges[fragment_index].location = idx + 1;
            unpackedRanges[fragment_index].length = string_length - (idx + 1);
            numRanges ++;
            string_length = idx;    // remove fragment from parse string
            break;
        }
    }
    // 2: parse the scheme
    for (idx = base_idx; idx < string_length; idx++) {
        UniChar ch = characterArray[idx];
        if (':' == ch) {
            flags |= HAS_SCHEME;
            unpackedRanges[scheme_index].location = base_idx;
            unpackedRanges[scheme_index].length = idx;
            numRanges ++;
            base_idx = idx + 1;
            // Record well-known schemes in the flags. idx is the scheme length here,
            // so characterArray[0 .. idx-1] are the scheme characters.
            // optimization for ftp urls
            if (idx == 3 && characterArray[0] == 'f' && characterArray[1] == 't' && characterArray[2] == 'p') {
                _setSchemeTypeInFlags(&flags, kHasFtpScheme);
            }
            else if (idx == 4) {
                // The three 4-character schemes differ in their first character, so at most one matches.
                // optimization for http urls
                if (characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p') {
                    _setSchemeTypeInFlags(&flags, kHasHttpScheme);
                }
                // optimization for file urls
                else if (characterArray[0] == 'f' && characterArray[1] == 'i' && characterArray[2] == 'l' && characterArray[3] == 'e') {
                    _setSchemeTypeInFlags(&flags, kHasFileScheme);
                }
                // optimization for data urls
                else if (characterArray[0] == 'd' && characterArray[1] == 'a' && characterArray[2] == 't' && characterArray[3] == 'a') {
                    _setSchemeTypeInFlags(&flags, kHasDataScheme);
                }
            }
            // optimization for https urls; the trailing 's' is the fifth character (index 4)
            else if (idx == 5 && characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p' && characterArray[4] == 's') {
                _setSchemeTypeInFlags(&flags, kHasHttpsScheme);
            }
            break;
        } else if (!scheme_valid(ch)) {
            break;  // invalid scheme character -- no scheme
        }
    }

    // Make sure we have an RFC-1808 compliant URL - that's either something without a scheme, or scheme:/(stuff) or scheme://(stuff)
    // Strictly speaking, RFC 1808 & 2396 bar "scheme:" (with nothing following the colon); however, common usage
    // expects this to be treated identically to "scheme://" - REW, 12/08/03
    if (!(flags & HAS_SCHEME)) {
        isCompliant = true;
    } else if (base_idx == string_length) {
        isCompliant = false;
    } else if (characterArray[base_idx] != '/') {
        isCompliant = false;
    } else {
        isCompliant = true;
    }

    if (!isCompliant) {
        // Clear the fragment flag if it's been set
        if (flags & HAS_FRAGMENT) {
            flags &= (~HAS_FRAGMENT);
            string_length = cfStringLength;
        }
        // Only the scheme is reported for a non-decomposable URL.
        (*theFlags) = flags;
        packedRanges[scheme_index].location = unpackedRanges[scheme_index].location;
        packedRanges[scheme_index].length = unpackedRanges[scheme_index].length;
        *numberOfRanges = 1;
    }
    else {
        // URL is 1808-compliant
        flags |= IS_DECOMPOSABLE;

        // 3: parse the network location and login
        if (2 <= (string_length - base_idx) && '/' == characterArray[base_idx] && '/' == characterArray[base_idx+1]) {
            CFIndex base = 2 + base_idx, extent;
            for (idx = base; idx < string_length; idx++) {
                if ('/' == characterArray[idx] || '?' == characterArray[idx]) {
                    break;
                }
            }
            extent = idx;

            // net_loc parts extend from base to extent (but not including), which might be to end of string
            // net location is "<user>:<password>@<host>:<port>"
            if (extent != base) {
                for (idx = base; idx < extent; idx++) {
                    if ('@' == characterArray[idx]) {   // there is a user
                        CFIndex idx2;
                        flags |= HAS_USER;
                        numRanges ++;
                        unpackedRanges[user_index].location = base;  // base of the user
                        for (idx2 = base; idx2 < idx; idx2++) {
                            if (':' == characterArray[idx2]) {  // found a password separator
                                flags |= HAS_PASSWORD;
                                numRanges ++;
                                unpackedRanges[password_index].location = idx2+1; // base of the password
                                unpackedRanges[password_index].length = idx-(idx2+1);  // password extent
                                unpackedRanges[user_index].length = idx2 - base; // user extent
                                break;
                            }
                        }
                        if (!(flags & HAS_PASSWORD)) {
                            // user extends to the '@'
                            unpackedRanges[user_index].length = idx - base; // user extent
                        }
                        base = idx + 1;
                        break;
                    }
                }
                flags |= HAS_HOST;
                numRanges ++;
                unpackedRanges[host_index].location = base; // base of host

                // base has been advanced past the user and password if they existed
                for (idx = base; idx < extent; idx++) {
                    // IPV6 support (RFC 2732) DCJ June/10/2002
                    if ('[' == characterArray[idx]) {   // starting IPV6 explicit address
                        // Find the ']' terminator of the IPv6 address, leave idx pointing to ']' or end
                        for ( ; idx < extent; ++ idx ) {
                            if ( ']' == characterArray[idx]) {
                                flags |= IS_IPV6_ENCODED;
                                break;
                            }
                        }
                    }
                    // there is a port if we see a colon.  Only the last one is the port, though.
                    else if ( ':' == characterArray[idx]) {
                        flags |= HAS_PORT;
                        numRanges ++;
                        unpackedRanges[port_index].location = idx+1; // base of port
                        unpackedRanges[port_index].length = extent - (idx+1); // port extent
                        unpackedRanges[host_index].length = idx - base; // host extent
                        break;
                    }
                }
                if (!(flags & HAS_PORT)) {
                    unpackedRanges[host_index].length = extent - base;  // host extent
                }
            }
            base_idx = extent;
        }

        // 4: parse the query; remainder after left-most "?" is query
        for (idx = base_idx; idx < string_length; idx++) {
            if ('?' == characterArray[idx]) {
                flags |= HAS_QUERY;
                numRanges ++;
                unpackedRanges[query_index].location = idx + 1;
                unpackedRanges[query_index].length = string_length - (idx+1);
                string_length = idx; // remove query from parse string
                break;
            }
        }

        // 5: parse the parameters; remainder after left-most ";" is parameters
        for (idx = base_idx; idx < string_length; idx++) {
            if (';' == characterArray[idx]) {
                flags |= HAS_PARAMETERS;
                numRanges ++;
                unpackedRanges[parameters_index].location = idx + 1;
                unpackedRanges[parameters_index].length = string_length - (idx+1);
                string_length = idx; // remove parameters from parse string
                break;
            }
        }

        // 6: parse the path; it's whatever's left between string_length & base_idx
        if (string_length - base_idx != 0 || (flags & NET_LOCATION_MASK))
        {
            // If we have a net location, we are 1808-compliant, and an empty path substring implies a path of "/"
            UniChar ch;
            Boolean isDir;
            CFRange pathRg;
            flags |= HAS_PATH;
            numRanges ++;
            pathRg.location = base_idx;
            pathRg.length = string_length - base_idx;
            unpackedRanges[path_index] = pathRg;

            if (pathRg.length > 0) {
                Boolean sawPercent = FALSE;
                for (idx = pathRg.location; idx < string_length; idx++) {
                    if ('%' == characterArray[idx]) {
                        sawPercent = TRUE;
                        break;
                    }
                }
#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI
                // A path beginning with "/.file/" is flagged as a file-ID path; otherwise a path
                // with no '%' in it is flagged as matching between POSIX and URL forms.
                if (pathRg.length > 6 && characterArray[pathRg.location] == '/' && characterArray[pathRg.location + 1] == '.' && characterArray[pathRg.location + 2] == 'f' && characterArray[pathRg.location + 3] == 'i' && characterArray[pathRg.location + 4] == 'l' && characterArray[pathRg.location + 5] == 'e' && characterArray[pathRg.location + 6] == '/') {
                    flags |= PATH_HAS_FILE_ID;
                } else if (!sawPercent) {
                    flags |= POSIX_AND_URL_PATHS_MATCH;
                }
#elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_WINDOWS
                if (!sawPercent) {
                    flags |= POSIX_AND_URL_PATHS_MATCH;
                }
#endif

                // The path is a directory when it ends in "/", or when its last segment is "." or "..".
                ch = characterArray[pathRg.location + pathRg.length - 1];
                if (ch == '/') {
                    isDir = true;
                } else if (ch == '.') {
                    if (pathRg.length == 1) {
                        isDir = true;
                    } else {
                        ch = characterArray[pathRg.location + pathRg.length - 2];
                        if (ch == '/') {
                            isDir = true;
                        } else if (ch != '.') {
                            isDir = false;
                        } else if (pathRg.length == 2) {
                            isDir = true;
                        } else {
                            isDir = (characterArray[pathRg.location + pathRg.length - 3] == '/');
                        }
                    }
                } else {
                    isDir = false;
                }
            } else {
                // Empty path: take the directory-ness from the base URL, if there is one.
                isDir = (baseURL != NULL) ? CFURLHasDirectoryPath(baseURL) : false;
            }
            if (isDir) {
                flags |= IS_DIRECTORY;
            }
        }

        (*theFlags) = flags;
        *numberOfRanges = numRanges;
        // Pack the ranges: walk the nine component flag bits (bit n <-> unpackedRanges[n]) and copy
        // only the ranges whose flag is set. 'flags' is reused here as the bit mask being tested.
        numRanges = 0;
        for (idx = 0, flags = 1; flags != (1<<9); flags = (flags<<1), idx ++) {
            if ((*theFlags) & flags) {
                packedRanges[numRanges] = unpackedRanges[idx];
                numRanges ++;
            }
        }
    }
}
#endif  // CFURL_INCLUDE_PARSE_COMPONENTS
318
319 /*
320 static Boolean scanCharactersCString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const char *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
321 static Boolean scanCharactersUString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const UniChar *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
322 */
#ifdef CFURL_INCLUDE_SCAN_CHARACTERS    // defined when we want this block of code included
/*
 Shared function body for scanCharactersCString() / scanCharactersUString().

 Walks characterArray[base .. end-1] and percent-escapes every character that must be escaped
 for the component identified by componentFlag. Each time such a character is found:
   - componentFlag is OR'ed into *flags (when both componentFlag and flags are non-zero/non-NULL),
   - *escapedString is created on first use,
   - the characters from *mark up to (not including) the offending character are appended to
     *escapedString, followed by the percent escapes for the offending character,
   - *mark is moved to just past the offending character.

 Returns true if at least one character had to be escaped, false otherwise.
*/
{
    const Boolean scanningLogin = (componentFlag == HAS_USER || componentFlag == HAS_PASSWORD);
    const Boolean scanningHost = (componentFlag == HAS_HOST);
    Boolean escapedSomething = false;
    CFIndex pos;

    for (pos = base; pos < end; pos++) {
        UniChar current = characterArray[pos];
        CFIndex pendingStart;
        CFIndex pendingLength;

        if (isURLLegalCharacter(current)) {
            // Legal URL characters pass through, except '/', '?' and '@' inside a user or password.
            if (!scanningLogin || (current != '/' && current != '?' && current != '@')) {
                continue;
            }
        } else if (current == '%' && pos + 2 < end && isHexDigit(characterArray[pos + 1]) && isHexDigit(characterArray[pos + 2])) {
            // '%' followed by two hex digits is left untouched.
            continue;
        } else if (scanningHost && ((pos == base && current == '[') || (pos == end - 1 && current == ']'))) {
            // A host may open with '[' and close with ']'.
            continue;
        }

        // Everything that reaches this point has to be percent-escaped.
        escapedSomething = true;
        if (componentFlag && flags) {
            *flags |= componentFlag;
        }
        if (!*escapedString) {
            *escapedString = CFStringCreateMutable(alloc, 0);
        }

        // Flush the run of unescaped characters that precedes the offending one.
        pendingStart = *mark;
        pendingLength = pos - pendingStart;
        if (useCString) {
            CFStringRef pendingChars = CFStringCreateWithBytes(alloc, (uint8_t *)&(characterArray[pendingStart]), pendingLength, kCFStringEncodingISOLatin1, false);
            CFStringAppend(*escapedString, pendingChars);
            CFRelease(pendingChars);
        } else {
            CFStringAppendCharacters(*escapedString, (const UniChar *)&(characterArray[pendingStart]), pendingLength);
        }
        *mark = pos + 1;
        _appendPercentEscapesForCharacter(current, encoding, *escapedString); // This can never fail because anURL->_string was constructed from the encoding passed in
    }

    return escapedSomething;
}
#endif  // CFURL_INCLUDE_SCAN_CHARACTERS