CFURL.inc.h

   1 /*
   2  * Copyright (c) 2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 /*      CFURL.inc.h
  25         Copyright (c) 2012-2013, Apple Inc. All rights reserved.
  26         Responsibility: Jim Luther
  27 */
  28
  29
  30 /*
  31
  32  What's this file for?
  33
  34  CFURL's URL string parser needs to be able to parse either an array of char or an array of UniChar.
  35
   36  The code in CFURL.c used to use this macro "#define STRING_CHAR(x) (useCString ? cstring[(x)] : ustring[(x)])" to determine which array to get a character from for every character looked at in the URL string. That macro added one or more compare and branch instructions to the parser's execution for *every* character in the URL string. Those extra compares and branches added up to 10% of the time (for long URL strings) it takes to create a URL object.
  37
   38  To ensure the exact same parser code is run over a char or a UniChar string, the source code was moved to this .h file and is included multiple times by CFURL.c as needed. "STRING_CHAR(x)" was replaced by "characterArray[x]", and characterArray is defined as either a "const char *" or a "const UniChar *" for the two sets of function headers that are either parsing an array of char or an array of UniChar.
  39
  40  Any changes made to the parser are made in this file so that both char and the UniChar strings are parsed exactly the same way.
  41
  42  */
  43
  44 /*
  45     static void _parseComponentsCString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const char *characterArray)
  46     static void _parseComponentsUString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const UniChar *characterArray)
  47  */
  48 #ifdef CFURL_INCLUDE_PARSE_COMPONENTS // defined when we want this block of code included
  49 {
  50     CFRange ranges[9];
  51     /* index gives the URL part involved; to calculate the correct range index, use the number of the bit of the equivalent flag (i.e. the host flag is HAS_HOST, which is 0x8.  so the range index for the host is 3.)  Note that this is true in this function ONLY, since the ranges stored in (*range) are actually packed, skipping those URL components that don't exist.  This is why the indices are hard-coded in this function. */
  52
  53     CFIndex idx, base_idx = 0;
  54     CFIndex string_length;
  55     UInt32 flags = *theFlags;
  56     Boolean isCompliant;
  57     uint8_t numRanges = 0;
  58
  59     string_length = cfStringLength;
  60
  61     // Algorithm is as described in RFC 1808
  62     // 1: parse the fragment; remainder after left-most "#" is fragment
  63     for (idx = base_idx; idx < string_length; idx++) {
  64         if ('#' == characterArray[idx]) {
  65             flags |= HAS_FRAGMENT;
  66             ranges[8].location = idx + 1;
  67             ranges[8].length = string_length - (idx + 1);
  68             numRanges ++;
  69             string_length = idx;        // remove fragment from parse string
  70             break;
  71         }
  72     }
  73     // 2: parse the scheme
  74     for (idx = base_idx; idx < string_length; idx++) {
  75         UniChar ch = characterArray[idx];
  76         if (':' == ch) {
  77             flags |= HAS_SCHEME;
  78             ranges[0].location = base_idx;
  79             ranges[0].length = idx;
  80             numRanges ++;
  81             base_idx = idx + 1;
  82             // optimization for ftp urls
  83             if (idx == 3 && characterArray[0] == 'f' && characterArray[1] == 't' && characterArray[2] == 'p') {
  84                 _setSchemeTypeInFlags(&flags, kHasFtpScheme);
  85             }
  86             else if (idx == 4) {
  87                 // optimization for http urls
  88                 if (characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p') {
  89                     _setSchemeTypeInFlags(&flags, kHasHttpScheme);
  90                 }
  91                 // optimization for file urls
  92                 if (characterArray[0] == 'f' && characterArray[1] == 'i' && characterArray[2] == 'l' && characterArray[3] == 'e') {
  93                     _setSchemeTypeInFlags(&flags, kHasFileScheme);
  94                 }
  95                 // optimization for data urls
  96                 if (characterArray[0] == 'd' && characterArray[1] == 'a' && characterArray[2] == 't' && characterArray[3] == 'a') {
  97                     _setSchemeTypeInFlags(&flags, kHasDataScheme);
  98                 }
  99             }
 100             // optimization for https urls
 101             else if (idx == 5 && characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p' && characterArray[3] == 's') {
 102                 _setSchemeTypeInFlags(&flags, kHasHttpsScheme);
 103             }
 104             break;
 105         } else if (!scheme_valid(ch)) {
 106             break;      // invalid scheme character -- no scheme
 107         }
 108     }
 109
 110     // Make sure we have an RFC-1808 compliant URL - that's either something without a scheme, or scheme:/(stuff) or scheme://(stuff)
 111     // Strictly speaking, RFC 1808 & 2396 bar "scheme:" (with nothing following the colon); however, common usage
 112     // expects this to be treated identically to "scheme://" - REW, 12/08/03
 113     if (!(flags & HAS_SCHEME)) {
 114         isCompliant = true;
 115     } else if (base_idx == string_length) {
 116         isCompliant = false;
 117     } else if (characterArray[base_idx] != '/') {
 118         isCompliant = false;
 119     } else {
 120         isCompliant = true;
 121     }
 122
 123     if (!isCompliant) {
 124         // Clear the fragment flag if it's been set
 125         if (flags & HAS_FRAGMENT) {
 126             flags &= (~HAS_FRAGMENT);
 127             string_length = cfStringLength;
 128         }
 129         (*theFlags) = flags;
 130         (*range) = (CFRange *)CFAllocatorAllocate(alloc, sizeof(CFRange), 0);
 131         (*range)->location = ranges[0].location;
 132         (*range)->length = ranges[0].length;
 133
 134         return;
 135     }
 136     // URL is 1808-compliant
 137     flags |= IS_DECOMPOSABLE;
 138
 139     // 3: parse the network location and login
 140     if (2 <= (string_length - base_idx) && '/' == characterArray[base_idx] && '/' == characterArray[base_idx+1]) {
 141         CFIndex base = 2 + base_idx, extent;
 142         for (idx = base; idx < string_length; idx++) {
 143             if ('/' == characterArray[idx] || '?' == characterArray[idx]) {
 144                 break;
 145             }
 146         }
 147         extent = idx;
 148
 149         // net_loc parts extend from base to extent (but not including), which might be to end of string
 150         // net location is "<user>:<password>@<host>:<port>"
 151         if (extent != base) {
 152             for (idx = base; idx < extent; idx++) {
 153                 if ('@' == characterArray[idx]) {   // there is a user
 154                     CFIndex idx2;
 155                     flags |= HAS_USER;
 156                     numRanges ++;
 157                     ranges[1].location = base;  // base of the user
 158                     for (idx2 = base; idx2 < idx; idx2++) {
 159                         if (':' == characterArray[idx2]) {      // found a password separator
 160                             flags |= HAS_PASSWORD;
 161                             numRanges ++;
 162                             ranges[2].location = idx2+1; // base of the password
 163                             ranges[2].length = idx-(idx2+1);  // password extent
 164                             ranges[1].length = idx2 - base; // user extent
 165                             break;
 166                         }
 167                     }
 168                     if (!(flags & HAS_PASSWORD)) {
 169                         // user extends to the '@'
 170                         ranges[1].length = idx - base; // user extent
 171                     }
 172                     base = idx + 1;
 173                     break;
 174                 }
 175             }
 176             flags |= HAS_HOST;
 177             numRanges ++;
 178             ranges[3].location = base; // base of host
 179
 180             // base has been advanced past the user and password if they existed
 181             for (idx = base; idx < extent; idx++) {
 182                 // IPV6 support (RFC 2732) DCJ June/10/2002
 183                 if ('[' == characterArray[idx]) {       // starting IPV6 explicit address
 184                     //  Find the ']' terminator of the IPv6 address, leave idx pointing to ']' or end
 185                     for ( ; idx < extent; ++ idx ) {
 186                         if ( ']' == characterArray[idx]) {
 187                             flags |= IS_IPV6_ENCODED;
 188                             break;
 189                         }
 190                     }
 191                 }
 192                 // there is a port if we see a colon.  Only the last one is the port, though.
 193                 else if ( ':' == characterArray[idx]) {
 194                     flags |= HAS_PORT;
 195                     numRanges ++;
 196                     ranges[4].location = idx+1; // base of port
 197                     ranges[4].length = extent - (idx+1); // port extent
 198                     ranges[3].length = idx - base; // host extent
 199                     break;
 200                 }
 201             }
 202             if (!(flags & HAS_PORT)) {
 203                 ranges[3].length = extent - base;  // host extent
 204             }
 205         }
 206         base_idx = extent;
 207     }
 208
 209     // 4: parse the query; remainder after left-most "?" is query
 210     for (idx = base_idx; idx < string_length; idx++) {
 211         if ('?' == characterArray[idx]) {
 212             flags |= HAS_QUERY;
 213             numRanges ++;
 214             ranges[7].location = idx + 1;
 215             ranges[7].length = string_length - (idx+1);
 216             string_length = idx;        // remove query from parse string
 217             break;
 218         }
 219     }
 220
 221     // 5: parse the parameters; remainder after left-most ";" is parameters
 222     for (idx = base_idx; idx < string_length; idx++) {
 223         if (';' == characterArray[idx]) {
 224             flags |= HAS_PARAMETERS;
 225             numRanges ++;
 226             ranges[6].location = idx + 1;
 227             ranges[6].length = string_length - (idx+1);
 228             string_length = idx;        // remove parameters from parse string
 229             break;
 230         }
 231     }
 232
 233     // 6: parse the path; it's whatever's left between string_length & base_idx
 234     if (string_length - base_idx != 0 || (flags & NET_LOCATION_MASK))
 235     {
 236         // If we have a net location, we are 1808-compliant, and an empty path substring implies a path of "/"
 237         UniChar ch;
 238         Boolean isDir;
 239         CFRange pathRg;
 240         flags |= HAS_PATH;
 241         numRanges ++;
 242         pathRg.location = base_idx;
 243         pathRg.length = string_length - base_idx;
 244         ranges[5] = pathRg;
 245
 246         if (pathRg.length > 0) {
 247             Boolean sawPercent = FALSE;
 248             for (idx = pathRg.location; idx < string_length; idx++) {
 249                 if ('%' == characterArray[idx]) {
 250                     sawPercent = TRUE;
 251                     break;
 252                 }
 253             }
 254 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI
 255             if (pathRg.length > 6 && characterArray[pathRg.location] == '/' && characterArray[pathRg.location + 1] == '.' && characterArray[pathRg.location + 2] == 'f' && characterArray[pathRg.location + 3] == 'i' && characterArray[pathRg.location + 4] == 'l' && characterArray[pathRg.location + 5] == 'e' && characterArray[pathRg.location + 6] == '/') {
 256                 flags |= PATH_HAS_FILE_ID;
 257             } else if (!sawPercent) {
 258                 flags |= POSIX_AND_URL_PATHS_MATCH;
 259             }
 260 #elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_WINDOWS
 261             if (!sawPercent) {
 262                 flags |= POSIX_AND_URL_PATHS_MATCH;
 263             }
 264 #endif
 265
 266             ch = characterArray[pathRg.location + pathRg.length - 1];
 267             if (ch == '/') {
 268                 isDir = true;
 269             } else if (ch == '.') {
 270                 if (pathRg.length == 1) {
 271                     isDir = true;
 272                 } else {
 273                     ch = characterArray[pathRg.location + pathRg.length - 2];
 274                     if (ch == '/') {
 275                         isDir = true;
 276                     } else if (ch != '.') {
 277                         isDir = false;
 278                     } else if (pathRg.length == 2) {
 279                         isDir = true;
 280                     } else {
 281                         isDir = (characterArray[pathRg.location + pathRg.length - 3] == '/');
 282                     }
 283                 }
 284             } else {
 285                 isDir = false;
 286             }
 287         } else {
 288             isDir = (baseURL != NULL) ? CFURLHasDirectoryPath(baseURL) : false;
 289         }
 290         if (isDir) {
 291             flags |= IS_DIRECTORY;
 292         }
 293     }
 294
 295     (*theFlags) = flags;
 296     (*range) = (CFRange *)CFAllocatorAllocate(alloc, sizeof(CFRange)*numRanges, 0);
 297     numRanges = 0;
 298     for (idx = 0, flags = 1; flags != (1<<9); flags = (flags<<1), idx ++) {
 299         if ((*theFlags) & flags) {
 300             (*range)[numRanges] = ranges[idx];
 301             numRanges ++;
 302         }
 303     }
 304 }
 305 #endif  // CFURL_INCLUDE_PARSE_COMPONENTS
 306
 307 /*
 308     static Boolean scanCharactersCString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const char *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
 309     static Boolean scanCharactersUString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const UniChar *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
 310  */
 311 #ifdef CFURL_INCLUDE_SCAN_CHARACTERS  // defined when we want this block of code included
 312 {
 313     CFIndex idx;
 314     Boolean sawIllegalChar = false;
 315     for (idx = base; idx < end; idx ++) {
 316         Boolean shouldEscape;
 317         UniChar ch = characterArray[idx];
 318         if (isURLLegalCharacter(ch)) {
 319             if ((componentFlag == HAS_USER || componentFlag == HAS_PASSWORD) && (ch == '/' || ch == '?' || ch == '@')) {
 320                 shouldEscape = true;
 321             } else {
 322                 shouldEscape = false;
 323             }
 324         } else if (ch == '%' && idx + 2 < end && isHexDigit(characterArray[idx + 1]) && isHexDigit(characterArray[idx+2])) {
 325             shouldEscape = false;
 326         } else if (componentFlag == HAS_HOST && ((idx == base && ch == '[') || (idx == end-1 && ch == ']'))) {
 327             shouldEscape = false;
 328         } else {
 329             shouldEscape = true;
 330         }
 331         if (shouldEscape) {
 332             sawIllegalChar = true;
 333             if (componentFlag && flags) {
 334                 *flags |= componentFlag;
 335             }
 336             if (!*escapedString) {
 337                 *escapedString = CFStringCreateMutable(alloc, 0);
 338             }
 339             if (useCString) {
 340                 CFStringRef tempString = CFStringCreateWithBytes(alloc, (uint8_t *)&(characterArray[*mark]), idx - *mark, kCFStringEncodingISOLatin1, false);
 341                 CFStringAppend(*escapedString, tempString);
 342                 CFRelease(tempString);
 343             } else {
 344                 CFStringAppendCharacters(*escapedString, (const UniChar *)&(characterArray[*mark]), idx - *mark);
 345             }
 346             *mark = idx + 1;
 347             _appendPercentEscapesForCharacter(ch, encoding, *escapedString); // This can never fail because anURL->_string was constructed from the encoding passed in
 348         }
 349     }
 350     return sawIllegalChar;
 351 }
 352 #endif  // CFURL_INCLUDE_SCAN_CHARACTERS