CFURL.inc.h

   1 /*
   2  * Copyright (c) 2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 /*      CFURL.inc.h
  25         Copyright (c) 2012-2014, Apple Inc. All rights reserved.
  26         Responsibility: Jim Luther/Chris Linn
  27 */
  28
  29
  30 /*
  31
  32  What's this file for?
  33
  34  CFURL's URL string parser needs to be able to parse either an array of char or an array of UniChar.
  35
  36  The code in CFURL.c used to use this macro "#define STRING_CHAR(x) (useCString ? cstring[(x)] : ustring[(x)])" to determine which array to get a character from for every character looked at in the URL string. That macro added one or more compare and branch instructions to the parser's execution for *every* character in the URL string. Those extra compares and branches added up to 10% of the time (for long URL strings) it takes to create a URL object.
  37
  38  To ensure the exact same parser code is run over a char or a UniChar string, the source code was moved to this .h file and is included multiple times by CFURL.c as needed. "STRING_CHAR(x)" was replaced by "characterArray[x]", and characterArray is defined as either a "const char *" or a "const UniChar *" for the two sets of function headers that are either parsing an array of char or an array of UniChar.
  39
  40  Any changes made to the parser are made in this file so that both char and the UniChar strings are parsed exactly the same way.
  41
  42  */
  43
  44 /*
  45     static void _parseComponentsCString(CFAllocatorRef alloc, CFURLRef baseURL, CFIndex cfStringLength, const char *characterArray, UInt32 *theFlags, CFRange *packedRanges, uint8_t *numberOfRanges)
  46  or
  47     static void _parseComponentsUString(CFAllocatorRef alloc, CFURLRef baseURL, CFIndex cfStringLength, const UniChar *characterArray, UInt32 *theFlags, CFRange *packedRanges, uint8_t *numberOfRanges)
  48  */
  49 #ifdef CFURL_INCLUDE_PARSE_COMPONENTS // defined when we want this block of code included
  50 {
  51     /* index gives the URL part involved; to calculate the correct range index, use the number of the bit of the equivalent flag (i.e. the host flag is HAS_HOST, which is 0x8.  so the range index for the host is 3.)  Note that this is true in this function ONLY, since the ranges stored in (*range) are actually packed, skipping those URL components that don't exist.  This is why the indices are hard-coded in this function. */
  52    enum {
  53         scheme_index        = 0,
  54         user_index          = 1,
  55         password_index      = 2,
  56         host_index          = 3,
  57         port_index          = 4,
  58         path_index          = 5,
  59         parameters_index    = 6,
  60         query_index         = 7,
  61         fragment_index      = 8,
  62     };
  63     CFRange unpackedRanges[MAX_COMPONENTS];
  64
  65     CFIndex idx, base_idx = 0;
  66     CFIndex string_length;
  67     UInt32 flags = *theFlags;
  68     Boolean isCompliant;
  69     uint8_t numRanges = 0;
  70
  71     string_length = cfStringLength;
  72
  73     // Algorithm is as described in RFC 1808
  74     // 1: parse the fragment; remainder after left-most "#" is fragment
  75     for (idx = base_idx; idx < string_length; idx++) {
  76         if ('#' == characterArray[idx]) {
  77             flags |= HAS_FRAGMENT;
  78             unpackedRanges[fragment_index].location = idx + 1;
  79             unpackedRanges[fragment_index].length = string_length - (idx + 1);
  80             numRanges ++;
  81             string_length = idx;        // remove fragment from parse string
  82             break;
  83         }
  84     }
  85     // 2: parse the scheme
  86     for (idx = base_idx; idx < string_length; idx++) {
  87         UniChar ch = characterArray[idx];
  88         if (':' == ch) {
  89             flags |= HAS_SCHEME;
  90             unpackedRanges[scheme_index].location = base_idx;
  91             unpackedRanges[scheme_index].length = idx;
  92             numRanges ++;
  93             base_idx = idx + 1;
  94             // optimization for ftp urls
  95             if (idx == 3 && characterArray[0] == 'f' && characterArray[1] == 't' && characterArray[2] == 'p') {
  96                 _setSchemeTypeInFlags(&flags, kHasFtpScheme);
  97             }
  98             else if (idx == 4) {
  99                 // optimization for http urls
 100                 if (characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p') {
 101                     _setSchemeTypeInFlags(&flags, kHasHttpScheme);
 102                 }
 103                 // optimization for file urls
 104                 if (characterArray[0] == 'f' && characterArray[1] == 'i' && characterArray[2] == 'l' && characterArray[3] == 'e') {
 105                     _setSchemeTypeInFlags(&flags, kHasFileScheme);
 106                 }
 107                 // optimization for data urls
 108                 if (characterArray[0] == 'd' && characterArray[1] == 'a' && characterArray[2] == 't' && characterArray[3] == 'a') {
 109                     _setSchemeTypeInFlags(&flags, kHasDataScheme);
 110                 }
 111             }
 112             // optimization for https urls
 113             else if (idx == 5 && characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p' && characterArray[3] == 's') {
 114                 _setSchemeTypeInFlags(&flags, kHasHttpsScheme);
 115             }
 116             break;
 117         } else if (!scheme_valid(ch)) {
 118             break;      // invalid scheme character -- no scheme
 119         }
 120     }
 121
 122     // Make sure we have an RFC-1808 compliant URL - that's either something without a scheme, or scheme:/(stuff) or scheme://(stuff)
 123     // Strictly speaking, RFC 1808 & 2396 bar "scheme:" (with nothing following the colon); however, common usage
 124     // expects this to be treated identically to "scheme://" - REW, 12/08/03
 125     if (!(flags & HAS_SCHEME)) {
 126         isCompliant = true;
 127     } else if (base_idx == string_length) {
 128         isCompliant = false;
 129     } else if (characterArray[base_idx] != '/') {
 130         isCompliant = false;
 131     } else {
 132         isCompliant = true;
 133     }
 134
 135     if (!isCompliant) {
 136         // Clear the fragment flag if it's been set
 137         if (flags & HAS_FRAGMENT) {
 138             flags &= (~HAS_FRAGMENT);
 139             string_length = cfStringLength;
 140         }
 141         (*theFlags) = flags;
 142         packedRanges[scheme_index].location = unpackedRanges[scheme_index].location;
 143         packedRanges[scheme_index].length = unpackedRanges[scheme_index].length;
 144         *numberOfRanges = 1;
 145     }
 146     else {
 147         // URL is 1808-compliant
 148         flags |= IS_DECOMPOSABLE;
 149
 150         // 3: parse the network location and login
 151         if (2 <= (string_length - base_idx) && '/' == characterArray[base_idx] && '/' == characterArray[base_idx+1]) {
 152             CFIndex base = 2 + base_idx, extent;
 153             for (idx = base; idx < string_length; idx++) {
 154                 if ('/' == characterArray[idx] || '?' == characterArray[idx]) {
 155                     break;
 156                 }
 157             }
 158             extent = idx;
 159
 160             // net_loc parts extend from base to extent (but not including), which might be to end of string
 161             // net location is "<user>:<password>@<host>:<port>"
 162             if (extent != base) {
 163                 for (idx = base; idx < extent; idx++) {
 164                     if ('@' == characterArray[idx]) {   // there is a user
 165                         CFIndex idx2;
 166                         flags |= HAS_USER;
 167                         numRanges ++;
 168                         unpackedRanges[user_index].location = base;  // base of the user
 169                         for (idx2 = base; idx2 < idx; idx2++) {
 170                             if (':' == characterArray[idx2]) {  // found a password separator
 171                                 flags |= HAS_PASSWORD;
 172                                 numRanges ++;
 173                                 unpackedRanges[password_index].location = idx2+1; // base of the password
 174                                 unpackedRanges[password_index].length = idx-(idx2+1);  // password extent
 175                                 unpackedRanges[user_index].length = idx2 - base; // user extent
 176                                 break;
 177                             }
 178                         }
 179                         if (!(flags & HAS_PASSWORD)) {
 180                             // user extends to the '@'
 181                             unpackedRanges[user_index].length = idx - base; // user extent
 182                         }
 183                         base = idx + 1;
 184                         break;
 185                     }
 186                 }
 187                 flags |= HAS_HOST;
 188                 numRanges ++;
 189                 unpackedRanges[host_index].location = base; // base of host
 190
 191                 // base has been advanced past the user and password if they existed
 192                 for (idx = base; idx < extent; idx++) {
 193                     // IPV6 support (RFC 2732) DCJ June/10/2002
 194                     if ('[' == characterArray[idx]) {   // starting IPV6 explicit address
 195                         //      Find the ']' terminator of the IPv6 address, leave idx pointing to ']' or end
 196                         for ( ; idx < extent; ++ idx ) {
 197                             if ( ']' == characterArray[idx]) {
 198                                 flags |= IS_IPV6_ENCODED;
 199                                 break;
 200                             }
 201                         }
 202                     }
 203                     // there is a port if we see a colon.  Only the last one is the port, though.
 204                     else if ( ':' == characterArray[idx]) {
 205                         flags |= HAS_PORT;
 206                         numRanges ++;
 207                         unpackedRanges[port_index].location = idx+1; // base of port
 208                         unpackedRanges[port_index].length = extent - (idx+1); // port extent
 209                         unpackedRanges[host_index].length = idx - base; // host extent
 210                         break;
 211                     }
 212                 }
 213                 if (!(flags & HAS_PORT)) {
 214                     unpackedRanges[host_index].length = extent - base;  // host extent
 215                 }
 216             }
 217             base_idx = extent;
 218         }
 219
 220         // 4: parse the query; remainder after left-most "?" is query
 221         for (idx = base_idx; idx < string_length; idx++) {
 222             if ('?' == characterArray[idx]) {
 223                 flags |= HAS_QUERY;
 224                 numRanges ++;
 225                 unpackedRanges[query_index].location = idx + 1;
 226                 unpackedRanges[query_index].length = string_length - (idx+1);
 227                 string_length = idx;    // remove query from parse string
 228                 break;
 229             }
 230         }
 231
 232         // 5: parse the parameters; remainder after left-most ";" is parameters
 233         for (idx = base_idx; idx < string_length; idx++) {
 234             if (';' == characterArray[idx]) {
 235                 flags |= HAS_PARAMETERS;
 236                 numRanges ++;
 237                 unpackedRanges[parameters_index].location = idx + 1;
 238                 unpackedRanges[parameters_index].length = string_length - (idx+1);
 239                 string_length = idx;    // remove parameters from parse string
 240                 break;
 241             }
 242         }
 243
 244         // 6: parse the path; it's whatever's left between string_length & base_idx
 245         if (string_length - base_idx != 0 || (flags & NET_LOCATION_MASK))
 246         {
 247             // If we have a net location, we are 1808-compliant, and an empty path substring implies a path of "/"
 248             UniChar ch;
 249             Boolean isDir;
 250             CFRange pathRg;
 251             flags |= HAS_PATH;
 252             numRanges ++;
 253             pathRg.location = base_idx;
 254             pathRg.length = string_length - base_idx;
 255             unpackedRanges[path_index] = pathRg;
 256
 257             if (pathRg.length > 0) {
 258                 Boolean sawPercent = FALSE;
 259                 for (idx = pathRg.location; idx < string_length; idx++) {
 260                     if ('%' == characterArray[idx]) {
 261                         sawPercent = TRUE;
 262                         break;
 263                     }
 264                 }
 265 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI
 266                 if (pathRg.length > 6 && characterArray[pathRg.location] == '/' && characterArray[pathRg.location + 1] == '.' && characterArray[pathRg.location + 2] == 'f' && characterArray[pathRg.location + 3] == 'i' && characterArray[pathRg.location + 4] == 'l' && characterArray[pathRg.location + 5] == 'e' && characterArray[pathRg.location + 6] == '/') {
 267                     flags |= PATH_HAS_FILE_ID;
 268                 } else if (!sawPercent) {
 269                     flags |= POSIX_AND_URL_PATHS_MATCH;
 270                 }
 271 #elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_WINDOWS
 272                 if (!sawPercent) {
 273                     flags |= POSIX_AND_URL_PATHS_MATCH;
 274                 }
 275 #endif
 276
 277                 ch = characterArray[pathRg.location + pathRg.length - 1];
 278                 if (ch == '/') {
 279                     isDir = true;
 280                 } else if (ch == '.') {
 281                     if (pathRg.length == 1) {
 282                         isDir = true;
 283                     } else {
 284                         ch = characterArray[pathRg.location + pathRg.length - 2];
 285                         if (ch == '/') {
 286                             isDir = true;
 287                         } else if (ch != '.') {
 288                             isDir = false;
 289                         } else if (pathRg.length == 2) {
 290                             isDir = true;
 291                         } else {
 292                             isDir = (characterArray[pathRg.location + pathRg.length - 3] == '/');
 293                         }
 294                     }
 295                 } else {
 296                     isDir = false;
 297                 }
 298             } else {
 299                 isDir = (baseURL != NULL) ? CFURLHasDirectoryPath(baseURL) : false;
 300             }
 301             if (isDir) {
 302                 flags |= IS_DIRECTORY;
 303             }
 304         }
 305
 306         (*theFlags) = flags;
 307         *numberOfRanges = numRanges;
 308         numRanges = 0;
 309         for (idx = 0, flags = 1; flags != (1<<9); flags = (flags<<1), idx ++) {
 310             if ((*theFlags) & flags) {
 311                 packedRanges[numRanges] = unpackedRanges[idx];
 312                 numRanges ++;
 313             }
 314         }
 315     }
 316 }
 317 #endif  // CFURL_INCLUDE_PARSE_COMPONENTS
 318
 319 /*
 320     static Boolean scanCharactersCString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const char *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
 321     static Boolean scanCharactersUString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const UniChar *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
 322  */
 323 #ifdef CFURL_INCLUDE_SCAN_CHARACTERS  // defined when we want this block of code included
 324 {
 325     CFIndex idx;
 326     Boolean sawIllegalChar = false;
 327     for (idx = base; idx < end; idx ++) {
 328         Boolean shouldEscape;
 329         UniChar ch = characterArray[idx];
 330         if (isURLLegalCharacter(ch)) {
 331             if ((componentFlag == HAS_USER || componentFlag == HAS_PASSWORD) && (ch == '/' || ch == '?' || ch == '@')) {
 332                 shouldEscape = true;
 333             } else {
 334                 shouldEscape = false;
 335             }
 336         } else if (ch == '%' && idx + 2 < end && isHexDigit(characterArray[idx + 1]) && isHexDigit(characterArray[idx+2])) {
 337             shouldEscape = false;
 338         } else if (componentFlag == HAS_HOST && ((idx == base && ch == '[') || (idx == end-1 && ch == ']'))) {
 339             shouldEscape = false;
 340         } else {
 341             shouldEscape = true;
 342         }
 343         if (shouldEscape) {
 344             sawIllegalChar = true;
 345             if (componentFlag && flags) {
 346                 *flags |= componentFlag;
 347             }
 348             if (!*escapedString) {
 349                 *escapedString = CFStringCreateMutable(alloc, 0);
 350             }
 351             if (useCString) {
 352                 CFStringRef tempString = CFStringCreateWithBytes(alloc, (uint8_t *)&(characterArray[*mark]), idx - *mark, kCFStringEncodingISOLatin1, false);
 353                 CFStringAppend(*escapedString, tempString);
 354                 CFRelease(tempString);
 355             } else {
 356                 CFStringAppendCharacters(*escapedString, (const UniChar *)&(characterArray[*mark]), idx - *mark);
 357             }
 358             *mark = idx + 1;
 359             _appendPercentEscapesForCharacter(ch, encoding, *escapedString); // This can never fail because anURL->_string was constructed from the encoding passed in
 360         }
 361     }
 362     return sawIllegalChar;
 363 }
 364 #endif  // CFURL_INCLUDE_SCAN_CHARACTERS