Parsing.subproj/CFXMLInputStream.c

   1 /*
   2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
   7  *
   8  * This file contains Original Code and/or Modifications of Original Code
   9  * as defined in and that are subject to the Apple Public Source License
  10  * Version 2.0 (the 'License'). You may not use this file except in
  11  * compliance with the License. Please obtain a copy of the License at
  12  * http://www.opensource.apple.com/apsl/ and read it before using this
  13  * file.
  14  *
  15  * The Original Code and all software distributed under the License are
  16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  20  * Please see the License for the specific language governing rights and
  21  * limitations under the License.
  22  *
  23  * @APPLE_LICENSE_HEADER_END@
  24  */
  25 /*      CFXMLInputStream.c
  26         Copyright 1999-2002, Apple, Inc. All rights reserved.
  27         Responsibility: Chris Parker
  28 */
  29
  30 #include <CoreFoundation/CFCharacterSet.h>
  31 #include <string.h>
  32 #include "CFStringEncodingConverter.h"
  33 #include "CFUniChar.h"
  34 #include "CFXMLInputStream.h"
  35
  36 /* Utility functions used in parsing */
  37 static Boolean determineEncoding(_CFXMLInputStream *stream) {
  38     const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
  39     UInt32 length = CFDataGetLength(stream->data);
  40     const uint8_t *idx = 0L, *end = 0L;
  41     const uint8_t *base = 0L;
  42     char quote = ' ';
  43     Boolean useUTF8 = false;
  44
  45     // Check for the byte order mark first
  46     if (length > 2) {
  47         // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
  48         if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
  49             stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
  50             if (*bytes == 0xFF) {
  51                 stream->currentByte = bytes + 2;
  52             }
  53             stream->encoding = kCFStringEncodingUnicode;
  54             return true;
  55         } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
  56             stream->flags |= ENCODING_IS_UNICODE_NATURAL;
  57             if (*bytes == 0xFE) {
  58                 stream->currentByte = bytes + 2;
  59             }
  60             stream->encoding = kCFStringEncodingUnicode;
  61             return true;
  62         } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
  63             if(*bytes == 0xEF) {
  64                 stream->currentByte = bytes + 3;
  65             }
  66             stream->encoding = kCFStringEncodingUTF8;
  67             stream->flags |= ENCODING_MATCHES_ASCII;
  68             return true;
  69         }
  70     }
  71     // Scan for the <?xml.... ?> opening
  72     if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
  73         useUTF8 = true;
  74     }
  75     if (!useUTF8) {
  76         idx = bytes + 5;
  77         end = bytes + length;
  78         // Found "<?xml"; now we scan for "encoding"
  79         while (idx < end) {
  80             uint8_t ch = *idx;
  81             const uint8_t *scan;
  82             if ( ch == '?' || ch == '>') {
  83                 useUTF8 = true;
  84                 break;
  85             }
  86             idx ++;
  87             scan = idx;
  88             if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
  89                 idx = scan;
  90                 break;
  91             }
  92         }
  93         if (!useUTF8 && idx >= end) {
  94             useUTF8 = true;
  95         }
  96     }
  97     if (!useUTF8) {
  98         // Found "encoding="; see if we've got an honest-to-goodness encoding name
  99         quote = *idx;
 100         if (quote != '\'' && quote != '\"') {
 101             useUTF8 = true;
 102         }
 103     }
 104     if (!useUTF8) {
 105         base = idx + 1; // Move past the quote character
 106         idx ++;
 107         while (idx < end && *idx != quote) idx ++;
 108         if (idx >= end) {
 109             useUTF8 = true;
 110         }
 111     }
 112     if (!useUTF8) {
 113         UInt32 len = idx - base;
 114         if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
 115             useUTF8 = true;
 116         } else {
 117             CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
 118             stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
 119             CFRelease(encodingName);
 120         }
 121     }
 122     if (useUTF8) {
 123         stream->encoding = kCFStringEncodingUTF8;
 124         stream->flags |= ENCODING_MATCHES_ASCII;
 125         return true;
 126     } else if (stream->encoding == kCFStringEncodingInvalidId) {
 127         return false;
 128     } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
 129         stream->flags |= ENCODING_MATCHES_ASCII;
 130     }
 131     return true;
 132 }
 133
 134 CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
 135     CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
 136     if (numChars) {
 137         CFStringAppendCharacters(string, characters, numChars);
 138     }
 139 }
 140
 141 __private_extern__ Boolean _openInputStream(_CFXMLInputStream *stream) {
 142     if (NULL == stream->data) {
 143         return false;
 144     } else {
 145         stream->currentByte = CFDataGetBytePtr(stream->data);
 146         if (determineEncoding(stream)) {
 147             stream->flags |= STREAM_OPEN;
 148             return true;
 149         } else {
 150             return false;
 151         }
 152     }
 153 }
 154
 155 __private_extern__ void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
 156     stream->data = xmlData ? CFRetain(xmlData) : NULL;
 157     stream->url = dataSource ? CFRetain(dataSource) : NULL;
 158     stream->encoding = kCFStringEncodingInvalidId;
 159     stream->currentByte = NULL;
 160
 161     stream->allocator = CFRetain(alloc);
 162     stream->charBuffer = NULL;
 163     stream->currentChar = NULL;
 164     stream->mark = NULL;
 165     stream->parserMark = NULL;
 166     stream->bufferLength = 0;
 167     stream->bufferCapacity = 0;
 168
 169     stream->charIndex = 1;
 170     stream->lineNum = 1;
 171
 172     stream->flags = 0;
 173     stream->nameSet = NULL;
 174     stream->tempString = NULL;
 175 }
 176
 177
 178 __private_extern__ void _freeInputStream(_CFXMLInputStream *stream) {
 179     if (stream->data) CFRelease(stream->data);
 180     if (stream->url) CFRelease(stream->url);
 181     if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
 182     if (stream->nameSet) CFRelease(stream->nameSet);
 183     if (stream->tempString) CFRelease(stream->tempString);
 184     CFRelease(stream->allocator);
 185 }
 186
 187 __private_extern__ CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
 188     return stream->encoding;
 189 }
 190
 191 __private_extern__ CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
 192     return stream->charIndex;
 193 }
 194
 195 __private_extern__ CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
 196     return stream->lineNum;
 197 }
 198
 199 __private_extern__ Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
 200     if (!(stream->flags & STREAM_OPEN)) return false;
 201     if (stream->currentChar) return false;
 202     if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
 203     return true;
 204 }
 205
 206 __private_extern__ Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
 207     return stream->flags & ENCODING_COMPOSITION_ERROR;
 208 }
 209
 210 #define INITIAL_BUFFER_SIZE 64
 211 static void growCharacterBuffer(_CFXMLInputStream *stream) {
 212     if (!stream->charBuffer) {
 213         stream->charBuffer = CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
 214         stream->bufferCapacity = INITIAL_BUFFER_SIZE;
 215     } else {
 216         CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
 217         CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
 218         CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
 219         UniChar *newBuffer = CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
 220         stream->bufferCapacity *= 2;
 221         if (newBuffer != stream->charBuffer) {
 222             stream->charBuffer = newBuffer;
 223             if (currCharDelta != -1) {
 224                 stream->currentChar = newBuffer + currCharDelta;
 225             }
 226             if (markDelta != -1) {
 227                 stream->mark = newBuffer + markDelta;
 228             }
 229             if (parserMarkDelta != -1) {
 230                 stream->parserMark = newBuffer + parserMarkDelta;
 231             }
 232         }
 233     }
 234 }
 235
 236 static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
 237     const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
 238     if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
 239         CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
 240         if (charsToTranslate > maxLength) {
 241             charsToTranslate = maxLength;
 242         }
 243         if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
 244             memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
 245             stream->currentByte += (charsToTranslate * sizeof(UniChar));
 246         } else {
 247             CFIndex i;
 248             uint8_t *baseBytePtr = (uint8_t *)base;
 249             for (i = 0; i < charsToTranslate; i ++) {
 250                 *(baseBytePtr + 1) = *stream->currentByte;
 251                 *baseBytePtr = *(stream->currentByte + 1);
 252                 baseBytePtr += 2;
 253                 stream->currentByte += 2;
 254             }
 255         }
 256         return charsToTranslate;
 257     } else {
 258         CFIndex lengthConsumed = 0, usedByteLength, usedCharLength;
 259         UInt32 conversionResult;
 260         if (stream->flags & ENCODING_MATCHES_ASCII) {
 261             while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
 262                 if (*stream->currentByte > 0x7f) break;
 263                 *base = *stream->currentByte;
 264                 base ++;
 265                 stream->currentByte ++;
 266                 lengthConsumed ++;
 267             }
 268             if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
 269                 return lengthConsumed;
 270             }
 271         }
 272         conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
 273         if(kCFStringEncodingConversionSuccess != conversionResult) {
 274             switch(conversionResult) {
 275                 case kCFStringEncodingConverterUnavailable:
 276                 case kCFStringEncodingInvalidInputStream:
 277                     stream->flags |= ENCODING_COMPOSITION_ERROR;
 278                     break;
 279                 case kCFStringEncodingInsufficientOutputBufferLength:
 280                 default:
 281                     break;
 282             }
 283         }
 284         if (usedByteLength > 0) {
 285             stream->currentByte += usedByteLength;
 286             lengthConsumed += usedCharLength;
 287         }
 288         return lengthConsumed;
 289     }
 290 }
 291
 292 // returns number of characters filled
 293 CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
 294     CFIndex numFilled;
 295     if (stream->bufferLength >= stream->bufferCapacity) return 0;
 296     // Try and fill in the remaining characters
 297     numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
 298     if (numFilled != 0) {
 299         stream->currentChar = stream->charBuffer + stream->bufferLength;
 300         stream->bufferLength += numFilled;
 301     }
 302     return numFilled;
 303 }
 304
 305 // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate.  Does not check for EOF; it is the caller's responsibility to verify this.
 306 static void fillCharacterBuffer(_CFXMLInputStream *stream) {
 307     if (!stream->charBuffer) {
 308         growCharacterBuffer(stream);
 309     }
 310     if (!stream->mark && !stream->parserMark) {
 311         // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
 312         CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
 313         stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
 314         CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
 315         stream->currentChar = stream->charBuffer;
 316     } else {
 317         // We do everything we can not to allocate; first we fill any remaining characters.  If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
 318         Boolean done;
 319
 320         // First try just filling the remaining capacity
 321         done = (fillToCapacity(stream) != 0);
 322         if (!done) {
 323             const UniChar *leftMostMark;
 324             if (stream->mark && !stream->parserMark) {
 325                 leftMostMark = stream->mark;
 326             } else if (stream->parserMark && !stream->mark) {
 327                 leftMostMark = stream->parserMark;
 328             } else if (stream->parserMark < stream->mark) {
 329                 leftMostMark = stream->parserMark;
 330             } else {
 331                 leftMostMark = stream->mark;
 332             }
 333             if (leftMostMark > stream->charBuffer) {
 334                 CFIndex delta = leftMostMark - stream->charBuffer;
 335                 memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
 336                 stream->bufferLength -= delta;
 337                 if (stream->mark) {
 338                     stream->mark -= delta;
 339                 }
 340                 if (stream->parserMark) {
 341                     stream->parserMark -= delta;
 342                 }
 343                 // Now try to fill the newly-opened space
 344                 done = (fillToCapacity(stream) != 0);
 345                 delta = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
 346             }
 347         }
 348         if (!done) {
 349             // No help for it; now we must allocate
 350             growCharacterBuffer(stream);
 351             fillToCapacity(stream); // If this doesn't work, we give up.
 352         }
 353     }
 354 }
 355
 356 /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time.  (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version.  Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
 357 */
 358 static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
 359     if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
 360         return false; // EOF
 361     } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
 362                (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
 363                 (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
 364         // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
 365         if (stream->flags & ENCODING_MATCHES_ASCII) {
 366             *ch = (UniChar)*(stream->currentByte);
 367             if (advanceStream) {
 368                 stream->currentByte ++;
 369             }
 370         } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
 371             *ch = (*stream->currentByte) << 8;
 372             *ch += *(stream->currentByte + 1);
 373             if (advanceStream) {
 374                 stream->currentByte += 2;
 375             }
 376         } else {
 377             // Unicode with swapped bytes
 378             *ch = (*(stream->currentByte + 1)) << 8;
 379             *ch += *stream->currentByte;
 380             if (advanceStream) {
 381                 stream->currentByte += 2;
 382             }
 383         }
 384     } else {
 385         fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
 386         if (!stream->charBuffer || !stream->currentChar) {
 387             return false;
 388         } else {
 389             *ch = *(stream->currentChar);
 390             if (advanceStream) {
 391                 stream->currentChar ++;
 392                 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
 393                     stream->currentChar = NULL;
 394                 }
 395             }
 396         }
 397     }
 398     return true;
 399 }
 400
 401 /* See comments above getCharacterGuts()
 402 */
 403 CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
 404     if (!(stream->flags & STREAM_OPEN)) {
 405         return false;
 406     } else if (stream->currentChar) {
 407         *ch = *stream->currentChar;
 408         if (advanceStream) {
 409             stream->currentChar ++;
 410             if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
 411                 stream->currentChar = NULL;
 412             }
 413         }
 414     } else {
 415         if (!getCharacterGuts(stream, ch, advanceStream)) return false;
 416     }
 417     if (advanceStream) {
 418         UniChar nextChar;
 419         stream->charIndex ++;
 420         if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
 421     }
 422     return true;
 423 }
 424
 425 __private_extern__ Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
 426     return getCharacter(stream, ch, false);
 427 }
 428
 429 __private_extern__ Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
 430     return getCharacter(stream, ch, true);
 431 }
 432
 433 __private_extern__ Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
 434     Boolean decrementLineNum = false;
 435     if (ch == '\n') {
 436         decrementLineNum = true;
 437     } else if (ch == '\r') {
 438         UniChar nextChar;
 439         if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
 440             decrementLineNum = true;
 441         }
 442     }
 443
 444     if (!(stream->flags & STREAM_OPEN)) {
 445         return false;
 446     } else if (stream->currentChar) {
 447         if (stream->currentChar != stream->charBuffer) {
 448             stream->currentChar --;
 449         } else {
 450             // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
 451             if (stream->bufferLength >= stream->bufferCapacity) {
 452                 growCharacterBuffer(stream);
 453             }
 454             memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
 455             *stream->charBuffer = ch;
 456             stream->bufferLength ++;
 457             if (stream->mark) {
 458                 stream->mark ++;
 459             }
 460             if (stream->parserMark) {
 461                 stream->parserMark ++;
 462             }
 463         }
 464     } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
 465         // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data.  That last character is the one being returned.
 466         stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
 467     } else if (stream->charBuffer) {
 468         // We have processed all the meaningful characters from charBuffer and have no reason to preserve them.  We use charBuffer to hold this one character that has been returned to us.
 469         *stream->charBuffer = ch;
 470         stream->currentChar = stream->charBuffer;
 471         stream->bufferLength = 1;
 472         if (stream->mark) {
 473             stream->mark ++;
 474         }
 475         if (stream->parserMark) {
 476             stream->parserMark ++;
 477         }
 478     } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
 479         // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character.  The former means we can just back up the byte pointer; the latter means Bad Things have happened.
 480         if (stream->flags & ENCODING_MATCHES_ASCII) {
 481             stream->currentByte --;
 482         } else {  // Must be Unicode
 483             stream->currentByte -= 2;
 484         }
 485     } else {
 486         return false;
 487     }
 488     stream->charIndex --;
 489     if (decrementLineNum) {
 490         stream->lineNum --;
 491     }
 492     return true;
 493 }
 494
 495 // Returns the pointer to hold as the mark
 496 static UniChar *dropMark(_CFXMLInputStream *stream) {
 497     if (stream->currentChar) {
 498         return stream->currentChar;
 499     } else if (stream->mark || stream->parserMark) {
 500         return stream->charBuffer + stream->bufferLength;
 501     } else {
 502         if (!stream->charBuffer) {
 503             growCharacterBuffer(stream);
 504         }
 505         stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
 506         return stream->charBuffer;
 507     }
 508
 509 }
 510
 511 __private_extern__ void _inputStreamSetMark(_CFXMLInputStream *stream) {
 512     CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
 513     stream->mark = dropMark(stream);
 514 }
 515
 516 __private_extern__ void _inputStreamClearMark(_CFXMLInputStream *stream) {
 517     CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
 518     stream->mark = NULL;
 519 }
 520
 521 __private_extern__ void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
 522     UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
 523     CFIndex numChars = end - stream->mark;
 524     CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
 525     _fillStringWithCharacters(string, stream->mark, numChars);
 526 }
 527
 528 static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
 529     UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
 530     if (end > mark) {
 531         CFIndex numChars = end - mark;
 532         stream->charIndex -= numChars;
 533         stream->currentChar = mark;
 534
 535         // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
 536         if (*(end - 1) == '\r') {
 537             UniChar nextChar;
 538             if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
 539                 end --;
 540             }
 541         }
 542         while (end != mark) {
 543             end --;
 544             if (*end == '\r') {
 545                 stream->lineNum --;
 546             } else if (*end == '\n') {
 547                 stream->lineNum --;
 548                 if (end != mark && *(end - 1) == '\r') {
 549                     end --;
 550                 }
 551             }
 552         }
 553     }
 554 }
 555
 556 __private_extern__ void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
 557     CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
 558     restoreToMark(stream, stream->mark);
 559 }
 560
 561 CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
 562     return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
 563 }
 564
 565 __private_extern__ CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
 566     UniChar ch;
 567     CFIndex len = 0;
 568     if (str) {
 569         stream->parserMark = dropMark(stream);
 570     }
 571     while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
 572         len ++;
 573     }
 574     if (!isWhitespaceChar(ch)) {
 575         _inputStreamReturnCharacter(stream, ch);
 576     }
 577     if (str) {
 578         _fillStringWithCharacters(str, stream->parserMark, len);
 579         stream->parserMark = NULL;
 580     }
 581     return len;
 582 }
 583
 584 // false return means EOF was encountered without finding scanChars
 585 __private_extern__ Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
 586     Boolean done = false;
 587     CFIndex firstRepeatIndex = -1;
 588     CFIndex len = 0;
 589     stream->parserMark = dropMark(stream);
 590     do {
 591         UniChar ch;
 592         while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
 593             len ++;
 594         }
 595         if (ch != scanChars[0]) {
 596             restoreToMark(stream, stream->parserMark);
 597             stream->parserMark = NULL;
 598             return false;
 599         } else {
 600             CFIndex i;
 601             for (i = 1; i < numChars; i ++) {
 602                 if (!_inputStreamGetCharacter(stream, &ch)) break;
 603                 if (ch != scanChars[i]) break;
 604             }
 605             if (i == numChars) {
 606                 done = true;
 607             } else {
 608                 if (firstRepeatIndex == -1) {
 609                     CFIndex j;
 610                     for (j = 1; j < numChars; j ++) {
 611                         if (scanChars[0] == scanChars[j]) {
 612                             break;
 613                         }
 614                     }
 615                     firstRepeatIndex = j;
 616                 }
 617                 _inputStreamReturnCharacter(stream, ch);
 618                 while (i > firstRepeatIndex) {
 619                     i --;
 620                     _inputStreamReturnCharacter(stream, scanChars[i]);
 621                 }
 622                 len += i;
 623             }
 624         }
 625     } while (!done);
 626     if (str) {
 627         _fillStringWithCharacters(str, stream->parserMark, len);
 628     }
 629     stream->parserMark = NULL;
 630     return true;
 631 }
 632
 633 __private_extern__ Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
 634     const UniChar *end = stringToMatch+length;
 635     const UniChar *sPtr=stringToMatch;
 636     stream->parserMark = dropMark(stream);
 637     while (sPtr < end) {
 638         UniChar ch;
 639         if (!_inputStreamGetCharacter(stream, &ch)) break;
 640         if (ch != *sPtr) break;
 641         sPtr ++;
 642     }
 643     if (sPtr != end) {
 644         restoreToMark(stream, stream->parserMark);
 645         stream->parserMark = NULL;
 646         return false;
 647     } else {
 648         stream->parserMark = NULL;
 649         return true;
 650     }
 651 }
 652
 653 __private_extern__ Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
 654     UniChar ch;
 655     if (!_inputStreamPeekCharacter(stream, &ch)) return false;
 656     if (ch != '\'' && ch != '\"')  return false;
 657
 658     _inputStreamGetCharacter(stream, &ch);
 659     if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
 660         return false;
 661     }
 662     return true;
 663 }
 664
 665 /*
 666  [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
 667  [5]  Name ::= (Letter | '_' | ':') (NameChar)*
 668  [7]  Nmtoken ::= (NameChar)+
 669  [84] Letter ::= BaseChar | Ideographic
 670
 671  We don't do this quite right; we rely on the Unicode charsets to do this analysis.  While
 672  the productions in the XML spec are based on the Unicode character sets, the definitions
 673  differ slightly to avoid those areas where the Unicode standard is still being resolved.
 674  At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
 675  the vast majority of parsers out there.
 676
 677  Letter == kCFUniCharLetterCharacterSet
 678  Digit == kCFUniCharDecimalDigitCharacterSet
 679  CombiningChar == kCFUniCharNonBaseCharacterSet
 680  Extender - complex, and not represented by a uniform character set.
 681  */
 682 __private_extern__ Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
 683     UniChar ch;
 684     Boolean success = true;
 685     stream->parserMark = dropMark(stream);
 686     if (!isNMToken) {
 687         // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
 688         if (!getCharacter(stream, &ch, false)) {
 689             success = false;
 690         } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
 691             success = false;
 692         } else {
 693             getCharacter(stream, &ch, true);
 694         }
 695     }
 696     if (success) {
 697         while (getCharacter(stream, &ch, true)) {
 698             if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet)  && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
 699                 _inputStreamReturnCharacter(stream, ch);
 700                 break;
 701             }
 702         }
 703         if (stream->currentChar == stream->parserMark) {
 704             success = false; // Must have processed at least one character
 705         }
 706     }
 707     if (success) {
 708         if (str) {
 709             if (!stream->nameSet) {
 710                 stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
 711                 stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
 712             }
 713             CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
 714             if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
 715                 *str = CFStringCreateCopy(stream->allocator, stream->tempString);
 716                 CFSetAddValue(stream->nameSet, *str);
 717                 CFRelease(*str);
 718             }
 719         }
 720     } else {
 721         restoreToMark(stream, stream->parserMark);
 722     }
 723     stream->parserMark = NULL;
 724     return success;
 725 }
 726