Parsing.subproj/CFXMLInputStream.c

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*      CFXMLInputStream.c
  24         Copyright 1999-2002, Apple, Inc. All rights reserved.
  25         Responsibility: Chris Parker
  26 */
  27
  28 #include <CoreFoundation/CFCharacterSet.h>
  29 #include <CoreFoundation/CFURLAccess.h>
  30 #include <string.h>
  31 #include "CFStringEncodingConverter.h"
  32 #include "CFUniChar.h"
  33 #include "CFXMLInputStream.h"
  34
  35 /* Utility functions used in parsing */
  36 static Boolean determineEncoding(_CFXMLInputStream *stream) {
  37     const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
  38     UInt32 length = CFDataGetLength(stream->data);
  39     const uint8_t *idx = 0L, *end = 0L;
  40     const uint8_t *base = 0L;
  41     char quote = ' ';
  42     Boolean useUTF8 = false;
  43
  44     // Check for the byte order mark first
  45     if (length > 2) {
  46         // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
  47         if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
  48             stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
  49             if (*bytes == 0xFF) {
  50                 stream->currentByte = bytes + 2;
  51             }
  52             stream->encoding = kCFStringEncodingUnicode;
  53             return true;
  54         } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
  55             stream->flags |= ENCODING_IS_UNICODE_NATURAL;
  56             if (*bytes == 0xFE) {
  57                 stream->currentByte = bytes + 2;
  58             }
  59             stream->encoding = kCFStringEncodingUnicode;
  60             return true;
  61         } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
  62             if(*bytes == 0xEF) {
  63                 stream->currentByte = bytes + 3;
  64             }
  65             stream->encoding = kCFStringEncodingUTF8;
  66             stream->flags |= ENCODING_MATCHES_ASCII;
  67             return true;
  68         }
  69     }
  70     // Scan for the <?xml.... ?> opening
  71     if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
  72         useUTF8 = true;
  73     }
  74     if (!useUTF8) {
  75         idx = bytes + 5;
  76         end = bytes + length;
  77         // Found "<?xml"; now we scan for "encoding"
  78         while (idx < end) {
  79             uint8_t ch = *idx;
  80             const uint8_t *scan;
  81             if ( ch == '?' || ch == '>') {
  82                 useUTF8 = true;
  83                 break;
  84             }
  85             idx ++;
  86             scan = idx;
  87             if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
  88                 idx = scan;
  89                 break;
  90             }
  91         }
  92         if (!useUTF8 && idx >= end) {
  93             useUTF8 = true;
  94         }
  95     }
  96     if (!useUTF8) {
  97         // Found "encoding="; see if we've got an honest-to-goodness encoding name
  98         quote = *idx;
  99         if (quote != '\'' && quote != '\"') {
 100             useUTF8 = true;
 101         }
 102     }
 103     if (!useUTF8) {
 104         base = idx + 1; // Move past the quote character
 105         idx ++;
 106         while (idx < end && *idx != quote) idx ++;
 107         if (idx >= end) {
 108             useUTF8 = true;
 109         }
 110     }
 111     if (!useUTF8) {
 112         UInt32 len = idx - base;
 113         if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
 114             useUTF8 = true;
 115         } else {
 116             CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
 117             stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
 118             CFRelease(encodingName);
 119         }
 120     }
 121     if (useUTF8) {
 122         stream->encoding = kCFStringEncodingUTF8;
 123         stream->flags |= ENCODING_MATCHES_ASCII;
 124         return true;
 125     } else if (stream->encoding == kCFStringEncodingInvalidId) {
 126         return false;
 127     } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
 128         stream->flags |= ENCODING_MATCHES_ASCII;
 129     }
 130     return true;
 131 }
 132
 133 CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
 134     CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
 135     if (numChars) {
 136         CFStringAppendCharacters(string, characters, numChars);
 137     }
 138 }
 139
 140 __private_extern__ Boolean _openInputStream(_CFXMLInputStream *stream) {
 141     if (NULL == stream->data && NULL != stream->url) {
 142         CFDataRef data = NULL;
 143         if (CFURLCreateDataAndPropertiesFromResource(stream->allocator, stream->url, &data, NULL, NULL, NULL)) {
 144             stream->data = data;
 145         }
 146     }
 147     if (NULL == stream->data) {
 148         return false;
 149     } else {
 150         stream->currentByte = CFDataGetBytePtr(stream->data);
 151         if (determineEncoding(stream)) {
 152             stream->flags |= STREAM_OPEN;
 153             return true;
 154         } else {
 155             return false;
 156         }
 157     }
 158 }
 159
 160 __private_extern__ void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
 161     stream->data = xmlData ? CFRetain(xmlData) : NULL;
 162     stream->url = dataSource ? CFRetain(dataSource) : NULL;
 163     stream->encoding = kCFStringEncodingInvalidId;
 164     stream->currentByte = NULL;
 165
 166     stream->allocator = CFRetain(alloc);
 167     stream->charBuffer = NULL;
 168     stream->currentChar = NULL;
 169     stream->mark = NULL;
 170     stream->parserMark = NULL;
 171     stream->bufferLength = 0;
 172     stream->bufferCapacity = 0;
 173
 174     stream->charIndex = 1;
 175     stream->lineNum = 1;
 176
 177     stream->flags = 0;
 178     stream->nameSet = NULL;
 179     stream->tempString = NULL;
 180 }
 181
 182
 183 __private_extern__ void _freeInputStream(_CFXMLInputStream *stream) {
 184     if (stream->data) CFRelease(stream->data);
 185     if (stream->url) CFRelease(stream->url);
 186     if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
 187     if (stream->nameSet) CFRelease(stream->nameSet);
 188     if (stream->tempString) CFRelease(stream->tempString);
 189     CFRelease(stream->allocator);
 190 }
 191
 192 __private_extern__ CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
 193     return stream->encoding;
 194 }
 195
 196 __private_extern__ CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
 197     return stream->charIndex;
 198 }
 199
 200 __private_extern__ CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
 201     return stream->lineNum;
 202 }
 203
 204 __private_extern__ Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
 205     if (!(stream->flags & STREAM_OPEN)) return false;
 206     if (stream->currentChar) return false;
 207     if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
 208     return true;
 209 }
 210
 211 __private_extern__ Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
 212     return stream->flags & ENCODING_COMPOSITION_ERROR;
 213 }
 214
 215 #define INITIAL_BUFFER_SIZE 64
 216 static void growCharacterBuffer(_CFXMLInputStream *stream) {
 217     if (!stream->charBuffer) {
 218         stream->charBuffer = CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
 219         stream->bufferCapacity = INITIAL_BUFFER_SIZE;
 220     } else {
 221         CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
 222         CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
 223         CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
 224         UniChar *newBuffer = CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
 225         stream->bufferCapacity *= 2;
 226         if (newBuffer != stream->charBuffer) {
 227             stream->charBuffer = newBuffer;
 228             if (currCharDelta != -1) {
 229                 stream->currentChar = newBuffer + currCharDelta;
 230             }
 231             if (markDelta != -1) {
 232                 stream->mark = newBuffer + markDelta;
 233             }
 234             if (parserMarkDelta != -1) {
 235                 stream->parserMark = newBuffer + parserMarkDelta;
 236             }
 237         }
 238     }
 239 }
 240
 241 static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
 242     const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
 243     if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
 244         CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
 245         if (charsToTranslate > maxLength) {
 246             charsToTranslate = maxLength;
 247         }
 248         if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
 249             memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
 250             stream->currentByte += (charsToTranslate * sizeof(UniChar));
 251         } else {
 252             CFIndex i;
 253             uint8_t *baseBytePtr = (uint8_t *)base;
 254             for (i = 0; i < charsToTranslate; i ++) {
 255                 *(baseBytePtr + 1) = *stream->currentByte;
 256                 *baseBytePtr = *(stream->currentByte + 1);
 257                 baseBytePtr += 2;
 258                 stream->currentByte += 2;
 259             }
 260         }
 261         return charsToTranslate;
 262     } else {
 263         CFIndex lengthConsumed = 0, usedByteLength, usedCharLength;
 264         UInt32 conversionResult;
 265         if (stream->flags & ENCODING_MATCHES_ASCII) {
 266             while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
 267                 if (*stream->currentByte > 0x7f) break;
 268                 *base = *stream->currentByte;
 269                 base ++;
 270                 stream->currentByte ++;
 271                 lengthConsumed ++;
 272             }
 273             if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
 274                 return lengthConsumed;
 275             }
 276         }
 277         conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
 278         if(kCFStringEncodingConversionSuccess != conversionResult) {
 279             switch(conversionResult) {
 280                 case kCFStringEncodingConverterUnavailable:
 281                 case kCFStringEncodingInvalidInputStream:
 282                     stream->flags |= ENCODING_COMPOSITION_ERROR;
 283                     break;
 284                 case kCFStringEncodingInsufficientOutputBufferLength:
 285                 default:
 286                     break;
 287             }
 288         }
 289         if (usedByteLength > 0) {
 290             stream->currentByte += usedByteLength;
 291             lengthConsumed += usedCharLength;
 292         }
 293         return lengthConsumed;
 294     }
 295 }
 296
 297 // returns number of characters filled
 298 CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
 299     CFIndex numFilled;
 300     if (stream->bufferLength >= stream->bufferCapacity) return 0;
 301     // Try and fill in the remaining characters
 302     numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
 303     if (numFilled != 0) {
 304         stream->currentChar = stream->charBuffer + stream->bufferLength;
 305         stream->bufferLength += numFilled;
 306     }
 307     return numFilled;
 308 }
 309
 310 // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate.  Does not check for EOF; it is the caller's responsibility to verify this.
 311 static void fillCharacterBuffer(_CFXMLInputStream *stream) {
 312     if (!stream->charBuffer) {
 313         growCharacterBuffer(stream);
 314     }
 315     if (!stream->mark && !stream->parserMark) {
 316         // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
 317         CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
 318         stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
 319         CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
 320         stream->currentChar = stream->charBuffer;
 321     } else {
 322         // We do everything we can not to allocate; first we fill any remaining characters.  If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
 323         Boolean done;
 324
 325         // First try just filling the remaining capacity
 326         done = (fillToCapacity(stream) != 0);
 327         if (!done) {
 328             const UniChar *leftMostMark;
 329             if (stream->mark && !stream->parserMark) {
 330                 leftMostMark = stream->mark;
 331             } else if (stream->parserMark && !stream->mark) {
 332                 leftMostMark = stream->parserMark;
 333             } else if (stream->parserMark < stream->mark) {
 334                 leftMostMark = stream->parserMark;
 335             } else {
 336                 leftMostMark = stream->mark;
 337             }
 338             if (leftMostMark > stream->charBuffer) {
 339                 CFIndex delta = leftMostMark - stream->charBuffer;
 340                 memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
 341                 stream->bufferLength -= delta;
 342                 if (stream->mark) {
 343                     stream->mark -= delta;
 344                 }
 345                 if (stream->parserMark) {
 346                     stream->parserMark -= delta;
 347                 }
 348                 // Now try to fill the newly-opened space
 349                 done = (fillToCapacity(stream) != 0);
 350                 delta = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
 351             }
 352         }
 353         if (!done) {
 354             // No help for it; now we must allocate
 355             growCharacterBuffer(stream);
 356             fillToCapacity(stream); // If this doesn't work, we give up.
 357         }
 358     }
 359 }
 360
 361 /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time.  (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version.  Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
 362 */
 363 static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
 364     if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
 365         return false; // EOF
 366     } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
 367                (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
 368                 (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
 369         // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
 370         if (stream->flags & ENCODING_MATCHES_ASCII) {
 371             *ch = (UniChar)*(stream->currentByte);
 372             if (advanceStream) {
 373                 stream->currentByte ++;
 374             }
 375         } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
 376             *ch = (*stream->currentByte) << 8;
 377             *ch += *(stream->currentByte + 1);
 378             if (advanceStream) {
 379                 stream->currentByte += 2;
 380             }
 381         } else {
 382             // Unicode with swapped bytes
 383             *ch = (*(stream->currentByte + 1)) << 8;
 384             *ch += *stream->currentByte;
 385             if (advanceStream) {
 386                 stream->currentByte += 2;
 387             }
 388         }
 389     } else {
 390         fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
 391         if (!stream->charBuffer || !stream->currentChar) {
 392             return false;
 393         } else {
 394             *ch = *(stream->currentChar);
 395             if (advanceStream) {
 396                 stream->currentChar ++;
 397                 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
 398                     stream->currentChar = NULL;
 399                 }
 400             }
 401         }
 402     }
 403     return true;
 404 }
 405
 406 /* See comments above getCharacterGuts()
 407 */
 408 CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
 409     if (!(stream->flags & STREAM_OPEN)) {
 410         return false;
 411     } else if (stream->currentChar) {
 412         *ch = *stream->currentChar;
 413         if (advanceStream) {
 414             stream->currentChar ++;
 415             if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
 416                 stream->currentChar = NULL;
 417             }
 418         }
 419     } else {
 420         if (!getCharacterGuts(stream, ch, advanceStream)) return false;
 421     }
 422     if (advanceStream) {
 423         UniChar nextChar;
 424         stream->charIndex ++;
 425         if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
 426     }
 427     return true;
 428 }
 429
 430 __private_extern__ Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
 431     return getCharacter(stream, ch, false);
 432 }
 433
 434 __private_extern__ Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
 435     return getCharacter(stream, ch, true);
 436 }
 437
 438 __private_extern__ Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
 439     Boolean decrementLineNum = false;
 440     if (ch == '\n') {
 441         decrementLineNum = true;
 442     } else if (ch == '\r') {
 443         UniChar nextChar;
 444         if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
 445             decrementLineNum = true;
 446         }
 447     }
 448
 449     if (!(stream->flags & STREAM_OPEN)) {
 450         return false;
 451     } else if (stream->currentChar) {
 452         if (stream->currentChar != stream->charBuffer) {
 453             stream->currentChar --;
 454         } else {
 455             // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
 456             if (stream->bufferLength >= stream->bufferCapacity) {
 457                 growCharacterBuffer(stream);
 458             }
 459             memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
 460             *stream->charBuffer = ch;
 461             stream->bufferLength ++;
 462             if (stream->mark) {
 463                 stream->mark ++;
 464             }
 465             if (stream->parserMark) {
 466                 stream->parserMark ++;
 467             }
 468         }
 469     } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
 470         // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data.  That last character is the one being returned.
 471         stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
 472     } else if (stream->charBuffer) {
 473         // We have processed all the meaningful characters from charBuffer and have no reason to preserve them.  We use charBuffer to hold this one character that has been returned to us.
 474         *stream->charBuffer = ch;
 475         stream->currentChar = stream->charBuffer;
 476         stream->bufferLength = 1;
 477         if (stream->mark) {
 478             stream->mark ++;
 479         }
 480         if (stream->parserMark) {
 481             stream->parserMark ++;
 482         }
 483     } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
 484         // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character.  The former means we can just back up the byte pointer; the latter means Bad Things have happened.
 485         if (stream->flags & ENCODING_MATCHES_ASCII) {
 486             stream->currentByte --;
 487         } else {  // Must be Unicode
 488             stream->currentByte -= 2;
 489         }
 490     } else {
 491         return false;
 492     }
 493     stream->charIndex --;
 494     if (decrementLineNum) {
 495         stream->lineNum --;
 496     }
 497     return true;
 498 }
 499
 500 // Returns the pointer to hold as the mark
 501 static UniChar *dropMark(_CFXMLInputStream *stream) {
 502     if (stream->currentChar) {
 503         return stream->currentChar;
 504     } else if (stream->mark || stream->parserMark) {
 505         return stream->charBuffer + stream->bufferLength;
 506     } else {
 507         if (!stream->charBuffer) {
 508             growCharacterBuffer(stream);
 509         }
 510         stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
 511         return stream->charBuffer;
 512     }
 513
 514 }
 515
 516 __private_extern__ void _inputStreamSetMark(_CFXMLInputStream *stream) {
 517     CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
 518     stream->mark = dropMark(stream);
 519 }
 520
 521 __private_extern__ void _inputStreamClearMark(_CFXMLInputStream *stream) {
 522     CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
 523     stream->mark = NULL;
 524 }
 525
 526 __private_extern__ void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
 527     UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
 528     CFIndex numChars = end - stream->mark;
 529     CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
 530     _fillStringWithCharacters(string, stream->mark, numChars);
 531 }
 532
 533 static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
 534     UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
 535     if (end > mark) {
 536         CFIndex numChars = end - mark;
 537         stream->charIndex -= numChars;
 538         stream->currentChar = mark;
 539
 540         // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
 541         if (*(end - 1) == '\r') {
 542             UniChar nextChar;
 543             if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
 544                 end --;
 545             }
 546         }
 547         while (end != mark) {
 548             end --;
 549             if (*end == '\r') {
 550                 stream->lineNum --;
 551             } else if (*end == '\n') {
 552                 stream->lineNum --;
 553                 if (end != mark && *(end - 1) == '\r') {
 554                     end --;
 555                 }
 556             }
 557         }
 558     }
 559 }
 560
 561 __private_extern__ void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
 562     CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
 563     restoreToMark(stream, stream->mark);
 564 }
 565
 566 CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
 567     return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
 568 }
 569
 570 __private_extern__ CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
 571     UniChar ch;
 572     CFIndex len = 0;
 573     if (str) {
 574         stream->parserMark = dropMark(stream);
 575     }
 576     while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
 577         len ++;
 578     }
 579     if (!isWhitespaceChar(ch)) {
 580         _inputStreamReturnCharacter(stream, ch);
 581     }
 582     if (str) {
 583         _fillStringWithCharacters(str, stream->parserMark, len);
 584         stream->parserMark = NULL;
 585     }
 586     return len;
 587 }
 588
 589 // false return means EOF was encountered without finding scanChars
 590 __private_extern__ Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
 591     Boolean done = false;
 592     CFIndex firstRepeatIndex = -1;
 593     CFIndex len = 0;
 594     stream->parserMark = dropMark(stream);
 595     do {
 596         UniChar ch;
 597         while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
 598             len ++;
 599         }
 600         if (ch != scanChars[0]) {
 601             restoreToMark(stream, stream->parserMark);
 602             stream->parserMark = NULL;
 603             return false;
 604         } else {
 605             CFIndex i;
 606             for (i = 1; i < numChars; i ++) {
 607                 if (!_inputStreamGetCharacter(stream, &ch)) break;
 608                 if (ch != scanChars[i]) break;
 609             }
 610             if (i == numChars) {
 611                 done = true;
 612             } else {
 613                 if (firstRepeatIndex == -1) {
 614                     CFIndex j;
 615                     for (j = 1; j < numChars; j ++) {
 616                         if (scanChars[0] == scanChars[j]) {
 617                             break;
 618                         }
 619                     }
 620                     firstRepeatIndex = j;
 621                 }
 622                 _inputStreamReturnCharacter(stream, ch);
 623                 while (i > firstRepeatIndex) {
 624                     i --;
 625                     _inputStreamReturnCharacter(stream, scanChars[i]);
 626                 }
 627                 len += i;
 628             }
 629         }
 630     } while (!done);
 631     if (str) {
 632         _fillStringWithCharacters(str, stream->parserMark, len);
 633     }
 634     stream->parserMark = NULL;
 635     return true;
 636 }
 637
 638 __private_extern__ Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
 639     const UniChar *end = stringToMatch+length;
 640     const UniChar *sPtr=stringToMatch;
 641     stream->parserMark = dropMark(stream);
 642     while (sPtr < end) {
 643         UniChar ch;
 644         if (!_inputStreamGetCharacter(stream, &ch)) break;
 645         if (ch != *sPtr) break;
 646         sPtr ++;
 647     }
 648     if (sPtr != end) {
 649         restoreToMark(stream, stream->parserMark);
 650         stream->parserMark = NULL;
 651         return false;
 652     } else {
 653         stream->parserMark = NULL;
 654         return true;
 655     }
 656 }
 657
 658 __private_extern__ Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
 659     UniChar ch;
 660     if (!_inputStreamPeekCharacter(stream, &ch)) return false;
 661     if (ch != '\'' && ch != '\"')  return false;
 662
 663     _inputStreamGetCharacter(stream, &ch);
 664     if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
 665         return false;
 666     }
 667     return true;
 668 }
 669
 670 /*
 671  [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
 672  [5]  Name ::= (Letter | '_' | ':') (NameChar)*
 673  [7]  Nmtoken ::= (NameChar)+
 674  [84] Letter ::= BaseChar | Ideographic
 675
 676  We don't do this quite right; we rely on the Unicode charsets to do this analysis.  While
 677  the productions in the XML spec are based on the Unicode character sets, the definitions
 678  differ slightly to avoid those areas where the Unicode standard is still being resolved.
 679  At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
 680  the vast majority of parsers out there.
 681
 682  Letter == kCFUniCharLetterCharacterSet
 683  Digit == kCFUniCharDecimalDigitCharacterSet
 684  CombiningChar == kCFUniCharNonBaseCharacterSet
 685  Extender - complex, and not represented by a uniform character set.
 686  */
 687 __private_extern__ Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
 688     UniChar ch;
 689     Boolean success = true;
 690     stream->parserMark = dropMark(stream);
 691     if (!isNMToken) {
 692         // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
 693         if (!getCharacter(stream, &ch, false)) {
 694             success = false;
 695         } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
 696             success = false;
 697         } else {
 698             getCharacter(stream, &ch, true);
 699         }
 700     }
 701     if (success) {
 702         while (getCharacter(stream, &ch, true)) {
 703             if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet)  && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
 704                 _inputStreamReturnCharacter(stream, ch);
 705                 break;
 706             }
 707         }
 708         if (stream->currentChar == stream->parserMark) {
 709             success = false; // Must have processed at least one character
 710         }
 711     }
 712     if (success) {
 713         if (str) {
 714             if (!stream->nameSet) {
 715                 stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
 716                 stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
 717             }
 718             CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
 719             if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
 720                 *str = CFStringCreateCopy(stream->allocator, stream->tempString);
 721                 CFSetAddValue(stream->nameSet, *str);
 722                 CFRelease(*str);
 723             }
 724         }
 725     } else {
 726         restoreToMark(stream, stream->parserMark);
 727     }
 728     stream->parserMark = NULL;
 729     return success;
 730 }
 731