CFXMLInputStream.c

   1 /*
   2  * Copyright (c) 2012 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 /*      CFXMLInputStream.c
  25         Copyright (c) 1999-2011, Apple Inc. All rights reserved.
  26         Responsibility: David Smith
  27 */
  28
  29 #include "CFXMLInputStream.h"
  30 #include <CoreFoundation/CFCharacterSet.h>
  31 #include <string.h>
  32 #include "CFStringEncodingConverter.h"
  33 #include "CFUniChar.h"
  34
  35 /* Utility functions used in parsing */
  36 static Boolean determineEncoding(_CFXMLInputStream *stream) {
  37     const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
  38     UInt32 length = CFDataGetLength(stream->data);
  39     const uint8_t *idx = 0L, *end = 0L;
  40     const uint8_t *base = 0L;
  41     char quote = ' ';
  42     Boolean useUTF8 = false;
  43
  44     // Check for the byte order mark first
  45     if (length > 2) {
  46         // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
  47         if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
  48 #if __BIG_ENDIAN__
  49             stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
  50 #else
  51             stream->flags |= ENCODING_IS_UNICODE_NATURAL;
  52 #endif
  53             if (*bytes == 0xFF) {
  54                 stream->currentByte = bytes + 2;
  55             }
  56             stream->encoding = kCFStringEncodingUnicode;
  57             return true;
  58         } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
  59 #if __BIG_ENDIAN__
  60             stream->flags |= ENCODING_IS_UNICODE_NATURAL;
  61 #else
  62             stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
  63 #endif
  64             if (*bytes == 0xFE) {
  65                 stream->currentByte = bytes + 2;
  66             }
  67             stream->encoding = kCFStringEncodingUnicode;
  68             return true;
  69         } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
  70             if(*bytes == 0xEF) {
  71                 stream->currentByte = bytes + 3;
  72             }
  73             stream->encoding = kCFStringEncodingUTF8;
  74             stream->flags |= ENCODING_MATCHES_ASCII;
  75             return true;
  76         }
  77     }
  78     // Scan for the <?xml.... ?> opening
  79     if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
  80         useUTF8 = true;
  81     }
  82     if (!useUTF8) {
  83         idx = bytes + 5;
  84         end = bytes + length;
  85         // Found "<?xml"; now we scan for "encoding"
  86         while (idx < end) {
  87             uint8_t ch = *idx;
  88             const uint8_t *scan;
  89             if ( ch == '?' || ch == '>') {
  90                 useUTF8 = true;
  91                 break;
  92             }
  93             idx ++;
  94             scan = idx;
  95             if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
  96                 idx = scan;
  97                 break;
  98             }
  99         }
 100         if (!useUTF8 && idx >= end) {
 101             useUTF8 = true;
 102         }
 103     }
 104     if (!useUTF8) {
 105         // Found "encoding="; see if we've got an honest-to-goodness encoding name
 106         quote = *idx;
 107         if (quote != '\'' && quote != '\"') {
 108             useUTF8 = true;
 109         }
 110     }
 111     if (!useUTF8) {
 112         base = idx + 1; // Move past the quote character
 113         idx ++;
 114         while (idx < end && *idx != quote) idx ++;
 115         if (idx >= end) {
 116             useUTF8 = true;
 117         }
 118     }
 119     if (!useUTF8) {
 120         UInt32 len = idx - base;
 121         if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
 122             useUTF8 = true;
 123         } else {
 124             CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
 125             stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
 126             CFRelease(encodingName);
 127         }
 128     }
 129     if (useUTF8) {
 130         stream->encoding = kCFStringEncodingUTF8;
 131         stream->flags |= ENCODING_MATCHES_ASCII;
 132         return true;
 133     } else if (stream->encoding == kCFStringEncodingInvalidId) {
 134         return false;
 135     } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
 136         stream->flags |= ENCODING_MATCHES_ASCII;
 137     }
 138     return true;
 139 }
 140
 141 CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
 142     CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
 143     if (numChars) {
 144         CFStringAppendCharacters(string, characters, numChars);
 145     }
 146 }
 147
 148 __private_extern__ Boolean _openInputStream(_CFXMLInputStream *stream) {
 149     if (NULL == stream->data) {
 150         return false;
 151     } else {
 152         stream->currentByte = CFDataGetBytePtr(stream->data);
 153         if (determineEncoding(stream)) {
 154             stream->flags |= STREAM_OPEN;
 155             return true;
 156         } else {
 157             return false;
 158         }
 159     }
 160 }
 161
 162 __private_extern__ void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
 163     stream->data = xmlData ? (CFDataRef)CFRetain(xmlData) : NULL;
 164     stream->url = dataSource ? (CFURLRef)CFRetain(dataSource) : NULL;
 165     stream->encoding = kCFStringEncodingInvalidId;
 166     stream->currentByte = NULL;
 167
 168     stream->allocator = (CFAllocatorRef)CFRetain(alloc);
 169     stream->charBuffer = NULL;
 170     stream->currentChar = NULL;
 171     stream->mark = NULL;
 172     stream->parserMark = NULL;
 173     stream->bufferLength = 0;
 174     stream->bufferCapacity = 0;
 175
 176     stream->charIndex = 1;
 177     stream->lineNum = 1;
 178
 179     stream->flags = 0;
 180     stream->nameSet = NULL;
 181     stream->tempString = NULL;
 182 }
 183
 184
 185 __private_extern__ void _freeInputStream(_CFXMLInputStream *stream) {
 186     if (stream->data) CFRelease(stream->data);
 187     if (stream->url) CFRelease(stream->url);
 188     if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
 189     if (stream->nameSet) CFRelease(stream->nameSet);
 190     if (stream->tempString) CFRelease(stream->tempString);
 191     CFRelease(stream->allocator);
 192 }
 193
 194 __private_extern__ CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
 195     return stream->encoding;
 196 }
 197
 198 __private_extern__ CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
 199     return stream->charIndex;
 200 }
 201
 202 __private_extern__ CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
 203     return stream->lineNum;
 204 }
 205
 206 __private_extern__ Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
 207     if (!(stream->flags & STREAM_OPEN)) return false;
 208     if (stream->currentChar) return false;
 209     if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
 210     return true;
 211 }
 212
 213 __private_extern__ Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
 214     return stream->flags & ENCODING_COMPOSITION_ERROR;
 215 }
 216
 217 #define INITIAL_BUFFER_SIZE 64
 218 static void growCharacterBuffer(_CFXMLInputStream *stream) {
 219     if (!stream->charBuffer) {
 220         stream->charBuffer = (UniChar *)CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
 221         stream->bufferCapacity = INITIAL_BUFFER_SIZE;
 222     } else {
 223         CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
 224         CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
 225         CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
 226         UniChar *newBuffer = (UniChar *)CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
 227         stream->bufferCapacity *= 2;
 228         if (newBuffer != stream->charBuffer) {
 229             stream->charBuffer = newBuffer;
 230             if (currCharDelta != -1) {
 231                 stream->currentChar = newBuffer + currCharDelta;
 232             }
 233             if (markDelta != -1) {
 234                 stream->mark = newBuffer + markDelta;
 235             }
 236             if (parserMarkDelta != -1) {
 237                 stream->parserMark = newBuffer + parserMarkDelta;
 238             }
 239         }
 240     }
 241 }
 242
 243 static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
 244     const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
 245     if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
 246         CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
 247         if (charsToTranslate > maxLength) {
 248             charsToTranslate = maxLength;
 249         }
 250         if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
 251             memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
 252             stream->currentByte += (charsToTranslate * sizeof(UniChar));
 253         } else {
 254             CFIndex i;
 255             uint8_t *baseBytePtr = (uint8_t *)base;
 256             for (i = 0; i < charsToTranslate; i ++) {
 257                 *(baseBytePtr + 1) = *stream->currentByte;
 258                 *baseBytePtr = *(stream->currentByte + 1);
 259                 baseBytePtr += 2;
 260                 stream->currentByte += 2;
 261             }
 262         }
 263         return charsToTranslate;
 264     } else {
 265         CFIndex lengthConsumed = 0;
 266         CFIndex usedByteLength, usedCharLength;
 267         UInt32 conversionResult;
 268         if (stream->flags & ENCODING_MATCHES_ASCII) {
 269             while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
 270                 if (*stream->currentByte > 0x7f) break;
 271                 *base = *stream->currentByte;
 272                 base ++;
 273                 stream->currentByte ++;
 274                 lengthConsumed ++;
 275             }
 276             if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
 277                 return lengthConsumed;
 278             }
 279         }
 280         conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
 281         if(kCFStringEncodingConversionSuccess != conversionResult) {
 282             switch(conversionResult) {
 283                 case kCFStringEncodingConverterUnavailable:
 284                 case kCFStringEncodingInvalidInputStream:
 285                     stream->flags |= ENCODING_COMPOSITION_ERROR;
 286                     break;
 287                 case kCFStringEncodingInsufficientOutputBufferLength:
 288                 default:
 289                     break;
 290             }
 291         }
 292         if (usedByteLength > 0) {
 293             stream->currentByte += usedByteLength;
 294             lengthConsumed += usedCharLength;
 295         }
 296         return lengthConsumed;
 297     }
 298 }
 299
 300 // returns number of characters filled
 301 CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
 302     CFIndex numFilled;
 303     if (stream->bufferLength >= stream->bufferCapacity) return 0;
 304     // Try and fill in the remaining characters
 305     numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
 306     if (numFilled != 0) {
 307         stream->currentChar = stream->charBuffer + stream->bufferLength;
 308         stream->bufferLength += numFilled;
 309     }
 310     return numFilled;
 311 }
 312
 313 // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate.  Does not check for EOF; it is the caller's responsibility to verify this.
 314 static void fillCharacterBuffer(_CFXMLInputStream *stream) {
 315     if (!stream->charBuffer) {
 316         growCharacterBuffer(stream);
 317     }
 318     if (!stream->mark && !stream->parserMark) {
 319         // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
 320         CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
 321         stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
 322         CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
 323         stream->currentChar = stream->charBuffer;
 324     } else {
 325         // We do everything we can not to allocate; first we fill any remaining characters.  If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
 326         Boolean done;
 327
 328         // First try just filling the remaining capacity
 329         done = (fillToCapacity(stream) != 0);
 330         if (!done) {
 331             const UniChar *leftMostMark;
 332             if (stream->mark && !stream->parserMark) {
 333                 leftMostMark = stream->mark;
 334             } else if (stream->parserMark && !stream->mark) {
 335                 leftMostMark = stream->parserMark;
 336             } else if (stream->parserMark < stream->mark) {
 337                 leftMostMark = stream->parserMark;
 338             } else {
 339                 leftMostMark = stream->mark;
 340             }
 341             if (leftMostMark > stream->charBuffer) {
 342                 CFIndex delta = leftMostMark - stream->charBuffer;
 343                 memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
 344                 stream->bufferLength -= delta;
 345                 if (stream->mark) {
 346                     stream->mark -= delta;
 347                 }
 348                 if (stream->parserMark) {
 349                     stream->parserMark -= delta;
 350                 }
 351                 // Now try to fill the newly-opened space
 352                 done = (fillToCapacity(stream) != 0);
 353                 delta = loadCharacters(stream->charBuffer + stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
 354             }
 355         }
 356         if (!done) {
 357             // No help for it; now we must allocate
 358             growCharacterBuffer(stream);
 359             fillToCapacity(stream); // If this doesn't work, we give up.
 360         }
 361     }
 362 }
 363
 364 /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time.  (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version.  Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
 365 */
 366 static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
 367     if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
 368         return false; // EOF
 369     } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
 370                (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
 371                 (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
 372         // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
 373         if (stream->flags & ENCODING_MATCHES_ASCII) {
 374             *ch = (UniChar)*(stream->currentByte);
 375             if (advanceStream) {
 376                 stream->currentByte ++;
 377             }
 378         } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
 379             *ch = *(UniChar *)(stream->currentByte);
 380             if (advanceStream) {
 381                 stream->currentByte += 2;
 382             }
 383         } else {
 384             // Unicode with swapped bytes
 385             *ch = CFSwapInt16(*(UniChar *)(stream->currentByte));
 386             if (advanceStream) {
 387                 stream->currentByte += 2;
 388             }
 389         }
 390     } else {
 391         fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
 392         if (!stream->charBuffer || !stream->currentChar) {
 393             return false;
 394         } else {
 395             *ch = *(stream->currentChar);
 396             if (advanceStream) {
 397                 stream->currentChar ++;
 398                 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
 399                     stream->currentChar = NULL;
 400                 }
 401             }
 402         }
 403     }
 404     return true;
 405 }
 406
 407 /* See comments above getCharacterGuts()
 408 */
 409 CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
 410     if (!(stream->flags & STREAM_OPEN)) {
 411         return false;
 412     } else if (stream->currentChar) {
 413         *ch = *stream->currentChar;
 414         if (advanceStream) {
 415             stream->currentChar ++;
 416             if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
 417                 stream->currentChar = NULL;
 418             }
 419         }
 420     } else {
 421         if (!getCharacterGuts(stream, ch, advanceStream)) return false;
 422     }
 423     if (advanceStream) {
 424         UniChar nextChar;
 425         stream->charIndex ++;
 426         if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
 427     }
 428     return true;
 429 }
 430
 431 __private_extern__ Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
 432     return getCharacter(stream, ch, false);
 433 }
 434
 435 __private_extern__ Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
 436     return getCharacter(stream, ch, true);
 437 }
 438
 439 __private_extern__ Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
 440     Boolean decrementLineNum = false;
 441     if (ch == '\n') {
 442         decrementLineNum = true;
 443     } else if (ch == '\r') {
 444         UniChar nextChar;
 445         if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
 446             decrementLineNum = true;
 447         }
 448     }
 449
 450     if (!(stream->flags & STREAM_OPEN)) {
 451         return false;
 452     } else if (stream->currentChar) {
 453         if (stream->currentChar != stream->charBuffer) {
 454             stream->currentChar --;
 455         } else {
 456             // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
 457             if (stream->bufferLength >= stream->bufferCapacity) {
 458                 growCharacterBuffer(stream);
 459             }
 460             memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
 461             *stream->charBuffer = ch;
 462             stream->bufferLength ++;
 463             if (stream->mark) {
 464                 stream->mark ++;
 465             }
 466             if (stream->parserMark) {
 467                 stream->parserMark ++;
 468             }
 469         }
 470     } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
 471         // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data.  That last character is the one being returned.
 472         stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
 473     } else if (stream->charBuffer) {
 474         // We have processed all the meaningful characters from charBuffer and have no reason to preserve them.  We use charBuffer to hold this one character that has been returned to us.
 475         *stream->charBuffer = ch;
 476         stream->currentChar = stream->charBuffer;
 477         stream->bufferLength = 1;
 478         if (stream->mark) {
 479             stream->mark ++;
 480         }
 481         if (stream->parserMark) {
 482             stream->parserMark ++;
 483         }
 484     } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
 485         // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character.  The former means we can just back up the byte pointer; the latter means Bad Things have happened.
 486         if (stream->flags & ENCODING_MATCHES_ASCII) {
 487             stream->currentByte --;
 488         } else {  // Must be Unicode
 489             stream->currentByte -= 2;
 490         }
 491     } else {
 492         return false;
 493     }
 494     stream->charIndex --;
 495     if (decrementLineNum) {
 496         stream->lineNum --;
 497     }
 498     return true;
 499 }
 500
 501 // Returns the pointer to hold as the mark
 502 static UniChar *dropMark(_CFXMLInputStream *stream) {
 503     if (stream->currentChar) {
 504         return stream->currentChar;
 505     } else if (stream->mark || stream->parserMark) {
 506         return stream->charBuffer + stream->bufferLength;
 507     } else {
 508         if (!stream->charBuffer) {
 509             growCharacterBuffer(stream);
 510         }
 511         stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
 512         return stream->charBuffer;
 513     }
 514
 515 }
 516
 517 __private_extern__ void _inputStreamSetMark(_CFXMLInputStream *stream) {
 518     CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
 519     stream->mark = dropMark(stream);
 520 }
 521
 522 __private_extern__ void _inputStreamClearMark(_CFXMLInputStream *stream) {
 523     CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
 524     stream->mark = NULL;
 525 }
 526
 527 __private_extern__ void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
 528     UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
 529     CFIndex numChars = end - stream->mark;
 530     CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
 531     _fillStringWithCharacters(string, stream->mark, numChars);
 532 }
 533
 534 static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
 535     UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
 536     if (end > mark) {
 537         CFIndex numChars = end - mark;
 538         stream->charIndex -= numChars;
 539         stream->currentChar = mark;
 540
 541         // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
 542         if (*(end - 1) == '\r') {
 543             UniChar nextChar;
 544             if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
 545                 end --;
 546             }
 547         }
 548         while (end != mark) {
 549             end --;
 550             if (*end == '\r') {
 551                 stream->lineNum --;
 552             } else if (*end == '\n') {
 553                 stream->lineNum --;
 554                 if (end != mark && *(end - 1) == '\r') {
 555                     end --;
 556                 }
 557             }
 558         }
 559     }
 560 }
 561
 562 __private_extern__ void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
 563     CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
 564     restoreToMark(stream, stream->mark);
 565 }
 566
 567 CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
 568     return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
 569 }
 570
 571 __private_extern__ CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
 572     UniChar ch;
 573     CFIndex len = 0;
 574     if (str) {
 575         stream->parserMark = dropMark(stream);
 576     }
 577     while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
 578         len ++;
 579     }
 580     if (!isWhitespaceChar(ch)) {
 581         _inputStreamReturnCharacter(stream, ch);
 582     }
 583     if (str) {
 584         _fillStringWithCharacters(str, stream->parserMark, len);
 585         stream->parserMark = NULL;
 586     }
 587     return len;
 588 }
 589
 590 // false return means EOF was encountered without finding scanChars
 591 __private_extern__ Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
 592     Boolean done = false;
 593     CFIndex firstRepeatIndex = -1;
 594     CFIndex len = 0;
 595     stream->parserMark = dropMark(stream);
 596     do {
 597         UniChar ch;
 598         while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
 599             len ++;
 600         }
 601         if (ch != scanChars[0]) {
 602             restoreToMark(stream, stream->parserMark);
 603             stream->parserMark = NULL;
 604             return false;
 605         } else {
 606             CFIndex i;
 607             for (i = 1; i < numChars; i ++) {
 608                 if (!_inputStreamGetCharacter(stream, &ch)) break;
 609                 if (ch != scanChars[i]) break;
 610             }
 611             if (i == numChars) {
 612                 done = true;
 613             } else {
 614                 if (firstRepeatIndex == -1) {
 615                     CFIndex j;
 616                     for (j = 1; j < numChars; j ++) {
 617                         if (scanChars[0] == scanChars[j]) {
 618                             break;
 619                         }
 620                     }
 621                     firstRepeatIndex = j;
 622                 }
 623                 _inputStreamReturnCharacter(stream, ch);
 624                 while (i > firstRepeatIndex) {
 625                     i --;
 626                     _inputStreamReturnCharacter(stream, scanChars[i]);
 627                 }
 628                 len += i;
 629             }
 630         }
 631     } while (!done);
 632     if (str) {
 633         _fillStringWithCharacters(str, stream->parserMark, len);
 634     }
 635     stream->parserMark = NULL;
 636     return true;
 637 }
 638
 639 __private_extern__ Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
 640     const UniChar *end = stringToMatch+length;
 641     const UniChar *sPtr=stringToMatch;
 642     stream->parserMark = dropMark(stream);
 643     while (sPtr < end) {
 644         UniChar ch;
 645         if (!_inputStreamGetCharacter(stream, &ch)) break;
 646         if (ch != *sPtr) break;
 647         sPtr ++;
 648     }
 649     if (sPtr != end) {
 650         restoreToMark(stream, stream->parserMark);
 651         stream->parserMark = NULL;
 652         return false;
 653     } else {
 654         stream->parserMark = NULL;
 655         return true;
 656     }
 657 }
 658
 659 __private_extern__ Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
 660     UniChar ch;
 661     if (!_inputStreamPeekCharacter(stream, &ch)) return false;
 662     if (ch != '\'' && ch != '\"')  return false;
 663
 664     _inputStreamGetCharacter(stream, &ch);
 665     if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
 666         return false;
 667     }
 668     return true;
 669 }
 670
 671 /*
 672  [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
 673  [5]  Name ::= (Letter | '_' | ':') (NameChar)*
 674  [7]  Nmtoken ::= (NameChar)+
 675  [84] Letter ::= BaseChar | Ideographic
 676
 677  We don't do this quite right; we rely on the Unicode charsets to do this analysis.  While
 678  the productions in the XML spec are based on the Unicode character sets, the definitions
 679  differ slightly to avoid those areas where the Unicode standard is still being resolved.
 680  At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
 681  the vast majority of parsers out there.
 682
 683  Letter == kCFUniCharLetterCharacterSet
 684  Digit == kCFUniCharDecimalDigitCharacterSet
 685  CombiningChar == kCFUniCharNonBaseCharacterSet
 686  Extender - complex, and not represented by a uniform character set.
 687  */
 688 __private_extern__ Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
 689     UniChar ch;
 690     Boolean success = true;
 691     stream->parserMark = dropMark(stream);
 692     if (!isNMToken) {
 693         // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
 694         if (!getCharacter(stream, &ch, false)) {
 695             success = false;
 696         } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
 697             success = false;
 698         } else {
 699             getCharacter(stream, &ch, true);
 700         }
 701     }
 702     if (success) {
 703         while (getCharacter(stream, &ch, true)) {
 704             if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet)  && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
 705                 _inputStreamReturnCharacter(stream, ch);
 706                 break;
 707             }
 708         }
 709         if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) {
 710             success = false; // Must have processed at least one character
 711         }
 712     }
 713     if (success) {
 714         if (str) {
 715             if (!stream->nameSet) {
 716                 stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
 717                 stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
 718             }
 719             CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
 720             if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
 721                 *str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString);
 722                 CFSetAddValue(stream->nameSet, *str);
 723                 CFRelease(*str);
 724             }
 725         }
 726     } else {
 727         restoreToMark(stream, stream->parserMark);
 728     }
 729     stream->parserMark = NULL;
 730     return success;
 731 }
 732
 733