CFXMLInputStream.c

   1 /*
   2  * Copyright (c) 2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*      CFXMLInputStream.c
  24         Copyright 1999-2002, Apple, Inc. All rights reserved.
  25         Responsibility: Chris Parker
  26 */
  27
  28 #include "CFXMLInputStream.h"
  29 #include <CoreFoundation/CFCharacterSet.h>
  30 #include <string.h>
  31 #include "CFStringEncodingConverter.h"
  32 #include "CFUniChar.h"
  33
  34 /* Utility functions used in parsing */
  35 static Boolean determineEncoding(_CFXMLInputStream *stream) {
  36     const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
  37     UInt32 length = CFDataGetLength(stream->data);
  38     const uint8_t *idx = 0L, *end = 0L;
  39     const uint8_t *base = 0L;
  40     char quote = ' ';
  41     Boolean useUTF8 = false;
  42
  43     // Check for the byte order mark first
  44     if (length > 2) {
  45         // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
  46         if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
  47 #if __BIG_ENDIAN__
  48             stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
  49 #else
  50             stream->flags |= ENCODING_IS_UNICODE_NATURAL;
  51 #endif
  52             if (*bytes == 0xFF) {
  53                 stream->currentByte = bytes + 2;
  54             }
  55             stream->encoding = kCFStringEncodingUnicode;
  56             return true;
  57         } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
  58 #if __BIG_ENDIAN__
  59             stream->flags |= ENCODING_IS_UNICODE_NATURAL;
  60 #else
  61             stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
  62 #endif
  63             if (*bytes == 0xFE) {
  64                 stream->currentByte = bytes + 2;
  65             }
  66             stream->encoding = kCFStringEncodingUnicode;
  67             return true;
  68         } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
  69             if(*bytes == 0xEF) {
  70                 stream->currentByte = bytes + 3;
  71             }
  72             stream->encoding = kCFStringEncodingUTF8;
  73             stream->flags |= ENCODING_MATCHES_ASCII;
  74             return true;
  75         }
  76     }
  77     // Scan for the <?xml.... ?> opening
  78     if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
  79         useUTF8 = true;
  80     }
  81     if (!useUTF8) {
  82         idx = bytes + 5;
  83         end = bytes + length;
  84         // Found "<?xml"; now we scan for "encoding"
  85         while (idx < end) {
  86             uint8_t ch = *idx;
  87             const uint8_t *scan;
  88             if ( ch == '?' || ch == '>') {
  89                 useUTF8 = true;
  90                 break;
  91             }
  92             idx ++;
  93             scan = idx;
  94             if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
  95                 idx = scan;
  96                 break;
  97             }
  98         }
  99         if (!useUTF8 && idx >= end) {
 100             useUTF8 = true;
 101         }
 102     }
 103     if (!useUTF8) {
 104         // Found "encoding="; see if we've got an honest-to-goodness encoding name
 105         quote = *idx;
 106         if (quote != '\'' && quote != '\"') {
 107             useUTF8 = true;
 108         }
 109     }
 110     if (!useUTF8) {
 111         base = idx + 1; // Move past the quote character
 112         idx ++;
 113         while (idx < end && *idx != quote) idx ++;
 114         if (idx >= end) {
 115             useUTF8 = true;
 116         }
 117     }
 118     if (!useUTF8) {
 119         UInt32 len = idx - base;
 120         if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
 121             useUTF8 = true;
 122         } else {
 123             CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
 124             stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
 125             CFRelease(encodingName);
 126         }
 127     }
 128     if (useUTF8) {
 129         stream->encoding = kCFStringEncodingUTF8;
 130         stream->flags |= ENCODING_MATCHES_ASCII;
 131         return true;
 132     } else if (stream->encoding == kCFStringEncodingInvalidId) {
 133         return false;
 134     } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
 135         stream->flags |= ENCODING_MATCHES_ASCII;
 136     }
 137     return true;
 138 }
 139
 140 CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
 141     CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
 142     if (numChars) {
 143         CFStringAppendCharacters(string, characters, numChars);
 144     }
 145 }
 146
 147 __private_extern__ Boolean _openInputStream(_CFXMLInputStream *stream) {
 148     if (NULL == stream->data) {
 149         return false;
 150     } else {
 151         stream->currentByte = CFDataGetBytePtr(stream->data);
 152         if (determineEncoding(stream)) {
 153             stream->flags |= STREAM_OPEN;
 154             return true;
 155         } else {
 156             return false;
 157         }
 158     }
 159 }
 160
 161 __private_extern__ void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
 162     stream->data = xmlData ? (CFDataRef)CFRetain(xmlData) : NULL;
 163     stream->url = dataSource ? (CFURLRef)CFRetain(dataSource) : NULL;
 164     stream->encoding = kCFStringEncodingInvalidId;
 165     stream->currentByte = NULL;
 166
 167     stream->allocator = (CFAllocatorRef)CFRetain(alloc);
 168     stream->charBuffer = NULL;
 169     stream->currentChar = NULL;
 170     stream->mark = NULL;
 171     stream->parserMark = NULL;
 172     stream->bufferLength = 0;
 173     stream->bufferCapacity = 0;
 174
 175     stream->charIndex = 1;
 176     stream->lineNum = 1;
 177
 178     stream->flags = 0;
 179     stream->nameSet = NULL;
 180     stream->tempString = NULL;
 181 }
 182
 183
 184 __private_extern__ void _freeInputStream(_CFXMLInputStream *stream) {
 185     if (stream->data) CFRelease(stream->data);
 186     if (stream->url) CFRelease(stream->url);
 187     if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
 188     if (stream->nameSet) CFRelease(stream->nameSet);
 189     if (stream->tempString) CFRelease(stream->tempString);
 190     CFRelease(stream->allocator);
 191 }
 192
 193 __private_extern__ CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
 194     return stream->encoding;
 195 }
 196
 197 __private_extern__ CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
 198     return stream->charIndex;
 199 }
 200
 201 __private_extern__ CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
 202     return stream->lineNum;
 203 }
 204
 205 __private_extern__ Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
 206     if (!(stream->flags & STREAM_OPEN)) return false;
 207     if (stream->currentChar) return false;
 208     if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
 209     return true;
 210 }
 211
 212 __private_extern__ Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
 213     return stream->flags & ENCODING_COMPOSITION_ERROR;
 214 }
 215
 216 #define INITIAL_BUFFER_SIZE 64
 217 static void growCharacterBuffer(_CFXMLInputStream *stream) {
 218     if (!stream->charBuffer) {
 219         stream->charBuffer = (UniChar *)CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
 220         stream->bufferCapacity = INITIAL_BUFFER_SIZE;
 221     } else {
 222         CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
 223         CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
 224         CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
 225         UniChar *newBuffer = (UniChar *)CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
 226         stream->bufferCapacity *= 2;
 227         if (newBuffer != stream->charBuffer) {
 228             stream->charBuffer = newBuffer;
 229             if (currCharDelta != -1) {
 230                 stream->currentChar = newBuffer + currCharDelta;
 231             }
 232             if (markDelta != -1) {
 233                 stream->mark = newBuffer + markDelta;
 234             }
 235             if (parserMarkDelta != -1) {
 236                 stream->parserMark = newBuffer + parserMarkDelta;
 237             }
 238         }
 239     }
 240 }
 241
 242 static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
 243     const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
 244     if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
 245         CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
 246         if (charsToTranslate > maxLength) {
 247             charsToTranslate = maxLength;
 248         }
 249         if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
 250             memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
 251             stream->currentByte += (charsToTranslate * sizeof(UniChar));
 252         } else {
 253             CFIndex i;
 254             uint8_t *baseBytePtr = (uint8_t *)base;
 255             for (i = 0; i < charsToTranslate; i ++) {
 256                 *(baseBytePtr + 1) = *stream->currentByte;
 257                 *baseBytePtr = *(stream->currentByte + 1);
 258                 baseBytePtr += 2;
 259                 stream->currentByte += 2;
 260             }
 261         }
 262         return charsToTranslate;
 263     } else {
 264         CFIndex lengthConsumed = 0;
 265         CFIndex usedByteLength, usedCharLength;
 266         UInt32 conversionResult;
 267         if (stream->flags & ENCODING_MATCHES_ASCII) {
 268             while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
 269                 if (*stream->currentByte > 0x7f) break;
 270                 *base = *stream->currentByte;
 271                 base ++;
 272                 stream->currentByte ++;
 273                 lengthConsumed ++;
 274             }
 275             if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
 276                 return lengthConsumed;
 277             }
 278         }
 279         conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
 280         if(kCFStringEncodingConversionSuccess != conversionResult) {
 281             switch(conversionResult) {
 282                 case kCFStringEncodingConverterUnavailable:
 283                 case kCFStringEncodingInvalidInputStream:
 284                     stream->flags |= ENCODING_COMPOSITION_ERROR;
 285                     break;
 286                 case kCFStringEncodingInsufficientOutputBufferLength:
 287                 default:
 288                     break;
 289             }
 290         }
 291         if (usedByteLength > 0) {
 292             stream->currentByte += usedByteLength;
 293             lengthConsumed += usedCharLength;
 294         }
 295         return lengthConsumed;
 296     }
 297 }
 298
 299 // returns number of characters filled
 300 CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
 301     CFIndex numFilled;
 302     if (stream->bufferLength >= stream->bufferCapacity) return 0;
 303     // Try and fill in the remaining characters
 304     numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
 305     if (numFilled != 0) {
 306         stream->currentChar = stream->charBuffer + stream->bufferLength;
 307         stream->bufferLength += numFilled;
 308     }
 309     return numFilled;
 310 }
 311
 312 // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate.  Does not check for EOF; it is the caller's responsibility to verify this.
 313 static void fillCharacterBuffer(_CFXMLInputStream *stream) {
 314     if (!stream->charBuffer) {
 315         growCharacterBuffer(stream);
 316     }
 317     if (!stream->mark && !stream->parserMark) {
 318         // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
 319         CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
 320         stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
 321         CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
 322         stream->currentChar = stream->charBuffer;
 323     } else {
 324         // We do everything we can not to allocate; first we fill any remaining characters.  If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
 325         Boolean done;
 326
 327         // First try just filling the remaining capacity
 328         done = (fillToCapacity(stream) != 0);
 329         if (!done) {
 330             const UniChar *leftMostMark;
 331             if (stream->mark && !stream->parserMark) {
 332                 leftMostMark = stream->mark;
 333             } else if (stream->parserMark && !stream->mark) {
 334                 leftMostMark = stream->parserMark;
 335             } else if (stream->parserMark < stream->mark) {
 336                 leftMostMark = stream->parserMark;
 337             } else {
 338                 leftMostMark = stream->mark;
 339             }
 340             if (leftMostMark > stream->charBuffer) {
 341                 CFIndex delta = leftMostMark - stream->charBuffer;
 342                 memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
 343                 stream->bufferLength -= delta;
 344                 if (stream->mark) {
 345                     stream->mark -= delta;
 346                 }
 347                 if (stream->parserMark) {
 348                     stream->parserMark -= delta;
 349                 }
 350                 // Now try to fill the newly-opened space
 351                 done = (fillToCapacity(stream) != 0);
 352                 delta = loadCharacters(stream->charBuffer + stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
 353             }
 354         }
 355         if (!done) {
 356             // No help for it; now we must allocate
 357             growCharacterBuffer(stream);
 358             fillToCapacity(stream); // If this doesn't work, we give up.
 359         }
 360     }
 361 }
 362
 363 /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time.  (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version.  Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
 364 */
 365 static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
 366     if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
 367         return false; // EOF
 368     } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
 369                (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
 370                 (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
 371         // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
 372         if (stream->flags & ENCODING_MATCHES_ASCII) {
 373             *ch = (UniChar)*(stream->currentByte);
 374             if (advanceStream) {
 375                 stream->currentByte ++;
 376             }
 377         } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
 378             *ch = *(UniChar *)(stream->currentByte);
 379             if (advanceStream) {
 380                 stream->currentByte += 2;
 381             }
 382         } else {
 383             // Unicode with swapped bytes
 384             *ch = CFSwapInt16(*(UniChar *)(stream->currentByte));
 385             if (advanceStream) {
 386                 stream->currentByte += 2;
 387             }
 388         }
 389     } else {
 390         fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
 391         if (!stream->charBuffer || !stream->currentChar) {
 392             return false;
 393         } else {
 394             *ch = *(stream->currentChar);
 395             if (advanceStream) {
 396                 stream->currentChar ++;
 397                 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
 398                     stream->currentChar = NULL;
 399                 }
 400             }
 401         }
 402     }
 403     return true;
 404 }
 405
 406 /* See comments above getCharacterGuts()
 407 */
 408 CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
 409     if (!(stream->flags & STREAM_OPEN)) {
 410         return false;
 411     } else if (stream->currentChar) {
 412         *ch = *stream->currentChar;
 413         if (advanceStream) {
 414             stream->currentChar ++;
 415             if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
 416                 stream->currentChar = NULL;
 417             }
 418         }
 419     } else {
 420         if (!getCharacterGuts(stream, ch, advanceStream)) return false;
 421     }
 422     if (advanceStream) {
 423         UniChar nextChar;
 424         stream->charIndex ++;
 425         if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
 426     }
 427     return true;
 428 }
 429
 430 __private_extern__ Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
 431     return getCharacter(stream, ch, false);
 432 }
 433
 434 __private_extern__ Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
 435     return getCharacter(stream, ch, true);
 436 }
 437
 438 __private_extern__ Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
 439     Boolean decrementLineNum = false;
 440     if (ch == '\n') {
 441         decrementLineNum = true;
 442     } else if (ch == '\r') {
 443         UniChar nextChar;
 444         if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
 445             decrementLineNum = true;
 446         }
 447     }
 448
 449     if (!(stream->flags & STREAM_OPEN)) {
 450         return false;
 451     } else if (stream->currentChar) {
 452         if (stream->currentChar != stream->charBuffer) {
 453             stream->currentChar --;
 454         } else {
 455             // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
 456             if (stream->bufferLength >= stream->bufferCapacity) {
 457                 growCharacterBuffer(stream);
 458             }
 459             memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
 460             *stream->charBuffer = ch;
 461             stream->bufferLength ++;
 462             if (stream->mark) {
 463                 stream->mark ++;
 464             }
 465             if (stream->parserMark) {
 466                 stream->parserMark ++;
 467             }
 468         }
 469     } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
 470         // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data.  That last character is the one being returned.
 471         stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
 472     } else if (stream->charBuffer) {
 473         // We have processed all the meaningful characters from charBuffer and have no reason to preserve them.  We use charBuffer to hold this one character that has been returned to us.
 474         *stream->charBuffer = ch;
 475         stream->currentChar = stream->charBuffer;
 476         stream->bufferLength = 1;
 477         if (stream->mark) {
 478             stream->mark ++;
 479         }
 480         if (stream->parserMark) {
 481             stream->parserMark ++;
 482         }
 483     } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
 484         // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character.  The former means we can just back up the byte pointer; the latter means Bad Things have happened.
 485         if (stream->flags & ENCODING_MATCHES_ASCII) {
 486             stream->currentByte --;
 487         } else {  // Must be Unicode
 488             stream->currentByte -= 2;
 489         }
 490     } else {
 491         return false;
 492     }
 493     stream->charIndex --;
 494     if (decrementLineNum) {
 495         stream->lineNum --;
 496     }
 497     return true;
 498 }
 499
 500 // Returns the pointer to hold as the mark
 501 static UniChar *dropMark(_CFXMLInputStream *stream) {
 502     if (stream->currentChar) {
 503         return stream->currentChar;
 504     } else if (stream->mark || stream->parserMark) {
 505         return stream->charBuffer + stream->bufferLength;
 506     } else {
 507         if (!stream->charBuffer) {
 508             growCharacterBuffer(stream);
 509         }
 510         stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
 511         return stream->charBuffer;
 512     }
 513
 514 }
 515
 516 __private_extern__ void _inputStreamSetMark(_CFXMLInputStream *stream) {
 517     CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
 518     stream->mark = dropMark(stream);
 519 }
 520
 521 __private_extern__ void _inputStreamClearMark(_CFXMLInputStream *stream) {
 522     CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
 523     stream->mark = NULL;
 524 }
 525
 526 __private_extern__ void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
 527     UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
 528     CFIndex numChars = end - stream->mark;
 529     CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
 530     _fillStringWithCharacters(string, stream->mark, numChars);
 531 }
 532
 533 static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
 534     UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
 535     if (end > mark) {
 536         CFIndex numChars = end - mark;
 537         stream->charIndex -= numChars;
 538         stream->currentChar = mark;
 539
 540         // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
 541         if (*(end - 1) == '\r') {
 542             UniChar nextChar;
 543             if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
 544                 end --;
 545             }
 546         }
 547         while (end != mark) {
 548             end --;
 549             if (*end == '\r') {
 550                 stream->lineNum --;
 551             } else if (*end == '\n') {
 552                 stream->lineNum --;
 553                 if (end != mark && *(end - 1) == '\r') {
 554                     end --;
 555                 }
 556             }
 557         }
 558     }
 559 }
 560
 561 __private_extern__ void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
 562     CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
 563     restoreToMark(stream, stream->mark);
 564 }
 565
 566 CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
 567     return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
 568 }
 569
 570 __private_extern__ CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
 571     UniChar ch;
 572     CFIndex len = 0;
 573     if (str) {
 574         stream->parserMark = dropMark(stream);
 575     }
 576     while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
 577         len ++;
 578     }
 579     if (!isWhitespaceChar(ch)) {
 580         _inputStreamReturnCharacter(stream, ch);
 581     }
 582     if (str) {
 583         _fillStringWithCharacters(str, stream->parserMark, len);
 584         stream->parserMark = NULL;
 585     }
 586     return len;
 587 }
 588
 589 // false return means EOF was encountered without finding scanChars
 590 __private_extern__ Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
 591     Boolean done = false;
 592     CFIndex firstRepeatIndex = -1;
 593     CFIndex len = 0;
 594     stream->parserMark = dropMark(stream);
 595     do {
 596         UniChar ch;
 597         while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
 598             len ++;
 599         }
 600         if (ch != scanChars[0]) {
 601             restoreToMark(stream, stream->parserMark);
 602             stream->parserMark = NULL;
 603             return false;
 604         } else {
 605             CFIndex i;
 606             for (i = 1; i < numChars; i ++) {
 607                 if (!_inputStreamGetCharacter(stream, &ch)) break;
 608                 if (ch != scanChars[i]) break;
 609             }
 610             if (i == numChars) {
 611                 done = true;
 612             } else {
 613                 if (firstRepeatIndex == -1) {
 614                     CFIndex j;
 615                     for (j = 1; j < numChars; j ++) {
 616                         if (scanChars[0] == scanChars[j]) {
 617                             break;
 618                         }
 619                     }
 620                     firstRepeatIndex = j;
 621                 }
 622                 _inputStreamReturnCharacter(stream, ch);
 623                 while (i > firstRepeatIndex) {
 624                     i --;
 625                     _inputStreamReturnCharacter(stream, scanChars[i]);
 626                 }
 627                 len += i;
 628             }
 629         }
 630     } while (!done);
 631     if (str) {
 632         _fillStringWithCharacters(str, stream->parserMark, len);
 633     }
 634     stream->parserMark = NULL;
 635     return true;
 636 }
 637
 638 __private_extern__ Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
 639     const UniChar *end = stringToMatch+length;
 640     const UniChar *sPtr=stringToMatch;
 641     stream->parserMark = dropMark(stream);
 642     while (sPtr < end) {
 643         UniChar ch;
 644         if (!_inputStreamGetCharacter(stream, &ch)) break;
 645         if (ch != *sPtr) break;
 646         sPtr ++;
 647     }
 648     if (sPtr != end) {
 649         restoreToMark(stream, stream->parserMark);
 650         stream->parserMark = NULL;
 651         return false;
 652     } else {
 653         stream->parserMark = NULL;
 654         return true;
 655     }
 656 }
 657
 658 __private_extern__ Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
 659     UniChar ch;
 660     if (!_inputStreamPeekCharacter(stream, &ch)) return false;
 661     if (ch != '\'' && ch != '\"')  return false;
 662
 663     _inputStreamGetCharacter(stream, &ch);
 664     if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
 665         return false;
 666     }
 667     return true;
 668 }
 669
 670 /*
 671  [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
 672  [5]  Name ::= (Letter | '_' | ':') (NameChar)*
 673  [7]  Nmtoken ::= (NameChar)+
 674  [84] Letter ::= BaseChar | Ideographic
 675
 676  We don't do this quite right; we rely on the Unicode charsets to do this analysis.  While
 677  the productions in the XML spec are based on the Unicode character sets, the definitions
 678  differ slightly to avoid those areas where the Unicode standard is still being resolved.
 679  At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
 680  the vast majority of parsers out there.
 681
 682  Letter == kCFUniCharLetterCharacterSet
 683  Digit == kCFUniCharDecimalDigitCharacterSet
 684  CombiningChar == kCFUniCharNonBaseCharacterSet
 685  Extender - complex, and not represented by a uniform character set.
 686  */
 687 __private_extern__ Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
 688     UniChar ch;
 689     Boolean success = true;
 690     stream->parserMark = dropMark(stream);
 691     if (!isNMToken) {
 692         // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
 693         if (!getCharacter(stream, &ch, false)) {
 694             success = false;
 695         } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
 696             success = false;
 697         } else {
 698             getCharacter(stream, &ch, true);
 699         }
 700     }
 701     if (success) {
 702         while (getCharacter(stream, &ch, true)) {
 703             if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet)  && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
 704                 _inputStreamReturnCharacter(stream, ch);
 705                 break;
 706             }
 707         }
 708         if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) {
 709             success = false; // Must have processed at least one character
 710         }
 711     }
 712     if (success) {
 713         if (str) {
 714             if (!stream->nameSet) {
 715                 stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
 716                 stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
 717             }
 718             CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
 719             if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
 720                 *str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString);
 721                 CFSetAddValue(stream->nameSet, *str);
 722                 CFRelease(*str);
 723             }
 724         }
 725     } else {
 726         restoreToMark(stream, stream->parserMark);
 727     }
 728     stream->parserMark = NULL;
 729     return success;
 730 }
 731
 732