2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. 
   4  * @APPLE_LICENSE_HEADER_START@ 
   6  * This file contains Original Code and/or Modifications of Original Code 
   7  * as defined in and that are subject to the Apple Public Source License 
   8  * Version 2.0 (the 'License'). You may not use this file except in 
   9  * compliance with the License. Please obtain a copy of the License at 
  10  * http://www.opensource.apple.com/apsl/ and read it before using this 
  13  * The Original Code and all software distributed under the License are 
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
  18  * Please see the License for the specific language governing rights and 
  19  * limitations under the License. 
  21  * @APPLE_LICENSE_HEADER_END@ 
  24         Copyright 1999-2002, Apple, Inc. All rights reserved. 
  25         Responsibility: Chris Parker 
  28 #include <CoreFoundation/CFCharacterSet.h> 
  29 #include <CoreFoundation/CFURLAccess.h> 
  31 #include "CFStringEncodingConverter.h" 
  32 #include "CFUniChar.h" 
  33 #include "CFXMLInputStream.h" 
  35 /* Utility functions used in parsing */ 
  // Sniffs the text encoding of stream->data and records it in stream->encoding and
  // stream->flags. Visible steps: (1) byte-order-mark style checks on the leading bytes,
  // (2) a scan of the "<?xml ... ?>" declaration for an encoding= name, (3) mapping of that
  // name to a CFStringEncoding and flagging of ASCII-compatible encodings.
  // NOTE(review): the embedded listing numbers skip (e.g. 41, 45, 49, 72-76, 82-86), so
  // several statements of this function, including its return statements, are not visible.
  36 static Boolean 
determineEncoding(_CFXMLInputStream 
*stream
) { 
  37     const uint8_t *bytes 
= (uint8_t *)CFDataGetBytePtr(stream
->data
); 
  38     UInt32 length 
= CFDataGetLength(stream
->data
); 
  39     const uint8_t *idx 
= 0L, *end 
= 0L; 
  40     const uint8_t *base 
= 0L; 
  42     Boolean useUTF8 
= false; 
  44     // Check for the byte order mark first 
  // NOTE(review): the checks below read bytes[0..2]; they are indented one level deeper,
  // so they appear to sit under a length guard on the missing listing line 45 - confirm.
  46         // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec 
  // FF FE at the start, or a zero second byte, is flagged as byte-swapped Unicode.
  47         if ((*bytes 
== 0xFF && *(bytes
+1) == 0xFE) ||*(bytes
+1) == 0x00) { 
  48             stream
->flags 
|= ENCODING_IS_UNICODE_SWAPPED
; 
  // Deeper indent: skipping the two mark bytes appears conditional (listing line 49 missing).
  50                 stream
->currentByte 
= bytes 
+ 2; 
  52             stream
->encoding 
= kCFStringEncodingUnicode
; 
  // FE FF at the start, or a zero first byte, is flagged as natural-order Unicode.
  54         } else if ((*bytes 
== 0xFE && *(bytes
+1) == 0xFF) || *bytes 
== 0x00) { 
  55             stream
->flags 
|= ENCODING_IS_UNICODE_NATURAL
; 
  57                 stream
->currentByte 
= bytes 
+ 2; 
  59             stream
->encoding 
= kCFStringEncodingUnicode
; 
  // EF BB BF is matched as a three-byte mark; the stream then starts after it as UTF-8.
  61         } else if(*bytes 
== 0xEF && *(bytes
+1) == 0xBB && *(bytes
+2) == 0xBF) { 
  63                 stream
->currentByte 
= bytes 
+ 3; 
  65             stream
->encoding 
= kCFStringEncodingUTF8
; 
  66             stream
->flags 
|= ENCODING_MATCHES_ASCII
; 
  70     // Scan for the <?xml.... ?> opening 
  71     if (length 
< 5 || strncmp((char const *) bytes
, "<?xml", 5) != 0) { 
  77         // Found "<?xml"; now we scan for "encoding" 
  81             if ( ch 
== '?' || ch 
== '>') { 
  // Short-circuit && chain: scan only moves past bytes that have matched so far.
  // NOTE(review): no bounds check of scan against the end of the data is visible here - confirm.
  87             if (ch 
== 'e' && *scan
++ == 'n' && *scan
++ == 'c' && *scan
++ == 'o' && *scan
++ == 'd' && *scan
++ == 'i' && *scan
++ == 'n' && *scan
++ == 'g' && *scan
++ == '=') { 
  92         if (!useUTF8 
&& idx 
>= end
) { 
  97         // Found "encoding="; see if we've got an honest-to-goodness encoding name 
  99         if (quote 
!= '\'' && quote 
!= '\"') { 
 104         base 
= idx 
+ 1; // Move past the quote character 
 // Advance idx to the matching closing quote, or to end if none is found.
 106         while (idx 
< end 
&& *idx 
!= quote
) idx 
++; 
 112         UInt32 len 
= idx 
- base
; 
 // Case-insensitive comparison of a 5-byte name against "utf-8".
 113         if (len 
== 5 && (*base 
== 'u' || *base 
== 'U') && (base
[1] == 't' || base
[1] == 'T') && (base
[2] == 'f' || base
[2] == 'F') && (base
[3] == '-') && (base
[4] == '8')) { 
 // Any other name is decoded as ISO Latin-1 and looked up as an IANA charset name;
 // the temporary CFString is released right after the lookup.
 116             CFStringRef encodingName 
= CFStringCreateWithBytes(stream
->allocator
, base
, len
, kCFStringEncodingISOLatin1
, false); 
 117             stream
->encoding 
= CFStringConvertIANACharSetNameToEncoding(encodingName
); 
 118             CFRelease(encodingName
); 
 122         stream
->encoding 
= kCFStringEncodingUTF8
; 
 123         stream
->flags 
|= ENCODING_MATCHES_ASCII
; 
 125     } else if (stream
->encoding 
== kCFStringEncodingInvalidId
) { 
 // Encodings that are supersets of ASCII are flagged ENCODING_MATCHES_ASCII.
 127     } else if (__CFStringEncodingIsSupersetOfASCII(stream
->encoding
)) { 
 128         stream
->flags 
|= ENCODING_MATCHES_ASCII
; 
 // Replaces the contents of `string` with `numChars` UniChars taken from `characters`:
 // the existing contents are deleted first, then the new characters are appended.
 // NOTE(review): listing line 135 is missing and line 136 is indented one level deeper,
 // so the append appears to sit under a guard that is not visible here.
 133 CF_INLINE 
void _fillStringWithCharacters(CFMutableStringRef string
, UniChar 
*characters
, CFIndex numChars
) { 
 134     CFStringDelete(string
, CFRangeMake(0, CFStringGetLength(string
))); 
 136         CFStringAppendCharacters(string
, characters
, numChars
); 
 // Opens the stream for reading. Visible steps: when the stream has a URL but no data,
 // the data is fetched with CFURLCreateDataAndPropertiesFromResource; currentByte is then
 // pointed at the first data byte, and STREAM_OPEN is set if determineEncoding() succeeds.
 // NOTE(review): listing lines 144-146, 148-149 and 153-158 are missing, so what is done
 // with the fetched data, the handling of a NULL stream->data, and all return statements
 // are not visible here.
 140 __private_extern__ Boolean 
_openInputStream(_CFXMLInputStream 
*stream
) { 
 141     if (NULL 
== stream
->data 
&& NULL 
!= stream
->url
) { 
 142         CFDataRef data 
= NULL
; 
 // Only the data is requested; the properties, desired-properties and error-code arguments are NULL.
 143         if (CFURLCreateDataAndPropertiesFromResource(stream
->allocator
, stream
->url
, &data
, NULL
, NULL
, NULL
)) { 
 147     if (NULL 
== stream
->data
) { 
 150         stream
->currentByte 
= CFDataGetBytePtr(stream
->data
); 
 151         if (determineEncoding(stream
)) { 
 152             stream
->flags 
|= STREAM_OPEN
; 
 // Puts a stream structure into its initial state. xmlData and dataSource are retained
 // when non-NULL (NULL is stored otherwise); alloc is retained unconditionally. The
 // encoding starts as kCFStringEncodingInvalidId, the buffer pointers start as NULL, the
 // buffer length and capacity start at 0, and charIndex starts at 1.
 // NOTE(review): listing lines 165, 169, 173 and 175-177 are missing; initialisation of
 // other fields (e.g. mark, lineNum, flags) is not visible here - confirm.
 160 __private_extern__ 
void _initializeInputStream(_CFXMLInputStream 
*stream
, CFAllocatorRef alloc
, CFURLRef dataSource
, CFDataRef xmlData
) { 
 161     stream
->data 
= xmlData 
? CFRetain(xmlData
) : NULL
; 
 162     stream
->url 
= dataSource 
? CFRetain(dataSource
) : NULL
; 
 163     stream
->encoding 
= kCFStringEncodingInvalidId
; 
 164     stream
->currentByte 
= NULL
; 
 // NOTE(review): alloc is retained without a NULL check - presumably callers always pass
 // a concrete allocator; verify against callers.
 166     stream
->allocator 
= CFRetain(alloc
); 
 167     stream
->charBuffer 
= NULL
; 
 168     stream
->currentChar 
= NULL
; 
 170     stream
->parserMark 
= NULL
; 
 171     stream
->bufferLength 
= 0; 
 172     stream
->bufferCapacity 
= 0; 
 174     stream
->charIndex 
= 1; 
 178     stream
->nameSet 
= NULL
; 
 179     stream
->tempString 
= NULL
; 
 183 __private_extern__ 
void _freeInputStream(_CFXMLInputStream 
*stream
) { 
 184     if (stream
->data
) CFRelease(stream
->data
); 
 185     if (stream
->url
) CFRelease(stream
->url
); 
 186     if (stream
->charBuffer
) CFAllocatorDeallocate(stream
->allocator
, stream
->charBuffer
); 
 187     if (stream
->nameSet
) CFRelease(stream
->nameSet
); 
 188     if (stream
->tempString
) CFRelease(stream
->tempString
); 
 189     CFRelease(stream
->allocator
); 
 192 __private_extern__ CFStringEncoding 
_inputStreamGetEncoding(_CFXMLInputStream 
*stream
) { 
 193     return stream
->encoding
; 
 196 __private_extern__ CFIndex 
_inputStreamCurrentLocation(_CFXMLInputStream 
*stream
) { 
 197     return stream
->charIndex
; 
 200 __private_extern__ CFIndex 
_inputStreamCurrentLine(_CFXMLInputStream 
*stream
) { 
 201     return stream
->lineNum
; 
 // Reports whether the stream has been fully consumed. Returns false when the stream is
 // not open, when a buffered character is still pending (currentChar is set), or when
 // currentByte has not yet reached the end of the data.
 // NOTE(review): the final return (listing line 208) is missing; presumably it returns
 // true once none of the three conditions holds - confirm.
 204 __private_extern__ Boolean 
_inputStreamAtEOF(_CFXMLInputStream 
*stream
) { 
 205     if (!(stream
->flags 
& STREAM_OPEN
)) return false; 
 206     if (stream
->currentChar
) return false; 
 // Bytes consumed so far are compared with the total data length.
 207     if (stream
->currentByte 
- CFDataGetBytePtr(stream
->data
) < CFDataGetLength(stream
->data
)) return false; 
 211 __private_extern__ Boolean 
_inputStreamComposingErrorOccurred(_CFXMLInputStream 
*stream
) { 
 212     return stream
->flags 
& ENCODING_COMPOSITION_ERROR
; 
 215 #define INITIAL_BUFFER_SIZE 64 
 216 static void growCharacterBuffer(_CFXMLInputStream 
*stream
) { 
 217     if (!stream
->charBuffer
) { 
 218         stream
->charBuffer 
= CFAllocatorAllocate(stream
->allocator
, INITIAL_BUFFER_SIZE
*sizeof(UniChar
), 0); 
 219         stream
->bufferCapacity 
= INITIAL_BUFFER_SIZE
; 
 221         CFIndex currCharDelta 
= stream
->currentChar 
? stream
->currentChar 
- stream
->charBuffer 
: -1; 
 222         CFIndex markDelta 
= stream
->mark 
? stream
->mark 
- stream
->charBuffer
: -1; 
 223         CFIndex parserMarkDelta 
= stream
->parserMark 
? stream
->parserMark 
- stream
->charBuffer
: -1; 
 224         UniChar 
*newBuffer 
= CFAllocatorReallocate(stream
->allocator
, stream
->charBuffer
, stream
->bufferCapacity 
* 2 * sizeof(UniChar
), 0); 
 225         stream
->bufferCapacity 
*= 2; 
 226         if (newBuffer 
!= stream
->charBuffer
) { 
 227             stream
->charBuffer 
= newBuffer
; 
 228             if (currCharDelta 
!= -1) { 
 229                 stream
->currentChar 
= newBuffer 
+ currCharDelta
; 
 231             if (markDelta 
!= -1) { 
 232                 stream
->mark 
= newBuffer 
+ markDelta
; 
 234             if (parserMarkDelta 
!= -1) { 
 235                 stream
->parserMark 
= newBuffer 
+ parserMarkDelta
; 
 // Decodes bytes starting at stream->currentByte into at most maxLength UniChars stored
 // at base, advancing stream->currentByte past the bytes used. The visible return
 // statements yield charsToTranslate on the Unicode path and lengthConsumed otherwise.
 // NOTE(review): listing lines 247, 251-252, 257, 259-260, 262, 269, 271-272, 275-276,
 // 283, 285-288 and 292 are missing, so some increments, breaks and braces are not visible.
 241 static CFIndex 
loadCharacters(UniChar 
*base
, CFIndex maxLength
, _CFXMLInputStream 
*stream
) { 
 242     const uint8_t *dataEnd 
= CFDataGetBytePtr(stream
->data
) + CFDataGetLength(stream
->data
); 
 // Unicode data: whole UniChar-sized units remaining, capped at maxLength.
 243     if (stream
->flags 
& (ENCODING_IS_UNICODE_NATURAL
|ENCODING_IS_UNICODE_SWAPPED
) ) { 
 244         CFIndex charsToTranslate 
= (dataEnd 
- stream
->currentByte
) / sizeof(UniChar
); 
 245         if (charsToTranslate 
> maxLength
) { 
 246             charsToTranslate 
= maxLength
; 
 // Natural byte order: the bytes are copied straight into the output.
 248         if (stream
->flags 
& ENCODING_IS_UNICODE_NATURAL
) { 
 249             memmove(base
, stream
->currentByte
, charsToTranslate 
* sizeof(UniChar
)); 
 250             stream
->currentByte 
+= (charsToTranslate 
* sizeof(UniChar
)); 
 // Swapped byte order: the two bytes of each unit are exchanged while copying.
 // NOTE(review): the advance of baseBytePtr (listing line 257) is not visible here.
 253             uint8_t *baseBytePtr 
= (uint8_t *)base
; 
 254             for (i 
= 0; i 
< charsToTranslate
; i 
++) { 
 255                 *(baseBytePtr 
+ 1) = *stream
->currentByte
; 
 256                 *baseBytePtr 
= *(stream
->currentByte 
+ 1); 
 258                 stream
->currentByte 
+= 2; 
 261         return charsToTranslate
; 
 263         CFIndex lengthConsumed 
= 0, usedByteLength
, usedCharLength
; 
 264         UInt32 conversionResult
; 
 // ASCII-compatible encodings: bytes up to 0x7f are widened directly; the loop stops at
 // the first byte above 0x7f so the converter below can handle the rest.
 // NOTE(review): the increments of base and lengthConsumed (listing lines 269, 271) are
 // not visible here - confirm.
 265         if (stream
->flags 
& ENCODING_MATCHES_ASCII
) { 
 266             while (stream
->currentByte 
< dataEnd 
&& lengthConsumed 
< maxLength
) { 
 267                 if (*stream
->currentByte 
> 0x7f) break; 
 268                 *base 
= *stream
->currentByte
; 
 270                 stream
->currentByte 
++; 
 // Done if the data or the output space was exhausted by the fast loop.
 273             if (stream
->currentByte 
== dataEnd 
|| lengthConsumed 
== maxLength
) { 
 274                 return lengthConsumed
; 
 // General case: hand the remaining bytes and the remaining output space to the converter.
 277         conversionResult 
= CFStringEncodingBytesToUnicode(stream
->encoding
, 0, stream
->currentByte
, dataEnd 
- stream
->currentByte
, &usedByteLength
, base
, maxLength
-lengthConsumed
, &usedCharLength
); 
 278         if(kCFStringEncodingConversionSuccess 
!= conversionResult
) { 
 279             switch(conversionResult
) { 
 // An unavailable converter or invalid input is recorded as ENCODING_COMPOSITION_ERROR.
 280                 case kCFStringEncodingConverterUnavailable
: 
 281                 case kCFStringEncodingInvalidInputStream
: 
 282                     stream
->flags 
|= ENCODING_COMPOSITION_ERROR
; 
 284                 case kCFStringEncodingInsufficientOutputBufferLength
: 
 // Account for whatever the converter consumed and produced.
 289         if (usedByteLength 
> 0) { 
 290             stream
->currentByte 
+= usedByteLength
; 
 291             lengthConsumed 
+= usedCharLength
; 
 293         return lengthConsumed
; 
 // Loads more characters into the unused tail of the character buffer. Returns 0 at once
 // when the buffer is already full. When characters were loaded, currentChar is set to
 // the first newly loaded character and bufferLength grows by the number loaded.
 // NOTE(review): the declaration of numFilled (listing line 299) and the final return
 // (listing line 307) are missing from this listing.
 297 // returns number of characters filled 
 298 CF_INLINE CFIndex 
fillToCapacity(_CFXMLInputStream 
*stream
) { 
 300     if (stream
->bufferLength 
>= stream
->bufferCapacity
) return 0; 
 301     // Try and fill in the remaining characters 
 302     numFilled 
= loadCharacters(stream
->charBuffer
+stream
->bufferLength
, stream
->bufferCapacity 
- stream
->bufferLength
, stream
); 
 303     if (numFilled 
!= 0) { 
 304         stream
->currentChar 
= stream
->charBuffer 
+ stream
->bufferLength
; 
 305         stream
->bufferLength 
+= numFilled
; 
 // Refills the character buffer from the byte stream. With no mark set the buffer is
 // simply overwritten from its start. With a mark set, three strategies are tried in
 // order: fill the unused tail, shift the characters from the leftmost mark down to the
 // start of the buffer and fill again, and finally grow the buffer and fill again.
 // NOTE(review): listing lines 314, 321, 323-324, 327, 335, 337, 342, 344, 347, 351-353
 // and 357-359 are missing, so some conditions and braces are not visible here.
 310 // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate.  Does not check for EOF; it is the caller's responsibility to verify this. 
 311 static void fillCharacterBuffer(_CFXMLInputStream 
*stream
) { 
 312     if (!stream
->charBuffer
) { 
 313         growCharacterBuffer(stream
); 
 315     if (!stream
->mark 
&& !stream
->parserMark
) { 
 316         // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer 
 317         CFIndex fillLength 
= stream
->bufferCapacity
-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer 
 318         stream
->bufferLength 
= loadCharacters(stream
->charBuffer
, fillLength
, stream
); 
 319         CFAssert(stream
->bufferLength 
!= 0, __kCFLogAssertion
, "CF internal error: XML parser input stream corruption"); 
 320         stream
->currentChar 
= stream
->charBuffer
; 
 322         // We do everything we can not to allocate; first we fill any remaining characters.  If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters. 
 325         // First try just filling the remaining capacity 
 326         done 
= (fillToCapacity(stream
) != 0); 
 // Pick whichever of mark / parserMark is set; when both are set, the lower address wins.
 328             const UniChar 
*leftMostMark
; 
 329             if (stream
->mark 
&& !stream
->parserMark
) { 
 330                 leftMostMark 
= stream
->mark
; 
 331             } else if (stream
->parserMark 
&& !stream
->mark
) { 
 332                 leftMostMark 
= stream
->parserMark
; 
 333             } else if (stream
->parserMark 
< stream
->mark
) { 
 334                 leftMostMark 
= stream
->parserMark
; 
 336                 leftMostMark 
= stream
->mark
; 
 // Characters before the leftmost mark are no longer needed: slide the rest down to the
 // start of the buffer and move the marks by the same distance.
 338             if (leftMostMark 
> stream
->charBuffer
) { 
 339                 CFIndex delta 
= leftMostMark 
- stream
->charBuffer
; 
 340                 memmove(stream
->charBuffer
, leftMostMark
, (stream
->bufferLength 
- delta
) * sizeof(UniChar
)); 
 341                 stream
->bufferLength 
-= delta
; 
 343                     stream
->mark 
-= delta
; 
 345                 if (stream
->parserMark
) { 
 346                     stream
->parserMark 
-= delta
; 
 348                 // Now try to fill the newly-opened space 
 349                 done 
= (fillToCapacity(stream
) != 0); 
 // NOTE(review): the value stored in delta below has no visible use, and loadCharacters
 // advances stream->currentByte without bufferLength being updated in the visible code -
 // confirm against the complete source whether this second load is intended.
 350                 delta 
= loadCharacters(stream
->charBuffer
+stream
->bufferLength
, stream
->bufferCapacity 
- stream
->bufferLength
, stream
); 
 354             // No help for it; now we must allocate 
 355             growCharacterBuffer(stream
); 
 356             fillToCapacity(stream
); // If this doesn't work, we give up. 
 // Slow path of getCharacter(). Visible behaviour: a fast path decodes one character
 // straight from the byte stream (a single byte for ASCII-compatible encodings, two bytes
 // for Unicode); otherwise the character buffer is refilled and the character is taken
 // from it.
 // NOTE(review): listing lines 365, 372, 374, 378, 380-381, 385, 387-389, 392-393, 395
 // and 399-404 are missing, so the EOF result, the advanceStream guards and the return
 // statements are not visible here.
 361 /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time.  (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version.  Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this. 
 363 static Boolean 
getCharacterGuts(_CFXMLInputStream 
*stream
, UniChar 
*ch
, Boolean advanceStream
) { 
 // All bytes already consumed (the body of this branch is not visible in the listing).
 364     if (stream
->currentByte 
- CFDataGetBytePtr(stream
->data
) >= CFDataGetLength(stream
->data
)) { 
 // Fast-path test. For ASCII-compatible encodings the byte must be below 0x7F, so 0x7F
 // itself takes the buffered path.
 366     } else if (!((stream
->mark 
|| stream
->parserMark
) && advanceStream
) && 
 367                (((stream
->flags 
& ENCODING_MATCHES_ASCII
) && *(stream
->currentByte
) < 0x7F) || 
 368                 (stream
->flags 
& (ENCODING_IS_UNICODE_NATURAL 
| ENCODING_IS_UNICODE_SWAPPED
)))) { 
 369         // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character. 
 370         if (stream
->flags 
& ENCODING_MATCHES_ASCII
) { 
 371             *ch 
= (UniChar
)*(stream
->currentByte
); 
 // Deeper indent: the advance appears to be conditional (listing line 372 missing).
 373                 stream
->currentByte 
++; 
 // Natural order: the first byte becomes the high byte, the second the low byte.
 375         } else if (stream
->flags 
& ENCODING_IS_UNICODE_NATURAL
) { 
 376             *ch 
= (*stream
->currentByte
) << 8; 
 377             *ch 
+= *(stream
->currentByte 
+ 1); 
 379                 stream
->currentByte 
+= 2; 
 382             // Unicode with swapped bytes 
 383             *ch 
= (*(stream
->currentByte 
+ 1)) << 8; 
 384             *ch 
+= *stream
->currentByte
; 
 386                 stream
->currentByte 
+= 2; 
 390         fillCharacterBuffer(stream
); // this takes into account markIsSet to make sure and do the right thing 
 391         if (!stream
->charBuffer 
|| !stream
->currentChar
) { 
 394             *ch 
= *(stream
->currentChar
); 
 // currentChar is reset to NULL once it reaches the end of the buffered characters.
 396                 stream
->currentChar 
++; 
 397                 if (stream
->currentChar 
== stream
->charBuffer 
+ stream
->bufferLength
) { 
 398                     stream
->currentChar 
= NULL
; 
 // Fetches the next character into *ch. A pending buffered character (currentChar) is
 // served directly; otherwise getCharacterGuts() does the work and its failure is
 // propagated as false. Afterwards charIndex is advanced and lineNum is incremented on
 // '\n', or on '\r' that is not followed by '\n'.
 // NOTE(review): listing lines 407, 410, 413, 417-419 and 421-423 are missing; the
 // result for a closed stream and the conditions guarding the advance and the line
 // accounting are not visible here.
 406 /* See comments above getCharacterGuts() 
 408 CF_INLINE Boolean 
getCharacter(_CFXMLInputStream 
*stream
, UniChar 
*ch
, Boolean advanceStream
) { 
 409     if (!(stream
->flags 
& STREAM_OPEN
)) { 
 411     } else if (stream
->currentChar
) { 
 412         *ch 
= *stream
->currentChar
; 
 // currentChar is reset to NULL once it reaches the end of the buffered characters.
 414             stream
->currentChar 
++; 
 415             if (stream
->currentChar 
== stream
->charBuffer 
+ stream
->bufferLength
) { 
 416                 stream
->currentChar 
= NULL
; 
 420         if (!getCharacterGuts(stream
, ch
, advanceStream
)) return false; 
 424         stream
->charIndex 
++; 
 // A CR LF pair counts as one line break: the CR is skipped here and the LF is counted.
 425         if ((*ch 
== '\n') || ((*ch 
== '\r') && (!_inputStreamPeekCharacter(stream
, &nextChar
) || nextChar 
!= '\n'))) stream
->lineNum 
++; 
 430 __private_extern__ Boolean 
_inputStreamPeekCharacter(_CFXMLInputStream 
*stream
, UniChar 
*ch
) { 
 431     return getCharacter(stream
, ch
, false); 
 434 __private_extern__ Boolean 
_inputStreamGetCharacter(_CFXMLInputStream 
*stream
, UniChar 
*ch
) { 
 435     return getCharacter(stream
, ch
, true); 
 // Pushes the character ch back onto the stream so that it is the next one read, and
 // steps charIndex back by one. The branches cover, in order: a pending buffered
 // character, a mark with buffered characters, an unused buffer, and no buffer at all.
 // NOTE(review): listing lines 440, 443, 446-448, 450, 454, 458, 462-464, 467-468,
 // 477-479, 482, 489-492 and 495-498 are missing; the result for a closed stream, the
 // line-number decrement and the return statements are not visible here.
 438 __private_extern__ Boolean 
_inputStreamReturnCharacter(_CFXMLInputStream 
*stream
, UniChar ch
) { 
 439     Boolean decrementLineNum 
= false; 
 441         decrementLineNum 
= true; 
 // A returned CR only undoes a line count when it is not followed by LF.
 442     } else if (ch 
== '\r') { 
 444         if (!_inputStreamPeekCharacter(stream
, &nextChar
) || nextChar 
!= '\n') { 
 445             decrementLineNum 
= true; 
 449     if (!(stream
->flags 
& STREAM_OPEN
)) { 
 451     } else if (stream
->currentChar
) { 
 // Common case: step back one position inside the buffer.
 452         if (stream
->currentChar 
!= stream
->charBuffer
) { 
 453             stream
->currentChar 
--; 
 455             // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer 
 456             if (stream
->bufferLength 
>= stream
->bufferCapacity
) { 
 457                 growCharacterBuffer(stream
); 
 // Shift everything right by one, put ch at the front, and move parserMark with it.
 459             memmove(stream
->charBuffer 
+ 1, stream
->charBuffer
, stream
->bufferLength 
* sizeof(UniChar
)); 
 460             *stream
->charBuffer 
= ch
; 
 461             stream
->bufferLength 
++; 
 465             if (stream
->parserMark
) { 
 466                 stream
->parserMark 
++; 
 469     } else if ((stream
->mark 
|| stream
->parserMark
) && stream
->bufferLength
) { 
 470         // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data.  That last character is the one being returned. 
 471         stream
->currentChar 
= stream
->charBuffer 
+ stream
->bufferLength 
- 1; 
 472     } else if (stream
->charBuffer
) { 
 473         // We have processed all the meaningful characters from charBuffer and have no reason to preserve them.  We use charBuffer to hold this one character that has been returned to us. 
 474         *stream
->charBuffer 
= ch
; 
 475         stream
->currentChar 
= stream
->charBuffer
; 
 476         stream
->bufferLength 
= 1; 
 480         if (stream
->parserMark
) { 
 481             stream
->parserMark 
++; 
 483     } else if (stream
->currentByte 
> CFDataGetBytePtr(stream
->data
)) { 
 484         // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character.  The former means we can just back up the byte pointer; the latter means Bad Things have happened. 
 // One byte back for ASCII-compatible encodings, two bytes back otherwise.
 485         if (stream
->flags 
& ENCODING_MATCHES_ASCII
) { 
 486             stream
->currentByte 
--; 
 487         } else {  // Must be Unicode 
 488             stream
->currentByte 
-= 2; 
 493     stream
->charIndex 
--; 
 494     if (decrementLineNum
) { 
 500 // Returns the pointer to hold as the mark 
 501 static UniChar 
*dropMark(_CFXMLInputStream 
*stream
) { 
 502     if (stream
->currentChar
) { 
 503         return stream
->currentChar
; 
 504     } else if (stream
->mark 
|| stream
->parserMark
) { 
 505         return stream
->charBuffer 
+ stream
->bufferLength
; 
 507         if (!stream
->charBuffer
) { 
 508             growCharacterBuffer(stream
); 
 510         stream
->bufferLength 
= 0; // This will be sufficient to force a fetch into the buffer when the next character is requested 
 511         return stream
->charBuffer
; 
 516 __private_extern__ 
void _inputStreamSetMark(_CFXMLInputStream 
*stream
) { 
 517     CFAssert(stream
->mark 
== NULL
, __kCFLogAssertion
, "CF internal error: parser input stream malformed"); 
 518     stream
->mark 
= dropMark(stream
); 
 // Asserts that a mark is currently set on the stream.
 // NOTE(review): listing line 523 is missing; presumably it resets stream->mark - confirm.
 521 __private_extern__ 
void _inputStreamClearMark(_CFXMLInputStream 
*stream
) { 
 522     CFAssert(stream
->mark 
!= NULL
, __kCFLogAssertion
, "CF internal error: parser input stream malformed"); 
 526 __private_extern__ 
void _inputStreamGetCharactersFromMark(_CFXMLInputStream 
*stream
, CFMutableStringRef string
) { 
 527     UniChar 
*end 
= stream
->currentChar 
? stream
->currentChar 
: stream
->charBuffer 
+ stream
->bufferLength
; 
 528     CFIndex numChars 
= end 
- stream
->mark
; 
 529     CFAssert(stream
->mark
, __kCFLogAssertion
, "CF internal error: malformed XML input stream"); 
 530     _fillStringWithCharacters(string
, stream
->mark
, numChars
); 
 // Rewinds the stream to `mark`: charIndex is reduced by the number of characters between
 // mark and the current position, and currentChar is set to mark. The remaining visible
 // code walks back from the old position looking at '\r' and '\n' characters.
 // NOTE(review): listing lines 535, 539, 542, 544-546, 548-550, 552 and 554-559 are
 // missing; the line-number updates themselves are not visible here.
 533 static void restoreToMark(_CFXMLInputStream 
*stream
, UniChar 
*mark
) { 
 // The current position is the pending character, or the end of the buffered characters.
 534     UniChar 
*end 
= stream
->currentChar 
? stream
->currentChar 
: stream
->charBuffer 
+ stream
->bufferLength
; 
 536         CFIndex numChars 
= end 
- mark
; 
 537         stream
->charIndex 
-= numChars
; 
 538         stream
->currentChar 
= mark
; 
 540         // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF 
 541         if (*(end 
- 1) == '\r') { 
 543             if (_inputStreamPeekCharacter(stream
, &nextChar
) && nextChar 
== '\n') { 
 547         while (end 
!= mark
) { 
 551             } else if (*end 
== '\n') { 
 553                 if (end 
!= mark 
&& *(end 
- 1) == '\r') { 
 561 __private_extern__ 
void _inputStreamBackUpToMark(_CFXMLInputStream 
*stream
) { 
 562     CFAssert(stream
->mark 
!= NULL 
|| stream
->charBuffer 
== NULL
, __kCFLogAssertion
, "CF internal error: malformed XML input stream"); 
 563     restoreToMark(stream
, stream
->mark
); 
 566 CF_INLINE Boolean 
isWhitespaceChar(UniChar ch
) { 
 567     return (ch 
== '\n' || ch 
== '\r' || ch 
== ' ' || ch 
== '\t'); 
 // Consumes consecutive whitespace characters from the stream. The first non-whitespace
 // character read is pushed back. The visible code also copies the skipped characters
 // into str from parserMark and then clears parserMark.
 // NOTE(review): listing lines 571-573, 575, 577-578, 581-582 and 585-587 are missing;
 // the conditions around the parserMark handling, the computation of len and the return
 // value are not visible here.
 570 __private_extern__ CFIndex 
_inputStreamSkipWhitespace(_CFXMLInputStream 
*stream
, CFMutableStringRef str
) { 
 // Deeper indent: the parser mark appears to be dropped only under a condition not shown.
 574         stream
->parserMark 
= dropMark(stream
); 
 576     while (getCharacter(stream
, &ch
, true) && isWhitespaceChar(ch
)) { 
 // The loop also ends at end of input; only a non-whitespace character is returned.
 579     if (!isWhitespaceChar(ch
)) { 
 580         _inputStreamReturnCharacter(stream
, ch
); 
 583         _fillStringWithCharacters(str
, stream
->parserMark
, len
); 
 584         stream
->parserMark 
= NULL
; 
 // Scans forward for the sequence scanChars[0..numChars-1]. A parser mark is dropped at
 // the start. Characters are read until one equals scanChars[0], then the remaining
 // characters are compared one by one. On a partial match, characters are pushed back so
 // scanning can resume. If input ends before scanChars[0] is found, the stream is
 // restored to the mark.
 // NOTE(review): listing lines 593, 595-596, 598-599, 603-605, 609-612, 614, 617-619,
 // 621, 624, 626-631, 633 and 635-636 are missing; loop structure, the computation of
 // len and the return statements are not visible here.
 589 // false return means EOF was encountered without finding scanChars 
 590 __private_extern__ Boolean 
_inputStreamScanToCharacters(_CFXMLInputStream 
*stream
, const UniChar 
*scanChars
, CFIndex numChars
, CFMutableStringRef str
) { 
 591     Boolean done 
= false; 
 // -1 means the first repeat of scanChars[0] within scanChars has not been searched for yet.
 592     CFIndex firstRepeatIndex 
= -1; 
 594     stream
->parserMark 
= dropMark(stream
); 
 597         while (_inputStreamGetCharacter(stream
, &ch
) && ch 
!= scanChars
[0]) { 
 // Input ended without meeting scanChars[0]: rewind to where the scan began.
 600         if (ch 
!= scanChars
[0]) { 
 601             restoreToMark(stream
, stream
->parserMark
); 
 602             stream
->parserMark 
= NULL
; 
 // Compare the rest of the sequence; stop at end of input or at the first mismatch.
 606             for (i 
= 1; i 
< numChars
; i 
++) { 
 607                 if (!_inputStreamGetCharacter(stream
, &ch
)) break; 
 608                 if (ch 
!= scanChars
[i
]) break; 
 // Computed once: the first index j >= 1 at which scanChars[0] occurs again in scanChars.
 613                 if (firstRepeatIndex 
== -1) { 
 615                     for (j 
= 1; j 
< numChars
; j 
++) { 
 616                         if (scanChars
[0] == scanChars
[j
]) { 
 620                     firstRepeatIndex 
= j
; 
 // Push back the mismatching character, then matched characters down to firstRepeatIndex.
 622                 _inputStreamReturnCharacter(stream
, ch
); 
 623                 while (i 
> firstRepeatIndex
) { 
 625                     _inputStreamReturnCharacter(stream
, scanChars
[i
]); 
 632         _fillStringWithCharacters(str
, stream
->parserMark
, len
); 
 634     stream
->parserMark 
= NULL
; 
 // Tries to match the next `length` characters of the stream against stringToMatch.
 // A parser mark is dropped first; characters are read and compared one at a time,
 // stopping at end of input or at the first mismatch. One visible path restores the
 // stream to the mark; both visible paths clear parserMark.
 // NOTE(review): listing lines 642-643, 646-648, 651-652 and 654-656 are missing; the
 // loop header, the success test and the return statements are not visible here.
 638 __private_extern__ Boolean 
_inputStreamMatchString(_CFXMLInputStream 
*stream
, const UniChar 
*stringToMatch
, CFIndex length
) { 
 639     const UniChar 
*end 
= stringToMatch
+length
; 
 640     const UniChar 
*sPtr
=stringToMatch
; 
 641     stream
->parserMark 
= dropMark(stream
); 
 644         if (!_inputStreamGetCharacter(stream
, &ch
)) break; 
 645         if (ch 
!= *sPtr
) break; 
 649         restoreToMark(stream
, stream
->parserMark
); 
 650         stream
->parserMark 
= NULL
; 
 653         stream
->parserMark 
= NULL
; 
 // Scans a quoted string. Returns false when no character is available or when the next
 // character is neither a single nor a double quote. Otherwise the quote is consumed and
 // the stream is scanned up to the same quote character, with the scanned text going
 // into str.
 // NOTE(review): listing lines 659, 662 and 665-668 are missing; the handling of a
 // failed scan and the final return are not visible here.
 658 __private_extern__ Boolean 
_inputStreamScanQuotedString(_CFXMLInputStream 
*stream
, CFMutableStringRef str
) { 
 660     if (!_inputStreamPeekCharacter(stream
, &ch
)) return false; 
 661     if (ch 
!= '\'' && ch 
!= '\"')  return false; 
 // Consume the opening quote; ch then serves as the one-character scan target.
 663     _inputStreamGetCharacter(stream
, &ch
); 
 664     if (!_inputStreamScanToCharacters(stream
, &ch
, 1, str
)) { 
 671  [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender 
 672  [5]  Name ::= (Letter | '_' | ':') (NameChar)* 
 673  [7]  Nmtoken ::= (NameChar)+ 
 674  [84] Letter ::= BaseChar | Ideographic 
 676  We don't do this quite right; we rely on the Unicode charsets to do this analysis.  While 
 677  the productions in the XML spec are based on the Unicode character sets, the definitions 
 678  differ slightly to avoid those areas where the Unicode standard is still being resolved. 
 679  At any rate, I'd lay money that using the Unicode charsets, we will be more correct than 
 680  the vast majority of parsers out there. 
 682  Letter == kCFUniCharLetterCharacterSet 
 683  Digit == kCFUniCharDecimalDigitCharacterSet 
 684  CombiningChar == kCFUniCharNonBaseCharacterSet 
 685  Extender - complex, and not represented by a uniform character set. 
 687 __private_extern__ Boolean 
_inputStreamScanXMLName(_CFXMLInputStream 
*stream
, Boolean isNMToken
, CFStringRef 
*str
) { 
 689     Boolean success 
= true; 
 690     stream
->parserMark 
= dropMark(stream
); 
 692         // Only difference between an NMToken and a Name is Names have a stricter condition on the first character 
 693         if (!getCharacter(stream
, &ch
, false)) { 
 695         } else if (!CFUniCharIsMemberOf(ch
, kCFUniCharLetterCharacterSet
) && ch 
!= '_' && ch 
!= ':') { 
 698             getCharacter(stream
, &ch
, true); 
 702         while (getCharacter(stream
, &ch
, true)) { 
 703             if (!CFUniCharIsMemberOf(ch
, kCFUniCharLetterCharacterSet
) && !CFUniCharIsMemberOf(ch
, kCFUniCharDecimalDigitCharacterSet
)  && ch 
!= '.' && ch 
!= '-' && ch 
!= '_' && ch 
!= ':' && !CFUniCharIsMemberOf(ch
, kCFUniCharNonBaseCharacterSet
)) { 
 704                 _inputStreamReturnCharacter(stream
, ch
); 
 708         if (stream
->currentChar 
== stream
->parserMark
) { 
 709             success 
= false; // Must have processed at least one character 
 714             if (!stream
->nameSet
) { 
 715                 stream
->nameSet 
= CFSetCreateMutable(stream
->allocator
, 0, &kCFTypeSetCallBacks
); 
 716                 stream
->tempString 
= CFStringCreateMutableWithExternalCharactersNoCopy(stream
->allocator
, NULL
, 0, 0, kCFAllocatorNull
); 
 718             CFStringSetExternalCharactersNoCopy(stream
->tempString
, stream
->parserMark
, stream
->currentChar
-stream
->parserMark
, stream
->currentChar
-stream
->parserMark
); 
 719             if (!CFSetGetValueIfPresent(stream
->nameSet
, stream
->tempString
, (const void **)str
)) { 
 720                 *str 
= CFStringCreateCopy(stream
->allocator
, stream
->tempString
); 
 721                 CFSetAddValue(stream
->nameSet
, *str
); 
 726         restoreToMark(stream
, stream
->parserMark
); 
 728     stream
->parserMark 
= NULL
;