2 * Copyright (c) 2008 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Chris Parker
28 #include "CFXMLInputStream.h"
29 #include <CoreFoundation/CFCharacterSet.h>
31 #include "CFStringEncodingConverter.h"
32 #include "CFUniChar.h"
34 /* Utility functions used in parsing */
35 static Boolean
determineEncoding(_CFXMLInputStream
*stream
) {
36 const uint8_t *bytes
= (uint8_t *)CFDataGetBytePtr(stream
->data
);
37 UInt32 length
= CFDataGetLength(stream
->data
);
38 const uint8_t *idx
= 0L, *end
= 0L;
39 const uint8_t *base
= 0L;
41 Boolean useUTF8
= false;
43 // Check for the byte order mark first
45 // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
46 if ((*bytes
== 0xFF && *(bytes
+1) == 0xFE) ||*(bytes
+1) == 0x00) {
48 stream
->flags
|= ENCODING_IS_UNICODE_SWAPPED
;
50 stream
->flags
|= ENCODING_IS_UNICODE_NATURAL
;
53 stream
->currentByte
= bytes
+ 2;
55 stream
->encoding
= kCFStringEncodingUnicode
;
57 } else if ((*bytes
== 0xFE && *(bytes
+1) == 0xFF) || *bytes
== 0x00) {
59 stream
->flags
|= ENCODING_IS_UNICODE_NATURAL
;
61 stream
->flags
|= ENCODING_IS_UNICODE_SWAPPED
;
64 stream
->currentByte
= bytes
+ 2;
66 stream
->encoding
= kCFStringEncodingUnicode
;
68 } else if(*bytes
== 0xEF && *(bytes
+1) == 0xBB && *(bytes
+2) == 0xBF) {
70 stream
->currentByte
= bytes
+ 3;
72 stream
->encoding
= kCFStringEncodingUTF8
;
73 stream
->flags
|= ENCODING_MATCHES_ASCII
;
77 // Scan for the <?xml.... ?> opening
78 if (length
< 5 || strncmp((char const *) bytes
, "<?xml", 5) != 0) {
84 // Found "<?xml"; now we scan for "encoding"
88 if ( ch
== '?' || ch
== '>') {
94 if (ch
== 'e' && *scan
++ == 'n' && *scan
++ == 'c' && *scan
++ == 'o' && *scan
++ == 'd' && *scan
++ == 'i' && *scan
++ == 'n' && *scan
++ == 'g' && *scan
++ == '=') {
99 if (!useUTF8
&& idx
>= end
) {
104 // Found "encoding="; see if we've got an honest-to-goodness encoding name
106 if (quote
!= '\'' && quote
!= '\"') {
111 base
= idx
+ 1; // Move past the quote character
113 while (idx
< end
&& *idx
!= quote
) idx
++;
119 UInt32 len
= idx
- base
;
120 if (len
== 5 && (*base
== 'u' || *base
== 'U') && (base
[1] == 't' || base
[1] == 'T') && (base
[2] == 'f' || base
[2] == 'F') && (base
[3] == '-') && (base
[4] == '8')) {
123 CFStringRef encodingName
= CFStringCreateWithBytes(stream
->allocator
, base
, len
, kCFStringEncodingISOLatin1
, false);
124 stream
->encoding
= CFStringConvertIANACharSetNameToEncoding(encodingName
);
125 CFRelease(encodingName
);
129 stream
->encoding
= kCFStringEncodingUTF8
;
130 stream
->flags
|= ENCODING_MATCHES_ASCII
;
132 } else if (stream
->encoding
== kCFStringEncodingInvalidId
) {
134 } else if (__CFStringEncodingIsSupersetOfASCII(stream
->encoding
)) {
135 stream
->flags
|= ENCODING_MATCHES_ASCII
;
140 CF_INLINE
void _fillStringWithCharacters(CFMutableStringRef string
, UniChar
*characters
, CFIndex numChars
) {
141 CFStringDelete(string
, CFRangeMake(0, CFStringGetLength(string
)));
143 CFStringAppendCharacters(string
, characters
, numChars
);
147 __private_extern__ Boolean
_openInputStream(_CFXMLInputStream
*stream
) {
148 if (NULL
== stream
->data
) {
151 stream
->currentByte
= CFDataGetBytePtr(stream
->data
);
152 if (determineEncoding(stream
)) {
153 stream
->flags
|= STREAM_OPEN
;
161 __private_extern__
void _initializeInputStream(_CFXMLInputStream
*stream
, CFAllocatorRef alloc
, CFURLRef dataSource
, CFDataRef xmlData
) {
162 stream
->data
= xmlData
? (CFDataRef
)CFRetain(xmlData
) : NULL
;
163 stream
->url
= dataSource
? (CFURLRef
)CFRetain(dataSource
) : NULL
;
164 stream
->encoding
= kCFStringEncodingInvalidId
;
165 stream
->currentByte
= NULL
;
167 stream
->allocator
= (CFAllocatorRef
)CFRetain(alloc
);
168 stream
->charBuffer
= NULL
;
169 stream
->currentChar
= NULL
;
171 stream
->parserMark
= NULL
;
172 stream
->bufferLength
= 0;
173 stream
->bufferCapacity
= 0;
175 stream
->charIndex
= 1;
179 stream
->nameSet
= NULL
;
180 stream
->tempString
= NULL
;
184 __private_extern__
void _freeInputStream(_CFXMLInputStream
*stream
) {
185 if (stream
->data
) CFRelease(stream
->data
);
186 if (stream
->url
) CFRelease(stream
->url
);
187 if (stream
->charBuffer
) CFAllocatorDeallocate(stream
->allocator
, stream
->charBuffer
);
188 if (stream
->nameSet
) CFRelease(stream
->nameSet
);
189 if (stream
->tempString
) CFRelease(stream
->tempString
);
190 CFRelease(stream
->allocator
);
193 __private_extern__ CFStringEncoding
_inputStreamGetEncoding(_CFXMLInputStream
*stream
) {
194 return stream
->encoding
;
197 __private_extern__ CFIndex
_inputStreamCurrentLocation(_CFXMLInputStream
*stream
) {
198 return stream
->charIndex
;
201 __private_extern__ CFIndex
_inputStreamCurrentLine(_CFXMLInputStream
*stream
) {
202 return stream
->lineNum
;
205 __private_extern__ Boolean
_inputStreamAtEOF(_CFXMLInputStream
*stream
) {
206 if (!(stream
->flags
& STREAM_OPEN
)) return false;
207 if (stream
->currentChar
) return false;
208 if (stream
->currentByte
- CFDataGetBytePtr(stream
->data
) < CFDataGetLength(stream
->data
)) return false;
212 __private_extern__ Boolean
_inputStreamComposingErrorOccurred(_CFXMLInputStream
*stream
) {
213 return stream
->flags
& ENCODING_COMPOSITION_ERROR
;
216 #define INITIAL_BUFFER_SIZE 64
217 static void growCharacterBuffer(_CFXMLInputStream
*stream
) {
218 if (!stream
->charBuffer
) {
219 stream
->charBuffer
= (UniChar
*)CFAllocatorAllocate(stream
->allocator
, INITIAL_BUFFER_SIZE
*sizeof(UniChar
), 0);
220 stream
->bufferCapacity
= INITIAL_BUFFER_SIZE
;
222 CFIndex currCharDelta
= stream
->currentChar
? stream
->currentChar
- stream
->charBuffer
: -1;
223 CFIndex markDelta
= stream
->mark
? stream
->mark
- stream
->charBuffer
: -1;
224 CFIndex parserMarkDelta
= stream
->parserMark
? stream
->parserMark
- stream
->charBuffer
: -1;
225 UniChar
*newBuffer
= (UniChar
*)CFAllocatorReallocate(stream
->allocator
, stream
->charBuffer
, stream
->bufferCapacity
* 2 * sizeof(UniChar
), 0);
226 stream
->bufferCapacity
*= 2;
227 if (newBuffer
!= stream
->charBuffer
) {
228 stream
->charBuffer
= newBuffer
;
229 if (currCharDelta
!= -1) {
230 stream
->currentChar
= newBuffer
+ currCharDelta
;
232 if (markDelta
!= -1) {
233 stream
->mark
= newBuffer
+ markDelta
;
235 if (parserMarkDelta
!= -1) {
236 stream
->parserMark
= newBuffer
+ parserMarkDelta
;
242 static CFIndex
loadCharacters(UniChar
*base
, CFIndex maxLength
, _CFXMLInputStream
*stream
) {
243 const uint8_t *dataEnd
= CFDataGetBytePtr(stream
->data
) + CFDataGetLength(stream
->data
);
244 if (stream
->flags
& (ENCODING_IS_UNICODE_NATURAL
|ENCODING_IS_UNICODE_SWAPPED
) ) {
245 CFIndex charsToTranslate
= (dataEnd
- stream
->currentByte
) / sizeof(UniChar
);
246 if (charsToTranslate
> maxLength
) {
247 charsToTranslate
= maxLength
;
249 if (stream
->flags
& ENCODING_IS_UNICODE_NATURAL
) {
250 memmove(base
, stream
->currentByte
, charsToTranslate
* sizeof(UniChar
));
251 stream
->currentByte
+= (charsToTranslate
* sizeof(UniChar
));
254 uint8_t *baseBytePtr
= (uint8_t *)base
;
255 for (i
= 0; i
< charsToTranslate
; i
++) {
256 *(baseBytePtr
+ 1) = *stream
->currentByte
;
257 *baseBytePtr
= *(stream
->currentByte
+ 1);
259 stream
->currentByte
+= 2;
262 return charsToTranslate
;
264 CFIndex lengthConsumed
= 0;
265 CFIndex usedByteLength
, usedCharLength
;
266 UInt32 conversionResult
;
267 if (stream
->flags
& ENCODING_MATCHES_ASCII
) {
268 while (stream
->currentByte
< dataEnd
&& lengthConsumed
< maxLength
) {
269 if (*stream
->currentByte
> 0x7f) break;
270 *base
= *stream
->currentByte
;
272 stream
->currentByte
++;
275 if (stream
->currentByte
== dataEnd
|| lengthConsumed
== maxLength
) {
276 return lengthConsumed
;
279 conversionResult
= CFStringEncodingBytesToUnicode(stream
->encoding
, 0, stream
->currentByte
, dataEnd
- stream
->currentByte
, &usedByteLength
, base
, maxLength
-lengthConsumed
, &usedCharLength
);
280 if(kCFStringEncodingConversionSuccess
!= conversionResult
) {
281 switch(conversionResult
) {
282 case kCFStringEncodingConverterUnavailable
:
283 case kCFStringEncodingInvalidInputStream
:
284 stream
->flags
|= ENCODING_COMPOSITION_ERROR
;
286 case kCFStringEncodingInsufficientOutputBufferLength
:
291 if (usedByteLength
> 0) {
292 stream
->currentByte
+= usedByteLength
;
293 lengthConsumed
+= usedCharLength
;
295 return lengthConsumed
;
299 // returns number of characters filled
300 CF_INLINE CFIndex
fillToCapacity(_CFXMLInputStream
*stream
) {
302 if (stream
->bufferLength
>= stream
->bufferCapacity
) return 0;
303 // Try and fill in the remaining characters
304 numFilled
= loadCharacters(stream
->charBuffer
+stream
->bufferLength
, stream
->bufferCapacity
- stream
->bufferLength
, stream
);
305 if (numFilled
!= 0) {
306 stream
->currentChar
= stream
->charBuffer
+ stream
->bufferLength
;
307 stream
->bufferLength
+= numFilled
;
312 // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate. Does not check for EOF; it is the caller's responsibility to verify this.
313 static void fillCharacterBuffer(_CFXMLInputStream
*stream
) {
314 if (!stream
->charBuffer
) {
315 growCharacterBuffer(stream
);
317 if (!stream
->mark
&& !stream
->parserMark
) {
318 // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
319 CFIndex fillLength
= stream
->bufferCapacity
-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
320 stream
->bufferLength
= loadCharacters(stream
->charBuffer
, fillLength
, stream
);
321 CFAssert(stream
->bufferLength
!= 0, __kCFLogAssertion
, "CF internal error: XML parser input stream corruption");
322 stream
->currentChar
= stream
->charBuffer
;
324 // We do everything we can not to allocate; first we fill any remaining characters. If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
327 // First try just filling the remaining capacity
328 done
= (fillToCapacity(stream
) != 0);
330 const UniChar
*leftMostMark
;
331 if (stream
->mark
&& !stream
->parserMark
) {
332 leftMostMark
= stream
->mark
;
333 } else if (stream
->parserMark
&& !stream
->mark
) {
334 leftMostMark
= stream
->parserMark
;
335 } else if (stream
->parserMark
< stream
->mark
) {
336 leftMostMark
= stream
->parserMark
;
338 leftMostMark
= stream
->mark
;
340 if (leftMostMark
> stream
->charBuffer
) {
341 CFIndex delta
= leftMostMark
- stream
->charBuffer
;
342 memmove(stream
->charBuffer
, leftMostMark
, (stream
->bufferLength
- delta
) * sizeof(UniChar
));
343 stream
->bufferLength
-= delta
;
345 stream
->mark
-= delta
;
347 if (stream
->parserMark
) {
348 stream
->parserMark
-= delta
;
350 // Now try to fill the newly-opened space
351 done
= (fillToCapacity(stream
) != 0);
352 delta
= loadCharacters(stream
->charBuffer
+ stream
->bufferLength
, stream
->bufferCapacity
- stream
->bufferLength
, stream
);
356 // No help for it; now we must allocate
357 growCharacterBuffer(stream
);
358 fillToCapacity(stream
); // If this doesn't work, we give up.
363 /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time. (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version. Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
365 static Boolean
getCharacterGuts(_CFXMLInputStream
*stream
, UniChar
*ch
, Boolean advanceStream
) {
366 if (stream
->currentByte
- CFDataGetBytePtr(stream
->data
) >= CFDataGetLength(stream
->data
)) {
368 } else if (!((stream
->mark
|| stream
->parserMark
) && advanceStream
) &&
369 (((stream
->flags
& ENCODING_MATCHES_ASCII
) && *(stream
->currentByte
) < 0x7F) ||
370 (stream
->flags
& (ENCODING_IS_UNICODE_NATURAL
| ENCODING_IS_UNICODE_SWAPPED
)))) {
371 // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
372 if (stream
->flags
& ENCODING_MATCHES_ASCII
) {
373 *ch
= (UniChar
)*(stream
->currentByte
);
375 stream
->currentByte
++;
377 } else if (stream
->flags
& ENCODING_IS_UNICODE_NATURAL
) {
378 *ch
= *(UniChar
*)(stream
->currentByte
);
380 stream
->currentByte
+= 2;
383 // Unicode with swapped bytes
384 *ch
= CFSwapInt16(*(UniChar
*)(stream
->currentByte
));
386 stream
->currentByte
+= 2;
390 fillCharacterBuffer(stream
); // this takes into account markIsSet to make sure and do the right thing
391 if (!stream
->charBuffer
|| !stream
->currentChar
) {
394 *ch
= *(stream
->currentChar
);
396 stream
->currentChar
++;
397 if (stream
->currentChar
== stream
->charBuffer
+ stream
->bufferLength
) {
398 stream
->currentChar
= NULL
;
406 /* See comments above getCharacterGuts()
408 CF_INLINE Boolean
getCharacter(_CFXMLInputStream
*stream
, UniChar
*ch
, Boolean advanceStream
) {
409 if (!(stream
->flags
& STREAM_OPEN
)) {
411 } else if (stream
->currentChar
) {
412 *ch
= *stream
->currentChar
;
414 stream
->currentChar
++;
415 if (stream
->currentChar
== stream
->charBuffer
+ stream
->bufferLength
) {
416 stream
->currentChar
= NULL
;
420 if (!getCharacterGuts(stream
, ch
, advanceStream
)) return false;
424 stream
->charIndex
++;
425 if ((*ch
== '\n') || ((*ch
== '\r') && (!_inputStreamPeekCharacter(stream
, &nextChar
) || nextChar
!= '\n'))) stream
->lineNum
++;
430 __private_extern__ Boolean
_inputStreamPeekCharacter(_CFXMLInputStream
*stream
, UniChar
*ch
) {
431 return getCharacter(stream
, ch
, false);
434 __private_extern__ Boolean
_inputStreamGetCharacter(_CFXMLInputStream
*stream
, UniChar
*ch
) {
435 return getCharacter(stream
, ch
, true);
438 __private_extern__ Boolean
_inputStreamReturnCharacter(_CFXMLInputStream
*stream
, UniChar ch
) {
439 Boolean decrementLineNum
= false;
441 decrementLineNum
= true;
442 } else if (ch
== '\r') {
444 if (!_inputStreamPeekCharacter(stream
, &nextChar
) || nextChar
!= '\n') {
445 decrementLineNum
= true;
449 if (!(stream
->flags
& STREAM_OPEN
)) {
451 } else if (stream
->currentChar
) {
452 if (stream
->currentChar
!= stream
->charBuffer
) {
453 stream
->currentChar
--;
455 // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
456 if (stream
->bufferLength
>= stream
->bufferCapacity
) {
457 growCharacterBuffer(stream
);
459 memmove(stream
->charBuffer
+ 1, stream
->charBuffer
, stream
->bufferLength
* sizeof(UniChar
));
460 *stream
->charBuffer
= ch
;
461 stream
->bufferLength
++;
465 if (stream
->parserMark
) {
466 stream
->parserMark
++;
469 } else if ((stream
->mark
|| stream
->parserMark
) && stream
->bufferLength
) {
470 // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data. That last character is the one being returned.
471 stream
->currentChar
= stream
->charBuffer
+ stream
->bufferLength
- 1;
472 } else if (stream
->charBuffer
) {
473 // We have processed all the meaningful characters from charBuffer and have no reason to preserve them. We use charBuffer to hold this one character that has been returned to us.
474 *stream
->charBuffer
= ch
;
475 stream
->currentChar
= stream
->charBuffer
;
476 stream
->bufferLength
= 1;
480 if (stream
->parserMark
) {
481 stream
->parserMark
++;
483 } else if (stream
->currentByte
> CFDataGetBytePtr(stream
->data
)) {
484 // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character. The former means we can just back up the byte pointer; the latter means Bad Things have happened.
485 if (stream
->flags
& ENCODING_MATCHES_ASCII
) {
486 stream
->currentByte
--;
487 } else { // Must be Unicode
488 stream
->currentByte
-= 2;
493 stream
->charIndex
--;
494 if (decrementLineNum
) {
500 // Returns the pointer to hold as the mark
501 static UniChar
*dropMark(_CFXMLInputStream
*stream
) {
502 if (stream
->currentChar
) {
503 return stream
->currentChar
;
504 } else if (stream
->mark
|| stream
->parserMark
) {
505 return stream
->charBuffer
+ stream
->bufferLength
;
507 if (!stream
->charBuffer
) {
508 growCharacterBuffer(stream
);
510 stream
->bufferLength
= 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
511 return stream
->charBuffer
;
516 __private_extern__
void _inputStreamSetMark(_CFXMLInputStream
*stream
) {
517 CFAssert(stream
->mark
== NULL
, __kCFLogAssertion
, "CF internal error: parser input stream malformed");
518 stream
->mark
= dropMark(stream
);
521 __private_extern__
void _inputStreamClearMark(_CFXMLInputStream
*stream
) {
522 CFAssert(stream
->mark
!= NULL
, __kCFLogAssertion
, "CF internal error: parser input stream malformed");
526 __private_extern__
void _inputStreamGetCharactersFromMark(_CFXMLInputStream
*stream
, CFMutableStringRef string
) {
527 UniChar
*end
= stream
->currentChar
? stream
->currentChar
: stream
->charBuffer
+ stream
->bufferLength
;
528 CFIndex numChars
= end
- stream
->mark
;
529 CFAssert(stream
->mark
, __kCFLogAssertion
, "CF internal error: malformed XML input stream");
530 _fillStringWithCharacters(string
, stream
->mark
, numChars
);
533 static void restoreToMark(_CFXMLInputStream
*stream
, UniChar
*mark
) {
534 UniChar
*end
= stream
->currentChar
? stream
->currentChar
: stream
->charBuffer
+ stream
->bufferLength
;
536 CFIndex numChars
= end
- mark
;
537 stream
->charIndex
-= numChars
;
538 stream
->currentChar
= mark
;
540 // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
541 if (*(end
- 1) == '\r') {
543 if (_inputStreamPeekCharacter(stream
, &nextChar
) && nextChar
== '\n') {
547 while (end
!= mark
) {
551 } else if (*end
== '\n') {
553 if (end
!= mark
&& *(end
- 1) == '\r') {
561 __private_extern__
void _inputStreamBackUpToMark(_CFXMLInputStream
*stream
) {
562 CFAssert(stream
->mark
!= NULL
|| stream
->charBuffer
== NULL
, __kCFLogAssertion
, "CF internal error: malformed XML input stream");
563 restoreToMark(stream
, stream
->mark
);
566 CF_INLINE Boolean
isWhitespaceChar(UniChar ch
) {
567 return (ch
== '\n' || ch
== '\r' || ch
== ' ' || ch
== '\t');
570 __private_extern__ CFIndex
_inputStreamSkipWhitespace(_CFXMLInputStream
*stream
, CFMutableStringRef str
) {
574 stream
->parserMark
= dropMark(stream
);
576 while (getCharacter(stream
, &ch
, true) && isWhitespaceChar(ch
)) {
579 if (!isWhitespaceChar(ch
)) {
580 _inputStreamReturnCharacter(stream
, ch
);
583 _fillStringWithCharacters(str
, stream
->parserMark
, len
);
584 stream
->parserMark
= NULL
;
589 // false return means EOF was encountered without finding scanChars
590 __private_extern__ Boolean
_inputStreamScanToCharacters(_CFXMLInputStream
*stream
, const UniChar
*scanChars
, CFIndex numChars
, CFMutableStringRef str
) {
591 Boolean done
= false;
592 CFIndex firstRepeatIndex
= -1;
594 stream
->parserMark
= dropMark(stream
);
597 while (_inputStreamGetCharacter(stream
, &ch
) && ch
!= scanChars
[0]) {
600 if (ch
!= scanChars
[0]) {
601 restoreToMark(stream
, stream
->parserMark
);
602 stream
->parserMark
= NULL
;
606 for (i
= 1; i
< numChars
; i
++) {
607 if (!_inputStreamGetCharacter(stream
, &ch
)) break;
608 if (ch
!= scanChars
[i
]) break;
613 if (firstRepeatIndex
== -1) {
615 for (j
= 1; j
< numChars
; j
++) {
616 if (scanChars
[0] == scanChars
[j
]) {
620 firstRepeatIndex
= j
;
622 _inputStreamReturnCharacter(stream
, ch
);
623 while (i
> firstRepeatIndex
) {
625 _inputStreamReturnCharacter(stream
, scanChars
[i
]);
632 _fillStringWithCharacters(str
, stream
->parserMark
, len
);
634 stream
->parserMark
= NULL
;
638 __private_extern__ Boolean
_inputStreamMatchString(_CFXMLInputStream
*stream
, const UniChar
*stringToMatch
, CFIndex length
) {
639 const UniChar
*end
= stringToMatch
+length
;
640 const UniChar
*sPtr
=stringToMatch
;
641 stream
->parserMark
= dropMark(stream
);
644 if (!_inputStreamGetCharacter(stream
, &ch
)) break;
645 if (ch
!= *sPtr
) break;
649 restoreToMark(stream
, stream
->parserMark
);
650 stream
->parserMark
= NULL
;
653 stream
->parserMark
= NULL
;
658 __private_extern__ Boolean
_inputStreamScanQuotedString(_CFXMLInputStream
*stream
, CFMutableStringRef str
) {
660 if (!_inputStreamPeekCharacter(stream
, &ch
)) return false;
661 if (ch
!= '\'' && ch
!= '\"') return false;
663 _inputStreamGetCharacter(stream
, &ch
);
664 if (!_inputStreamScanToCharacters(stream
, &ch
, 1, str
)) {
671 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
672 [5] Name ::= (Letter | '_' | ':') (NameChar)*
673 [7] Nmtoken ::= (NameChar)+
674 [84] Letter ::= BaseChar | Ideographic
676 We don't do this quite right; we rely on the Unicode charsets to do this analysis. While
677 the productions in the XML spec are based on the Unicode character sets, the definitions
678 differ slightly to avoid those areas where the Unicode standard is still being resolved.
679 At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
680 the vast majority of parsers out there.
682 Letter == kCFUniCharLetterCharacterSet
683 Digit == kCFUniCharDecimalDigitCharacterSet
684 CombiningChar == kCFUniCharNonBaseCharacterSet
685 Extender - complex, and not represented by a uniform character set.
687 __private_extern__ Boolean
_inputStreamScanXMLName(_CFXMLInputStream
*stream
, Boolean isNMToken
, CFStringRef
*str
) {
689 Boolean success
= true;
690 stream
->parserMark
= dropMark(stream
);
692 // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
693 if (!getCharacter(stream
, &ch
, false)) {
695 } else if (!CFUniCharIsMemberOf(ch
, kCFUniCharLetterCharacterSet
) && ch
!= '_' && ch
!= ':') {
698 getCharacter(stream
, &ch
, true);
702 while (getCharacter(stream
, &ch
, true)) {
703 if (!CFUniCharIsMemberOf(ch
, kCFUniCharLetterCharacterSet
) && !CFUniCharIsMemberOf(ch
, kCFUniCharDecimalDigitCharacterSet
) && ch
!= '.' && ch
!= '-' && ch
!= '_' && ch
!= ':' && !CFUniCharIsMemberOf(ch
, kCFUniCharNonBaseCharacterSet
)) {
704 _inputStreamReturnCharacter(stream
, ch
);
708 if (NULL
== stream
->currentChar
|| stream
->currentChar
== stream
->parserMark
) {
709 success
= false; // Must have processed at least one character
714 if (!stream
->nameSet
) {
715 stream
->nameSet
= CFSetCreateMutable(stream
->allocator
, 0, &kCFTypeSetCallBacks
);
716 stream
->tempString
= CFStringCreateMutableWithExternalCharactersNoCopy(stream
->allocator
, NULL
, 0, 0, kCFAllocatorNull
);
718 CFStringSetExternalCharactersNoCopy(stream
->tempString
, stream
->parserMark
, stream
->currentChar
-stream
->parserMark
, stream
->currentChar
-stream
->parserMark
);
719 if (!CFSetGetValueIfPresent(stream
->nameSet
, stream
->tempString
, (const void **)str
)) {
720 *str
= (CFStringRef
)CFStringCreateCopy(stream
->allocator
, stream
->tempString
);
721 CFSetAddValue(stream
->nameSet
, *str
);
726 restoreToMark(stream
, stream
->parserMark
);
728 stream
->parserMark
= NULL
;