]> git.saurik.com Git - apple/cf.git/blob - CFXMLInputStream.c
CF-855.14.tar.gz
[apple/cf.git] / CFXMLInputStream.c
1 /*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /* CFXMLInputStream.c
25 Copyright (c) 1999-2013, Apple Inc. All rights reserved.
26 Responsibility: David Smith
27 */
28
29 #include "CFXMLInputStream.h"
30 #include <CoreFoundation/CFCharacterSet.h>
31 #include <string.h>
32 #include "CFStringEncodingConverter.h"
33 #include "CFUniChar.h"
34
35 /* Utility functions used in parsing */
36 static Boolean determineEncoding(_CFXMLInputStream *stream) {
37 const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
38 UInt32 length = CFDataGetLength(stream->data);
39 const uint8_t *idx = 0L, *end = 0L;
40 const uint8_t *base = 0L;
41 char quote = ' ';
42 Boolean useUTF8 = false;
43
44 // Check for the byte order mark first
45 if (length > 2) {
46 // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
47 if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
48 #if __BIG_ENDIAN__
49 stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
50 #else
51 stream->flags |= ENCODING_IS_UNICODE_NATURAL;
52 #endif
53 if (*bytes == 0xFF) {
54 stream->currentByte = bytes + 2;
55 }
56 stream->encoding = kCFStringEncodingUnicode;
57 return true;
58 } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
59 #if __BIG_ENDIAN__
60 stream->flags |= ENCODING_IS_UNICODE_NATURAL;
61 #else
62 stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
63 #endif
64 if (*bytes == 0xFE) {
65 stream->currentByte = bytes + 2;
66 }
67 stream->encoding = kCFStringEncodingUnicode;
68 return true;
69 } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
70 if(*bytes == 0xEF) {
71 stream->currentByte = bytes + 3;
72 }
73 stream->encoding = kCFStringEncodingUTF8;
74 stream->flags |= ENCODING_MATCHES_ASCII;
75 return true;
76 }
77 }
78 // Scan for the <?xml.... ?> opening
79 if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
80 useUTF8 = true;
81 }
82 if (!useUTF8) {
83 idx = bytes + 5;
84 end = bytes + length;
85 // Found "<?xml"; now we scan for "encoding"
86 while (idx < end) {
87 uint8_t ch = *idx;
88 const uint8_t *scan;
89 if ( ch == '?' || ch == '>') {
90 useUTF8 = true;
91 break;
92 }
93 idx ++;
94 scan = idx;
95 if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
96 idx = scan;
97 break;
98 }
99 }
100 if (!useUTF8 && idx >= end) {
101 useUTF8 = true;
102 }
103 }
104 if (!useUTF8) {
105 // Found "encoding="; see if we've got an honest-to-goodness encoding name
106 quote = *idx;
107 if (quote != '\'' && quote != '\"') {
108 useUTF8 = true;
109 }
110 }
111 if (!useUTF8) {
112 base = idx + 1; // Move past the quote character
113 idx ++;
114 while (idx < end && *idx != quote) idx ++;
115 if (idx >= end) {
116 useUTF8 = true;
117 }
118 }
119 if (!useUTF8) {
120 UInt32 len = idx - base;
121 if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
122 useUTF8 = true;
123 } else {
124 CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
125 stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
126 CFRelease(encodingName);
127 }
128 }
129 if (useUTF8) {
130 stream->encoding = kCFStringEncodingUTF8;
131 stream->flags |= ENCODING_MATCHES_ASCII;
132 return true;
133 } else if (stream->encoding == kCFStringEncodingInvalidId) {
134 return false;
135 } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
136 stream->flags |= ENCODING_MATCHES_ASCII;
137 }
138 return true;
139 }
140
141 CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
142 CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
143 if (numChars) {
144 CFStringAppendCharacters(string, characters, numChars);
145 }
146 }
147
148 CF_PRIVATE Boolean _openInputStream(_CFXMLInputStream *stream) {
149 if (NULL == stream->data) {
150 return false;
151 } else {
152 stream->currentByte = CFDataGetBytePtr(stream->data);
153 if (determineEncoding(stream)) {
154 stream->flags |= STREAM_OPEN;
155 return true;
156 } else {
157 return false;
158 }
159 }
160 }
161
162 CF_PRIVATE void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
163 stream->data = xmlData ? (CFDataRef)CFRetain(xmlData) : NULL;
164 stream->url = dataSource ? (CFURLRef)CFRetain(dataSource) : NULL;
165 stream->encoding = kCFStringEncodingInvalidId;
166 stream->currentByte = NULL;
167
168 stream->allocator = (CFAllocatorRef)CFRetain(alloc);
169 stream->charBuffer = NULL;
170 stream->currentChar = NULL;
171 stream->mark = NULL;
172 stream->parserMark = NULL;
173 stream->bufferLength = 0;
174 stream->bufferCapacity = 0;
175
176 stream->charIndex = 1;
177 stream->lineNum = 1;
178
179 stream->flags = 0;
180 stream->nameSet = NULL;
181 stream->tempString = NULL;
182 }
183
184
185 CF_PRIVATE void _freeInputStream(_CFXMLInputStream *stream) {
186 if (stream->data) CFRelease(stream->data);
187 if (stream->url) CFRelease(stream->url);
188 if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
189 if (stream->nameSet) CFRelease(stream->nameSet);
190 if (stream->tempString) CFRelease(stream->tempString);
191 CFRelease(stream->allocator);
192 }
193
194 CF_PRIVATE CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
195 return stream->encoding;
196 }
197
198 CF_PRIVATE CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
199 return stream->charIndex;
200 }
201
202 CF_PRIVATE CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
203 return stream->lineNum;
204 }
205
206 CF_PRIVATE Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
207 if (!(stream->flags & STREAM_OPEN)) return false;
208 if (stream->currentChar) return false;
209 if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
210 return true;
211 }
212
213 CF_PRIVATE Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
214 return stream->flags & ENCODING_COMPOSITION_ERROR;
215 }
216
217 #define INITIAL_BUFFER_SIZE 64
218 static void growCharacterBuffer(_CFXMLInputStream *stream) {
219 if (!stream->charBuffer) {
220 stream->charBuffer = (UniChar *)CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
221 stream->bufferCapacity = INITIAL_BUFFER_SIZE;
222 } else {
223 CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
224 CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
225 CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
226 UniChar *newBuffer = (UniChar *)CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
227 stream->bufferCapacity *= 2;
228 if (newBuffer != stream->charBuffer) {
229 stream->charBuffer = newBuffer;
230 if (currCharDelta != -1) {
231 stream->currentChar = newBuffer + currCharDelta;
232 }
233 if (markDelta != -1) {
234 stream->mark = newBuffer + markDelta;
235 }
236 if (parserMarkDelta != -1) {
237 stream->parserMark = newBuffer + parserMarkDelta;
238 }
239 }
240 }
241 }
242
243 static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
244 const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
245 if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
246 CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
247 if (charsToTranslate > maxLength) {
248 charsToTranslate = maxLength;
249 }
250 if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
251 memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
252 stream->currentByte += (charsToTranslate * sizeof(UniChar));
253 } else {
254 CFIndex i;
255 uint8_t *baseBytePtr = (uint8_t *)base;
256 for (i = 0; i < charsToTranslate; i ++) {
257 *(baseBytePtr + 1) = *stream->currentByte;
258 *baseBytePtr = *(stream->currentByte + 1);
259 baseBytePtr += 2;
260 stream->currentByte += 2;
261 }
262 }
263 return charsToTranslate;
264 } else {
265 CFIndex lengthConsumed = 0;
266 CFIndex usedByteLength, usedCharLength;
267 UInt32 conversionResult;
268 if (stream->flags & ENCODING_MATCHES_ASCII) {
269 while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
270 if (*stream->currentByte > 0x7f) break;
271 *base = *stream->currentByte;
272 base ++;
273 stream->currentByte ++;
274 lengthConsumed ++;
275 }
276 if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
277 return lengthConsumed;
278 }
279 }
280 conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
281 if(kCFStringEncodingConversionSuccess != conversionResult) {
282 switch(conversionResult) {
283 case kCFStringEncodingConverterUnavailable:
284 case kCFStringEncodingInvalidInputStream:
285 stream->flags |= ENCODING_COMPOSITION_ERROR;
286 break;
287 case kCFStringEncodingInsufficientOutputBufferLength:
288 default:
289 break;
290 }
291 }
292 if (usedByteLength > 0) {
293 stream->currentByte += usedByteLength;
294 lengthConsumed += usedCharLength;
295 }
296 return lengthConsumed;
297 }
298 }
299
300 // returns number of characters filled
301 CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
302 CFIndex numFilled;
303 if (stream->bufferLength >= stream->bufferCapacity) return 0;
304 // Try and fill in the remaining characters
305 numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
306 if (numFilled != 0) {
307 stream->currentChar = stream->charBuffer + stream->bufferLength;
308 stream->bufferLength += numFilled;
309 }
310 return numFilled;
311 }
312
313 // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate. Does not check for EOF; it is the caller's responsibility to verify this.
314 static void fillCharacterBuffer(_CFXMLInputStream *stream) {
315 if (!stream->charBuffer) {
316 growCharacterBuffer(stream);
317 }
318 if (!stream->mark && !stream->parserMark) {
319 // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
320 CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
321 stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
322 CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
323 stream->currentChar = stream->charBuffer;
324 } else {
325 // We do everything we can not to allocate; first we fill any remaining characters. If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
326 Boolean done;
327
328 // First try just filling the remaining capacity
329 done = (fillToCapacity(stream) != 0);
330 if (!done) {
331 const UniChar *leftMostMark;
332 if (stream->mark && !stream->parserMark) {
333 leftMostMark = stream->mark;
334 } else if (stream->parserMark && !stream->mark) {
335 leftMostMark = stream->parserMark;
336 } else if (stream->parserMark < stream->mark) {
337 leftMostMark = stream->parserMark;
338 } else {
339 leftMostMark = stream->mark;
340 }
341 if (leftMostMark > stream->charBuffer) {
342 CFIndex delta = leftMostMark - stream->charBuffer;
343 memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
344 stream->bufferLength -= delta;
345 if (stream->mark) {
346 stream->mark -= delta;
347 }
348 if (stream->parserMark) {
349 stream->parserMark -= delta;
350 }
351 // Now try to fill the newly-opened space
352 done = (fillToCapacity(stream) != 0);
353 delta = loadCharacters(stream->charBuffer + stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
354 }
355 }
356 if (!done) {
357 // No help for it; now we must allocate
358 growCharacterBuffer(stream);
359 fillToCapacity(stream); // If this doesn't work, we give up.
360 }
361 }
362 }
363
364 /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time. (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version. Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
365 */
366 static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
367 if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
368 return false; // EOF
369 } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
370 (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
371 (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
372 // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
373 if (stream->flags & ENCODING_MATCHES_ASCII) {
374 *ch = (UniChar)*(stream->currentByte);
375 if (advanceStream) {
376 stream->currentByte ++;
377 }
378 } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
379 *ch = *(UniChar *)(stream->currentByte);
380 if (advanceStream) {
381 stream->currentByte += 2;
382 }
383 } else {
384 // Unicode with swapped bytes
385 *ch = CFSwapInt16(*(UniChar *)(stream->currentByte));
386 if (advanceStream) {
387 stream->currentByte += 2;
388 }
389 }
390 } else {
391 fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
392 if (!stream->charBuffer || !stream->currentChar) {
393 return false;
394 } else {
395 *ch = *(stream->currentChar);
396 if (advanceStream) {
397 stream->currentChar ++;
398 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
399 stream->currentChar = NULL;
400 }
401 }
402 }
403 }
404 return true;
405 }
406
407 /* See comments above getCharacterGuts()
408 */
409 CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
410 if (!(stream->flags & STREAM_OPEN)) {
411 return false;
412 } else if (stream->currentChar) {
413 *ch = *stream->currentChar;
414 if (advanceStream) {
415 stream->currentChar ++;
416 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
417 stream->currentChar = NULL;
418 }
419 }
420 } else {
421 if (!getCharacterGuts(stream, ch, advanceStream)) return false;
422 }
423 if (advanceStream) {
424 UniChar nextChar;
425 stream->charIndex ++;
426 if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
427 }
428 return true;
429 }
430
431 CF_PRIVATE Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
432 return getCharacter(stream, ch, false);
433 }
434
435 CF_PRIVATE Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
436 return getCharacter(stream, ch, true);
437 }
438
439 CF_PRIVATE Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
440 Boolean decrementLineNum = false;
441 if (ch == '\n') {
442 decrementLineNum = true;
443 } else if (ch == '\r') {
444 UniChar nextChar;
445 if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
446 decrementLineNum = true;
447 }
448 }
449
450 if (!(stream->flags & STREAM_OPEN)) {
451 return false;
452 } else if (stream->currentChar) {
453 if (stream->currentChar != stream->charBuffer) {
454 stream->currentChar --;
455 } else {
456 // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
457 if (stream->bufferLength >= stream->bufferCapacity) {
458 growCharacterBuffer(stream);
459 }
460 memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
461 *stream->charBuffer = ch;
462 stream->bufferLength ++;
463 if (stream->mark) {
464 stream->mark ++;
465 }
466 if (stream->parserMark) {
467 stream->parserMark ++;
468 }
469 }
470 } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
471 // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data. That last character is the one being returned.
472 stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
473 } else if (stream->charBuffer) {
474 // We have processed all the meaningful characters from charBuffer and have no reason to preserve them. We use charBuffer to hold this one character that has been returned to us.
475 *stream->charBuffer = ch;
476 stream->currentChar = stream->charBuffer;
477 stream->bufferLength = 1;
478 if (stream->mark) {
479 stream->mark ++;
480 }
481 if (stream->parserMark) {
482 stream->parserMark ++;
483 }
484 } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
485 // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character. The former means we can just back up the byte pointer; the latter means Bad Things have happened.
486 if (stream->flags & ENCODING_MATCHES_ASCII) {
487 stream->currentByte --;
488 } else { // Must be Unicode
489 stream->currentByte -= 2;
490 }
491 } else {
492 return false;
493 }
494 stream->charIndex --;
495 if (decrementLineNum) {
496 stream->lineNum --;
497 }
498 return true;
499 }
500
501 // Returns the pointer to hold as the mark
502 static UniChar *dropMark(_CFXMLInputStream *stream) {
503 if (stream->currentChar) {
504 return stream->currentChar;
505 } else if (stream->mark || stream->parserMark) {
506 return stream->charBuffer + stream->bufferLength;
507 } else {
508 if (!stream->charBuffer) {
509 growCharacterBuffer(stream);
510 }
511 stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
512 return stream->charBuffer;
513 }
514
515 }
516
517 CF_PRIVATE void _inputStreamSetMark(_CFXMLInputStream *stream) {
518 CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
519 stream->mark = dropMark(stream);
520 }
521
522 CF_PRIVATE void _inputStreamClearMark(_CFXMLInputStream *stream) {
523 CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
524 stream->mark = NULL;
525 }
526
527 CF_PRIVATE void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
528 UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
529 CFIndex numChars = end - stream->mark;
530 CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
531 _fillStringWithCharacters(string, stream->mark, numChars);
532 }
533
534 static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
535 UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
536 if (end > mark) {
537 CFIndex numChars = end - mark;
538 stream->charIndex -= numChars;
539 stream->currentChar = mark;
540
541 // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
542 if (*(end - 1) == '\r') {
543 UniChar nextChar;
544 if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
545 end --;
546 }
547 }
548 while (end != mark) {
549 end --;
550 if (*end == '\r') {
551 stream->lineNum --;
552 } else if (*end == '\n') {
553 stream->lineNum --;
554 if (end != mark && *(end - 1) == '\r') {
555 end --;
556 }
557 }
558 }
559 }
560 }
561
562 CF_PRIVATE void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
563 CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
564 restoreToMark(stream, stream->mark);
565 }
566
567 CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
568 return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
569 }
570
571 CF_PRIVATE CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
572 UniChar ch;
573 CFIndex len = 0;
574 if (str) {
575 stream->parserMark = dropMark(stream);
576 }
577 while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
578 len ++;
579 }
580 if (!isWhitespaceChar(ch)) {
581 _inputStreamReturnCharacter(stream, ch);
582 }
583 if (str) {
584 _fillStringWithCharacters(str, stream->parserMark, len);
585 stream->parserMark = NULL;
586 }
587 return len;
588 }
589
590 // false return means EOF was encountered without finding scanChars
591 CF_PRIVATE Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
592 Boolean done = false;
593 CFIndex firstRepeatIndex = -1;
594 CFIndex len = 0;
595 stream->parserMark = dropMark(stream);
596 do {
597 UniChar ch;
598 while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
599 len ++;
600 }
601 if (ch != scanChars[0]) {
602 restoreToMark(stream, stream->parserMark);
603 stream->parserMark = NULL;
604 return false;
605 } else {
606 CFIndex i;
607 for (i = 1; i < numChars; i ++) {
608 if (!_inputStreamGetCharacter(stream, &ch)) break;
609 if (ch != scanChars[i]) break;
610 }
611 if (i == numChars) {
612 done = true;
613 } else {
614 if (firstRepeatIndex == -1) {
615 CFIndex j;
616 for (j = 1; j < numChars; j ++) {
617 if (scanChars[0] == scanChars[j]) {
618 break;
619 }
620 }
621 firstRepeatIndex = j;
622 }
623 _inputStreamReturnCharacter(stream, ch);
624 while (i > firstRepeatIndex) {
625 i --;
626 _inputStreamReturnCharacter(stream, scanChars[i]);
627 }
628 len += i;
629 }
630 }
631 } while (!done);
632 if (str) {
633 _fillStringWithCharacters(str, stream->parserMark, len);
634 }
635 stream->parserMark = NULL;
636 return true;
637 }
638
639 CF_PRIVATE Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
640 const UniChar *end = stringToMatch+length;
641 const UniChar *sPtr=stringToMatch;
642 stream->parserMark = dropMark(stream);
643 while (sPtr < end) {
644 UniChar ch;
645 if (!_inputStreamGetCharacter(stream, &ch)) break;
646 if (ch != *sPtr) break;
647 sPtr ++;
648 }
649 if (sPtr != end) {
650 restoreToMark(stream, stream->parserMark);
651 stream->parserMark = NULL;
652 return false;
653 } else {
654 stream->parserMark = NULL;
655 return true;
656 }
657 }
658
659 CF_PRIVATE Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
660 UniChar ch;
661 if (!_inputStreamPeekCharacter(stream, &ch)) return false;
662 if (ch != '\'' && ch != '\"') return false;
663
664 _inputStreamGetCharacter(stream, &ch);
665 if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
666 return false;
667 }
668 return true;
669 }
670
671 /*
672 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
673 [5] Name ::= (Letter | '_' | ':') (NameChar)*
674 [7] Nmtoken ::= (NameChar)+
675 [84] Letter ::= BaseChar | Ideographic
676
677 We don't do this quite right; we rely on the Unicode charsets to do this analysis. While
678 the productions in the XML spec are based on the Unicode character sets, the definitions
679 differ slightly to avoid those areas where the Unicode standard is still being resolved.
680 At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
681 the vast majority of parsers out there.
682
683 Letter == kCFUniCharLetterCharacterSet
684 Digit == kCFUniCharDecimalDigitCharacterSet
685 CombiningChar == kCFUniCharNonBaseCharacterSet
686 Extender - complex, and not represented by a uniform character set.
687 */
688 CF_PRIVATE Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
689 UniChar ch;
690 Boolean success = true;
691 stream->parserMark = dropMark(stream);
692 if (!isNMToken) {
693 // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
694 if (!getCharacter(stream, &ch, false)) {
695 success = false;
696 } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
697 success = false;
698 } else {
699 getCharacter(stream, &ch, true);
700 }
701 }
702 if (success) {
703 while (getCharacter(stream, &ch, true)) {
704 if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet) && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
705 _inputStreamReturnCharacter(stream, ch);
706 break;
707 }
708 }
709 if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) {
710 success = false; // Must have processed at least one character
711 }
712 }
713 if (success) {
714 if (str) {
715 if (!stream->nameSet) {
716 stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
717 stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
718 }
719 CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
720 if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
721 *str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString);
722 CFSetAddValue(stream->nameSet, *str);
723 CFRelease(*str);
724 }
725 }
726 } else {
727 restoreToMark(stream, stream->parserMark);
728 }
729 stream->parserMark = NULL;
730 return success;
731 }
732
733