]> git.saurik.com Git - apple/cf.git/blame - Parsing.subproj/CFXMLInputStream.c
CF-299.tar.gz
[apple/cf.git] / Parsing.subproj / CFXMLInputStream.c
CommitLineData
9ce05555
A
1/*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25/* CFXMLInputStream.c
26 Copyright 1999-2002, Apple, Inc. All rights reserved.
27 Responsibility: Chris Parker
28*/
29
30#include <CoreFoundation/CFCharacterSet.h>
31#include <string.h>
32#include "CFStringEncodingConverter.h"
33#include "CFUniChar.h"
34#include "CFXMLInputStream.h"
35
36/* Utility functions used in parsing */
37static Boolean determineEncoding(_CFXMLInputStream *stream) {
38 const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
39 UInt32 length = CFDataGetLength(stream->data);
40 const uint8_t *idx = 0L, *end = 0L;
41 const uint8_t *base = 0L;
42 char quote = ' ';
43 Boolean useUTF8 = false;
44
45 // Check for the byte order mark first
46 if (length > 2) {
47 // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
48 if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
49 stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
50 if (*bytes == 0xFF) {
51 stream->currentByte = bytes + 2;
52 }
53 stream->encoding = kCFStringEncodingUnicode;
54 return true;
55 } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
56 stream->flags |= ENCODING_IS_UNICODE_NATURAL;
57 if (*bytes == 0xFE) {
58 stream->currentByte = bytes + 2;
59 }
60 stream->encoding = kCFStringEncodingUnicode;
61 return true;
62 } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
63 if(*bytes == 0xEF) {
64 stream->currentByte = bytes + 3;
65 }
66 stream->encoding = kCFStringEncodingUTF8;
67 stream->flags |= ENCODING_MATCHES_ASCII;
68 return true;
69 }
70 }
71 // Scan for the <?xml.... ?> opening
72 if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
73 useUTF8 = true;
74 }
75 if (!useUTF8) {
76 idx = bytes + 5;
77 end = bytes + length;
78 // Found "<?xml"; now we scan for "encoding"
79 while (idx < end) {
80 uint8_t ch = *idx;
81 const uint8_t *scan;
82 if ( ch == '?' || ch == '>') {
83 useUTF8 = true;
84 break;
85 }
86 idx ++;
87 scan = idx;
88 if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
89 idx = scan;
90 break;
91 }
92 }
93 if (!useUTF8 && idx >= end) {
94 useUTF8 = true;
95 }
96 }
97 if (!useUTF8) {
98 // Found "encoding="; see if we've got an honest-to-goodness encoding name
99 quote = *idx;
100 if (quote != '\'' && quote != '\"') {
101 useUTF8 = true;
102 }
103 }
104 if (!useUTF8) {
105 base = idx + 1; // Move past the quote character
106 idx ++;
107 while (idx < end && *idx != quote) idx ++;
108 if (idx >= end) {
109 useUTF8 = true;
110 }
111 }
112 if (!useUTF8) {
113 UInt32 len = idx - base;
114 if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
115 useUTF8 = true;
116 } else {
117 CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
118 stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
119 CFRelease(encodingName);
120 }
121 }
122 if (useUTF8) {
123 stream->encoding = kCFStringEncodingUTF8;
124 stream->flags |= ENCODING_MATCHES_ASCII;
125 return true;
126 } else if (stream->encoding == kCFStringEncodingInvalidId) {
127 return false;
128 } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
129 stream->flags |= ENCODING_MATCHES_ASCII;
130 }
131 return true;
132}
133
134CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
135 CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
136 if (numChars) {
137 CFStringAppendCharacters(string, characters, numChars);
138 }
139}
140
141__private_extern__ Boolean _openInputStream(_CFXMLInputStream *stream) {
142 if (NULL == stream->data) {
143 return false;
144 } else {
145 stream->currentByte = CFDataGetBytePtr(stream->data);
146 if (determineEncoding(stream)) {
147 stream->flags |= STREAM_OPEN;
148 return true;
149 } else {
150 return false;
151 }
152 }
153}
154
155__private_extern__ void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
156 stream->data = xmlData ? CFRetain(xmlData) : NULL;
157 stream->url = dataSource ? CFRetain(dataSource) : NULL;
158 stream->encoding = kCFStringEncodingInvalidId;
159 stream->currentByte = NULL;
160
161 stream->allocator = CFRetain(alloc);
162 stream->charBuffer = NULL;
163 stream->currentChar = NULL;
164 stream->mark = NULL;
165 stream->parserMark = NULL;
166 stream->bufferLength = 0;
167 stream->bufferCapacity = 0;
168
169 stream->charIndex = 1;
170 stream->lineNum = 1;
171
172 stream->flags = 0;
173 stream->nameSet = NULL;
174 stream->tempString = NULL;
175}
176
177
178__private_extern__ void _freeInputStream(_CFXMLInputStream *stream) {
179 if (stream->data) CFRelease(stream->data);
180 if (stream->url) CFRelease(stream->url);
181 if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
182 if (stream->nameSet) CFRelease(stream->nameSet);
183 if (stream->tempString) CFRelease(stream->tempString);
184 CFRelease(stream->allocator);
185}
186
187__private_extern__ CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
188 return stream->encoding;
189}
190
191__private_extern__ CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
192 return stream->charIndex;
193}
194
195__private_extern__ CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
196 return stream->lineNum;
197}
198
199__private_extern__ Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
200 if (!(stream->flags & STREAM_OPEN)) return false;
201 if (stream->currentChar) return false;
202 if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
203 return true;
204}
205
206__private_extern__ Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
207 return stream->flags & ENCODING_COMPOSITION_ERROR;
208}
209
210#define INITIAL_BUFFER_SIZE 64
211static void growCharacterBuffer(_CFXMLInputStream *stream) {
212 if (!stream->charBuffer) {
213 stream->charBuffer = CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
214 stream->bufferCapacity = INITIAL_BUFFER_SIZE;
215 } else {
216 CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
217 CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
218 CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
219 UniChar *newBuffer = CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
220 stream->bufferCapacity *= 2;
221 if (newBuffer != stream->charBuffer) {
222 stream->charBuffer = newBuffer;
223 if (currCharDelta != -1) {
224 stream->currentChar = newBuffer + currCharDelta;
225 }
226 if (markDelta != -1) {
227 stream->mark = newBuffer + markDelta;
228 }
229 if (parserMarkDelta != -1) {
230 stream->parserMark = newBuffer + parserMarkDelta;
231 }
232 }
233 }
234}
235
236static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
237 const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
238 if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
239 CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
240 if (charsToTranslate > maxLength) {
241 charsToTranslate = maxLength;
242 }
243 if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
244 memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
245 stream->currentByte += (charsToTranslate * sizeof(UniChar));
246 } else {
247 CFIndex i;
248 uint8_t *baseBytePtr = (uint8_t *)base;
249 for (i = 0; i < charsToTranslate; i ++) {
250 *(baseBytePtr + 1) = *stream->currentByte;
251 *baseBytePtr = *(stream->currentByte + 1);
252 baseBytePtr += 2;
253 stream->currentByte += 2;
254 }
255 }
256 return charsToTranslate;
257 } else {
258 CFIndex lengthConsumed = 0, usedByteLength, usedCharLength;
259 UInt32 conversionResult;
260 if (stream->flags & ENCODING_MATCHES_ASCII) {
261 while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
262 if (*stream->currentByte > 0x7f) break;
263 *base = *stream->currentByte;
264 base ++;
265 stream->currentByte ++;
266 lengthConsumed ++;
267 }
268 if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
269 return lengthConsumed;
270 }
271 }
272 conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
273 if(kCFStringEncodingConversionSuccess != conversionResult) {
274 switch(conversionResult) {
275 case kCFStringEncodingConverterUnavailable:
276 case kCFStringEncodingInvalidInputStream:
277 stream->flags |= ENCODING_COMPOSITION_ERROR;
278 break;
279 case kCFStringEncodingInsufficientOutputBufferLength:
280 default:
281 break;
282 }
283 }
284 if (usedByteLength > 0) {
285 stream->currentByte += usedByteLength;
286 lengthConsumed += usedCharLength;
287 }
288 return lengthConsumed;
289 }
290}
291
292// returns number of characters filled
293CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
294 CFIndex numFilled;
295 if (stream->bufferLength >= stream->bufferCapacity) return 0;
296 // Try and fill in the remaining characters
297 numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
298 if (numFilled != 0) {
299 stream->currentChar = stream->charBuffer + stream->bufferLength;
300 stream->bufferLength += numFilled;
301 }
302 return numFilled;
303}
304
305// we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate. Does not check for EOF; it is the caller's responsibility to verify this.
306static void fillCharacterBuffer(_CFXMLInputStream *stream) {
307 if (!stream->charBuffer) {
308 growCharacterBuffer(stream);
309 }
310 if (!stream->mark && !stream->parserMark) {
311 // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
312 CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
313 stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
314 CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
315 stream->currentChar = stream->charBuffer;
316 } else {
317 // We do everything we can not to allocate; first we fill any remaining characters. If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
318 Boolean done;
319
320 // First try just filling the remaining capacity
321 done = (fillToCapacity(stream) != 0);
322 if (!done) {
323 const UniChar *leftMostMark;
324 if (stream->mark && !stream->parserMark) {
325 leftMostMark = stream->mark;
326 } else if (stream->parserMark && !stream->mark) {
327 leftMostMark = stream->parserMark;
328 } else if (stream->parserMark < stream->mark) {
329 leftMostMark = stream->parserMark;
330 } else {
331 leftMostMark = stream->mark;
332 }
333 if (leftMostMark > stream->charBuffer) {
334 CFIndex delta = leftMostMark - stream->charBuffer;
335 memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
336 stream->bufferLength -= delta;
337 if (stream->mark) {
338 stream->mark -= delta;
339 }
340 if (stream->parserMark) {
341 stream->parserMark -= delta;
342 }
343 // Now try to fill the newly-opened space
344 done = (fillToCapacity(stream) != 0);
345 delta = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
346 }
347 }
348 if (!done) {
349 // No help for it; now we must allocate
350 growCharacterBuffer(stream);
351 fillToCapacity(stream); // If this doesn't work, we give up.
352 }
353 }
354}
355
356/* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time. (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version. Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
357*/
358static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
359 if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
360 return false; // EOF
361 } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
362 (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
363 (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
364 // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
365 if (stream->flags & ENCODING_MATCHES_ASCII) {
366 *ch = (UniChar)*(stream->currentByte);
367 if (advanceStream) {
368 stream->currentByte ++;
369 }
370 } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
371 *ch = (*stream->currentByte) << 8;
372 *ch += *(stream->currentByte + 1);
373 if (advanceStream) {
374 stream->currentByte += 2;
375 }
376 } else {
377 // Unicode with swapped bytes
378 *ch = (*(stream->currentByte + 1)) << 8;
379 *ch += *stream->currentByte;
380 if (advanceStream) {
381 stream->currentByte += 2;
382 }
383 }
384 } else {
385 fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
386 if (!stream->charBuffer || !stream->currentChar) {
387 return false;
388 } else {
389 *ch = *(stream->currentChar);
390 if (advanceStream) {
391 stream->currentChar ++;
392 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
393 stream->currentChar = NULL;
394 }
395 }
396 }
397 }
398 return true;
399}
400
401/* See comments above getCharacterGuts()
402*/
403CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
404 if (!(stream->flags & STREAM_OPEN)) {
405 return false;
406 } else if (stream->currentChar) {
407 *ch = *stream->currentChar;
408 if (advanceStream) {
409 stream->currentChar ++;
410 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
411 stream->currentChar = NULL;
412 }
413 }
414 } else {
415 if (!getCharacterGuts(stream, ch, advanceStream)) return false;
416 }
417 if (advanceStream) {
418 UniChar nextChar;
419 stream->charIndex ++;
420 if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
421 }
422 return true;
423}
424
425__private_extern__ Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
426 return getCharacter(stream, ch, false);
427}
428
429__private_extern__ Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
430 return getCharacter(stream, ch, true);
431}
432
433__private_extern__ Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
434 Boolean decrementLineNum = false;
435 if (ch == '\n') {
436 decrementLineNum = true;
437 } else if (ch == '\r') {
438 UniChar nextChar;
439 if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
440 decrementLineNum = true;
441 }
442 }
443
444 if (!(stream->flags & STREAM_OPEN)) {
445 return false;
446 } else if (stream->currentChar) {
447 if (stream->currentChar != stream->charBuffer) {
448 stream->currentChar --;
449 } else {
450 // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
451 if (stream->bufferLength >= stream->bufferCapacity) {
452 growCharacterBuffer(stream);
453 }
454 memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
455 *stream->charBuffer = ch;
456 stream->bufferLength ++;
457 if (stream->mark) {
458 stream->mark ++;
459 }
460 if (stream->parserMark) {
461 stream->parserMark ++;
462 }
463 }
464 } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
465 // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data. That last character is the one being returned.
466 stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
467 } else if (stream->charBuffer) {
468 // We have processed all the meaningful characters from charBuffer and have no reason to preserve them. We use charBuffer to hold this one character that has been returned to us.
469 *stream->charBuffer = ch;
470 stream->currentChar = stream->charBuffer;
471 stream->bufferLength = 1;
472 if (stream->mark) {
473 stream->mark ++;
474 }
475 if (stream->parserMark) {
476 stream->parserMark ++;
477 }
478 } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
479 // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character. The former means we can just back up the byte pointer; the latter means Bad Things have happened.
480 if (stream->flags & ENCODING_MATCHES_ASCII) {
481 stream->currentByte --;
482 } else { // Must be Unicode
483 stream->currentByte -= 2;
484 }
485 } else {
486 return false;
487 }
488 stream->charIndex --;
489 if (decrementLineNum) {
490 stream->lineNum --;
491 }
492 return true;
493}
494
495// Returns the pointer to hold as the mark
496static UniChar *dropMark(_CFXMLInputStream *stream) {
497 if (stream->currentChar) {
498 return stream->currentChar;
499 } else if (stream->mark || stream->parserMark) {
500 return stream->charBuffer + stream->bufferLength;
501 } else {
502 if (!stream->charBuffer) {
503 growCharacterBuffer(stream);
504 }
505 stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
506 return stream->charBuffer;
507 }
508
509}
510
511__private_extern__ void _inputStreamSetMark(_CFXMLInputStream *stream) {
512 CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
513 stream->mark = dropMark(stream);
514}
515
516__private_extern__ void _inputStreamClearMark(_CFXMLInputStream *stream) {
517 CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
518 stream->mark = NULL;
519}
520
521__private_extern__ void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
522 UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
523 CFIndex numChars = end - stream->mark;
524 CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
525 _fillStringWithCharacters(string, stream->mark, numChars);
526}
527
528static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
529 UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
530 if (end > mark) {
531 CFIndex numChars = end - mark;
532 stream->charIndex -= numChars;
533 stream->currentChar = mark;
534
535 // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
536 if (*(end - 1) == '\r') {
537 UniChar nextChar;
538 if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
539 end --;
540 }
541 }
542 while (end != mark) {
543 end --;
544 if (*end == '\r') {
545 stream->lineNum --;
546 } else if (*end == '\n') {
547 stream->lineNum --;
548 if (end != mark && *(end - 1) == '\r') {
549 end --;
550 }
551 }
552 }
553 }
554}
555
556__private_extern__ void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
557 CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
558 restoreToMark(stream, stream->mark);
559}
560
561CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
562 return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
563}
564
565__private_extern__ CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
566 UniChar ch;
567 CFIndex len = 0;
568 if (str) {
569 stream->parserMark = dropMark(stream);
570 }
571 while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
572 len ++;
573 }
574 if (!isWhitespaceChar(ch)) {
575 _inputStreamReturnCharacter(stream, ch);
576 }
577 if (str) {
578 _fillStringWithCharacters(str, stream->parserMark, len);
579 stream->parserMark = NULL;
580 }
581 return len;
582}
583
584// false return means EOF was encountered without finding scanChars
585__private_extern__ Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
586 Boolean done = false;
587 CFIndex firstRepeatIndex = -1;
588 CFIndex len = 0;
589 stream->parserMark = dropMark(stream);
590 do {
591 UniChar ch;
592 while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
593 len ++;
594 }
595 if (ch != scanChars[0]) {
596 restoreToMark(stream, stream->parserMark);
597 stream->parserMark = NULL;
598 return false;
599 } else {
600 CFIndex i;
601 for (i = 1; i < numChars; i ++) {
602 if (!_inputStreamGetCharacter(stream, &ch)) break;
603 if (ch != scanChars[i]) break;
604 }
605 if (i == numChars) {
606 done = true;
607 } else {
608 if (firstRepeatIndex == -1) {
609 CFIndex j;
610 for (j = 1; j < numChars; j ++) {
611 if (scanChars[0] == scanChars[j]) {
612 break;
613 }
614 }
615 firstRepeatIndex = j;
616 }
617 _inputStreamReturnCharacter(stream, ch);
618 while (i > firstRepeatIndex) {
619 i --;
620 _inputStreamReturnCharacter(stream, scanChars[i]);
621 }
622 len += i;
623 }
624 }
625 } while (!done);
626 if (str) {
627 _fillStringWithCharacters(str, stream->parserMark, len);
628 }
629 stream->parserMark = NULL;
630 return true;
631}
632
633__private_extern__ Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
634 const UniChar *end = stringToMatch+length;
635 const UniChar *sPtr=stringToMatch;
636 stream->parserMark = dropMark(stream);
637 while (sPtr < end) {
638 UniChar ch;
639 if (!_inputStreamGetCharacter(stream, &ch)) break;
640 if (ch != *sPtr) break;
641 sPtr ++;
642 }
643 if (sPtr != end) {
644 restoreToMark(stream, stream->parserMark);
645 stream->parserMark = NULL;
646 return false;
647 } else {
648 stream->parserMark = NULL;
649 return true;
650 }
651}
652
653__private_extern__ Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
654 UniChar ch;
655 if (!_inputStreamPeekCharacter(stream, &ch)) return false;
656 if (ch != '\'' && ch != '\"') return false;
657
658 _inputStreamGetCharacter(stream, &ch);
659 if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
660 return false;
661 }
662 return true;
663}
664
665/*
666 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
667 [5] Name ::= (Letter | '_' | ':') (NameChar)*
668 [7] Nmtoken ::= (NameChar)+
669 [84] Letter ::= BaseChar | Ideographic
670
671 We don't do this quite right; we rely on the Unicode charsets to do this analysis. While
672 the productions in the XML spec are based on the Unicode character sets, the definitions
673 differ slightly to avoid those areas where the Unicode standard is still being resolved.
674 At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
675 the vast majority of parsers out there.
676
677 Letter == kCFUniCharLetterCharacterSet
678 Digit == kCFUniCharDecimalDigitCharacterSet
679 CombiningChar == kCFUniCharNonBaseCharacterSet
680 Extender - complex, and not represented by a uniform character set.
681 */
682__private_extern__ Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
683 UniChar ch;
684 Boolean success = true;
685 stream->parserMark = dropMark(stream);
686 if (!isNMToken) {
687 // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
688 if (!getCharacter(stream, &ch, false)) {
689 success = false;
690 } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
691 success = false;
692 } else {
693 getCharacter(stream, &ch, true);
694 }
695 }
696 if (success) {
697 while (getCharacter(stream, &ch, true)) {
698 if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet) && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
699 _inputStreamReturnCharacter(stream, ch);
700 break;
701 }
702 }
703 if (stream->currentChar == stream->parserMark) {
704 success = false; // Must have processed at least one character
705 }
706 }
707 if (success) {
708 if (str) {
709 if (!stream->nameSet) {
710 stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
711 stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
712 }
713 CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
714 if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
715 *str = CFStringCreateCopy(stream->allocator, stream->tempString);
716 CFSetAddValue(stream->nameSet, *str);
717 CFRelease(*str);
718 }
719 }
720 } else {
721 restoreToMark(stream, stream->parserMark);
722 }
723 stream->parserMark = NULL;
724 return success;
725}
726