]> git.saurik.com Git - apple/cf.git/blob - Parsing.subproj/CFXMLInputStream.c
CF-368.27.tar.gz
[apple/cf.git] / Parsing.subproj / CFXMLInputStream.c
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* CFXMLInputStream.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Chris Parker
26 */
27
28 #include <CoreFoundation/CFCharacterSet.h>
29 #include <CoreFoundation/CFURLAccess.h>
30 #include <string.h>
31 #include "CFStringEncodingConverter.h"
32 #include "CFUniChar.h"
33 #include "CFXMLInputStream.h"
34
35 /* Utility functions used in parsing */
36 static Boolean determineEncoding(_CFXMLInputStream *stream) {
37 const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
38 UInt32 length = CFDataGetLength(stream->data);
39 const uint8_t *idx = 0L, *end = 0L;
40 const uint8_t *base = 0L;
41 char quote = ' ';
42 Boolean useUTF8 = false;
43
44 // Check for the byte order mark first
45 if (length > 2) {
46 // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
47 if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
48 stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
49 if (*bytes == 0xFF) {
50 stream->currentByte = bytes + 2;
51 }
52 stream->encoding = kCFStringEncodingUnicode;
53 return true;
54 } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
55 stream->flags |= ENCODING_IS_UNICODE_NATURAL;
56 if (*bytes == 0xFE) {
57 stream->currentByte = bytes + 2;
58 }
59 stream->encoding = kCFStringEncodingUnicode;
60 return true;
61 } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
62 if(*bytes == 0xEF) {
63 stream->currentByte = bytes + 3;
64 }
65 stream->encoding = kCFStringEncodingUTF8;
66 stream->flags |= ENCODING_MATCHES_ASCII;
67 return true;
68 }
69 }
70 // Scan for the <?xml.... ?> opening
71 if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
72 useUTF8 = true;
73 }
74 if (!useUTF8) {
75 idx = bytes + 5;
76 end = bytes + length;
77 // Found "<?xml"; now we scan for "encoding"
78 while (idx < end) {
79 uint8_t ch = *idx;
80 const uint8_t *scan;
81 if ( ch == '?' || ch == '>') {
82 useUTF8 = true;
83 break;
84 }
85 idx ++;
86 scan = idx;
87 if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
88 idx = scan;
89 break;
90 }
91 }
92 if (!useUTF8 && idx >= end) {
93 useUTF8 = true;
94 }
95 }
96 if (!useUTF8) {
97 // Found "encoding="; see if we've got an honest-to-goodness encoding name
98 quote = *idx;
99 if (quote != '\'' && quote != '\"') {
100 useUTF8 = true;
101 }
102 }
103 if (!useUTF8) {
104 base = idx + 1; // Move past the quote character
105 idx ++;
106 while (idx < end && *idx != quote) idx ++;
107 if (idx >= end) {
108 useUTF8 = true;
109 }
110 }
111 if (!useUTF8) {
112 UInt32 len = idx - base;
113 if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
114 useUTF8 = true;
115 } else {
116 CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
117 stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
118 CFRelease(encodingName);
119 }
120 }
121 if (useUTF8) {
122 stream->encoding = kCFStringEncodingUTF8;
123 stream->flags |= ENCODING_MATCHES_ASCII;
124 return true;
125 } else if (stream->encoding == kCFStringEncodingInvalidId) {
126 return false;
127 } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
128 stream->flags |= ENCODING_MATCHES_ASCII;
129 }
130 return true;
131 }
132
133 CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
134 CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
135 if (numChars) {
136 CFStringAppendCharacters(string, characters, numChars);
137 }
138 }
139
140 __private_extern__ Boolean _openInputStream(_CFXMLInputStream *stream) {
141 if (NULL == stream->data && NULL != stream->url) {
142 CFDataRef data = NULL;
143 if (CFURLCreateDataAndPropertiesFromResource(stream->allocator, stream->url, &data, NULL, NULL, NULL)) {
144 stream->data = data;
145 }
146 }
147 if (NULL == stream->data) {
148 return false;
149 } else {
150 stream->currentByte = CFDataGetBytePtr(stream->data);
151 if (determineEncoding(stream)) {
152 stream->flags |= STREAM_OPEN;
153 return true;
154 } else {
155 return false;
156 }
157 }
158 }
159
160 __private_extern__ void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
161 stream->data = xmlData ? CFRetain(xmlData) : NULL;
162 stream->url = dataSource ? CFRetain(dataSource) : NULL;
163 stream->encoding = kCFStringEncodingInvalidId;
164 stream->currentByte = NULL;
165
166 stream->allocator = CFRetain(alloc);
167 stream->charBuffer = NULL;
168 stream->currentChar = NULL;
169 stream->mark = NULL;
170 stream->parserMark = NULL;
171 stream->bufferLength = 0;
172 stream->bufferCapacity = 0;
173
174 stream->charIndex = 1;
175 stream->lineNum = 1;
176
177 stream->flags = 0;
178 stream->nameSet = NULL;
179 stream->tempString = NULL;
180 }
181
182
183 __private_extern__ void _freeInputStream(_CFXMLInputStream *stream) {
184 if (stream->data) CFRelease(stream->data);
185 if (stream->url) CFRelease(stream->url);
186 if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
187 if (stream->nameSet) CFRelease(stream->nameSet);
188 if (stream->tempString) CFRelease(stream->tempString);
189 CFRelease(stream->allocator);
190 }
191
192 __private_extern__ CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
193 return stream->encoding;
194 }
195
196 __private_extern__ CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
197 return stream->charIndex;
198 }
199
200 __private_extern__ CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
201 return stream->lineNum;
202 }
203
204 __private_extern__ Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
205 if (!(stream->flags & STREAM_OPEN)) return false;
206 if (stream->currentChar) return false;
207 if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
208 return true;
209 }
210
211 __private_extern__ Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
212 return stream->flags & ENCODING_COMPOSITION_ERROR;
213 }
214
215 #define INITIAL_BUFFER_SIZE 64
216 static void growCharacterBuffer(_CFXMLInputStream *stream) {
217 if (!stream->charBuffer) {
218 stream->charBuffer = CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
219 stream->bufferCapacity = INITIAL_BUFFER_SIZE;
220 } else {
221 CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
222 CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
223 CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
224 UniChar *newBuffer = CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
225 stream->bufferCapacity *= 2;
226 if (newBuffer != stream->charBuffer) {
227 stream->charBuffer = newBuffer;
228 if (currCharDelta != -1) {
229 stream->currentChar = newBuffer + currCharDelta;
230 }
231 if (markDelta != -1) {
232 stream->mark = newBuffer + markDelta;
233 }
234 if (parserMarkDelta != -1) {
235 stream->parserMark = newBuffer + parserMarkDelta;
236 }
237 }
238 }
239 }
240
241 static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
242 const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
243 if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
244 CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
245 if (charsToTranslate > maxLength) {
246 charsToTranslate = maxLength;
247 }
248 if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
249 memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
250 stream->currentByte += (charsToTranslate * sizeof(UniChar));
251 } else {
252 CFIndex i;
253 uint8_t *baseBytePtr = (uint8_t *)base;
254 for (i = 0; i < charsToTranslate; i ++) {
255 *(baseBytePtr + 1) = *stream->currentByte;
256 *baseBytePtr = *(stream->currentByte + 1);
257 baseBytePtr += 2;
258 stream->currentByte += 2;
259 }
260 }
261 return charsToTranslate;
262 } else {
263 CFIndex lengthConsumed = 0, usedByteLength, usedCharLength;
264 UInt32 conversionResult;
265 if (stream->flags & ENCODING_MATCHES_ASCII) {
266 while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
267 if (*stream->currentByte > 0x7f) break;
268 *base = *stream->currentByte;
269 base ++;
270 stream->currentByte ++;
271 lengthConsumed ++;
272 }
273 if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
274 return lengthConsumed;
275 }
276 }
277 conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
278 if(kCFStringEncodingConversionSuccess != conversionResult) {
279 switch(conversionResult) {
280 case kCFStringEncodingConverterUnavailable:
281 case kCFStringEncodingInvalidInputStream:
282 stream->flags |= ENCODING_COMPOSITION_ERROR;
283 break;
284 case kCFStringEncodingInsufficientOutputBufferLength:
285 default:
286 break;
287 }
288 }
289 if (usedByteLength > 0) {
290 stream->currentByte += usedByteLength;
291 lengthConsumed += usedCharLength;
292 }
293 return lengthConsumed;
294 }
295 }
296
297 // returns number of characters filled
298 CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
299 CFIndex numFilled;
300 if (stream->bufferLength >= stream->bufferCapacity) return 0;
301 // Try and fill in the remaining characters
302 numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
303 if (numFilled != 0) {
304 stream->currentChar = stream->charBuffer + stream->bufferLength;
305 stream->bufferLength += numFilled;
306 }
307 return numFilled;
308 }
309
310 // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate. Does not check for EOF; it is the caller's responsibility to verify this.
311 static void fillCharacterBuffer(_CFXMLInputStream *stream) {
312 if (!stream->charBuffer) {
313 growCharacterBuffer(stream);
314 }
315 if (!stream->mark && !stream->parserMark) {
316 // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
317 CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
318 stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
319 CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
320 stream->currentChar = stream->charBuffer;
321 } else {
322 // We do everything we can not to allocate; first we fill any remaining characters. If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
323 Boolean done;
324
325 // First try just filling the remaining capacity
326 done = (fillToCapacity(stream) != 0);
327 if (!done) {
328 const UniChar *leftMostMark;
329 if (stream->mark && !stream->parserMark) {
330 leftMostMark = stream->mark;
331 } else if (stream->parserMark && !stream->mark) {
332 leftMostMark = stream->parserMark;
333 } else if (stream->parserMark < stream->mark) {
334 leftMostMark = stream->parserMark;
335 } else {
336 leftMostMark = stream->mark;
337 }
338 if (leftMostMark > stream->charBuffer) {
339 CFIndex delta = leftMostMark - stream->charBuffer;
340 memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
341 stream->bufferLength -= delta;
342 if (stream->mark) {
343 stream->mark -= delta;
344 }
345 if (stream->parserMark) {
346 stream->parserMark -= delta;
347 }
348 // Now try to fill the newly-opened space
349 done = (fillToCapacity(stream) != 0);
350 delta = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
351 }
352 }
353 if (!done) {
354 // No help for it; now we must allocate
355 growCharacterBuffer(stream);
356 fillToCapacity(stream); // If this doesn't work, we give up.
357 }
358 }
359 }
360
361 /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time. (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version. Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
362 */
363 static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
364 if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
365 return false; // EOF
366 } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
367 (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
368 (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
369 // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
370 if (stream->flags & ENCODING_MATCHES_ASCII) {
371 *ch = (UniChar)*(stream->currentByte);
372 if (advanceStream) {
373 stream->currentByte ++;
374 }
375 } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
376 *ch = (*stream->currentByte) << 8;
377 *ch += *(stream->currentByte + 1);
378 if (advanceStream) {
379 stream->currentByte += 2;
380 }
381 } else {
382 // Unicode with swapped bytes
383 *ch = (*(stream->currentByte + 1)) << 8;
384 *ch += *stream->currentByte;
385 if (advanceStream) {
386 stream->currentByte += 2;
387 }
388 }
389 } else {
390 fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
391 if (!stream->charBuffer || !stream->currentChar) {
392 return false;
393 } else {
394 *ch = *(stream->currentChar);
395 if (advanceStream) {
396 stream->currentChar ++;
397 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
398 stream->currentChar = NULL;
399 }
400 }
401 }
402 }
403 return true;
404 }
405
406 /* See comments above getCharacterGuts()
407 */
408 CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
409 if (!(stream->flags & STREAM_OPEN)) {
410 return false;
411 } else if (stream->currentChar) {
412 *ch = *stream->currentChar;
413 if (advanceStream) {
414 stream->currentChar ++;
415 if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
416 stream->currentChar = NULL;
417 }
418 }
419 } else {
420 if (!getCharacterGuts(stream, ch, advanceStream)) return false;
421 }
422 if (advanceStream) {
423 UniChar nextChar;
424 stream->charIndex ++;
425 if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
426 }
427 return true;
428 }
429
430 __private_extern__ Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
431 return getCharacter(stream, ch, false);
432 }
433
434 __private_extern__ Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
435 return getCharacter(stream, ch, true);
436 }
437
438 __private_extern__ Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
439 Boolean decrementLineNum = false;
440 if (ch == '\n') {
441 decrementLineNum = true;
442 } else if (ch == '\r') {
443 UniChar nextChar;
444 if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
445 decrementLineNum = true;
446 }
447 }
448
449 if (!(stream->flags & STREAM_OPEN)) {
450 return false;
451 } else if (stream->currentChar) {
452 if (stream->currentChar != stream->charBuffer) {
453 stream->currentChar --;
454 } else {
455 // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
456 if (stream->bufferLength >= stream->bufferCapacity) {
457 growCharacterBuffer(stream);
458 }
459 memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
460 *stream->charBuffer = ch;
461 stream->bufferLength ++;
462 if (stream->mark) {
463 stream->mark ++;
464 }
465 if (stream->parserMark) {
466 stream->parserMark ++;
467 }
468 }
469 } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
470 // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data. That last character is the one being returned.
471 stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
472 } else if (stream->charBuffer) {
473 // We have processed all the meaningful characters from charBuffer and have no reason to preserve them. We use charBuffer to hold this one character that has been returned to us.
474 *stream->charBuffer = ch;
475 stream->currentChar = stream->charBuffer;
476 stream->bufferLength = 1;
477 if (stream->mark) {
478 stream->mark ++;
479 }
480 if (stream->parserMark) {
481 stream->parserMark ++;
482 }
483 } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
484 // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character. The former means we can just back up the byte pointer; the latter means Bad Things have happened.
485 if (stream->flags & ENCODING_MATCHES_ASCII) {
486 stream->currentByte --;
487 } else { // Must be Unicode
488 stream->currentByte -= 2;
489 }
490 } else {
491 return false;
492 }
493 stream->charIndex --;
494 if (decrementLineNum) {
495 stream->lineNum --;
496 }
497 return true;
498 }
499
500 // Returns the pointer to hold as the mark
501 static UniChar *dropMark(_CFXMLInputStream *stream) {
502 if (stream->currentChar) {
503 return stream->currentChar;
504 } else if (stream->mark || stream->parserMark) {
505 return stream->charBuffer + stream->bufferLength;
506 } else {
507 if (!stream->charBuffer) {
508 growCharacterBuffer(stream);
509 }
510 stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
511 return stream->charBuffer;
512 }
513
514 }
515
516 __private_extern__ void _inputStreamSetMark(_CFXMLInputStream *stream) {
517 CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
518 stream->mark = dropMark(stream);
519 }
520
521 __private_extern__ void _inputStreamClearMark(_CFXMLInputStream *stream) {
522 CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
523 stream->mark = NULL;
524 }
525
526 __private_extern__ void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
527 UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
528 CFIndex numChars = end - stream->mark;
529 CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
530 _fillStringWithCharacters(string, stream->mark, numChars);
531 }
532
533 static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
534 UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
535 if (end > mark) {
536 CFIndex numChars = end - mark;
537 stream->charIndex -= numChars;
538 stream->currentChar = mark;
539
540 // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
541 if (*(end - 1) == '\r') {
542 UniChar nextChar;
543 if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
544 end --;
545 }
546 }
547 while (end != mark) {
548 end --;
549 if (*end == '\r') {
550 stream->lineNum --;
551 } else if (*end == '\n') {
552 stream->lineNum --;
553 if (end != mark && *(end - 1) == '\r') {
554 end --;
555 }
556 }
557 }
558 }
559 }
560
561 __private_extern__ void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
562 CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
563 restoreToMark(stream, stream->mark);
564 }
565
566 CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
567 return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
568 }
569
570 __private_extern__ CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
571 UniChar ch;
572 CFIndex len = 0;
573 if (str) {
574 stream->parserMark = dropMark(stream);
575 }
576 while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
577 len ++;
578 }
579 if (!isWhitespaceChar(ch)) {
580 _inputStreamReturnCharacter(stream, ch);
581 }
582 if (str) {
583 _fillStringWithCharacters(str, stream->parserMark, len);
584 stream->parserMark = NULL;
585 }
586 return len;
587 }
588
589 // false return means EOF was encountered without finding scanChars
590 __private_extern__ Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
591 Boolean done = false;
592 CFIndex firstRepeatIndex = -1;
593 CFIndex len = 0;
594 stream->parserMark = dropMark(stream);
595 do {
596 UniChar ch;
597 while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
598 len ++;
599 }
600 if (ch != scanChars[0]) {
601 restoreToMark(stream, stream->parserMark);
602 stream->parserMark = NULL;
603 return false;
604 } else {
605 CFIndex i;
606 for (i = 1; i < numChars; i ++) {
607 if (!_inputStreamGetCharacter(stream, &ch)) break;
608 if (ch != scanChars[i]) break;
609 }
610 if (i == numChars) {
611 done = true;
612 } else {
613 if (firstRepeatIndex == -1) {
614 CFIndex j;
615 for (j = 1; j < numChars; j ++) {
616 if (scanChars[0] == scanChars[j]) {
617 break;
618 }
619 }
620 firstRepeatIndex = j;
621 }
622 _inputStreamReturnCharacter(stream, ch);
623 while (i > firstRepeatIndex) {
624 i --;
625 _inputStreamReturnCharacter(stream, scanChars[i]);
626 }
627 len += i;
628 }
629 }
630 } while (!done);
631 if (str) {
632 _fillStringWithCharacters(str, stream->parserMark, len);
633 }
634 stream->parserMark = NULL;
635 return true;
636 }
637
638 __private_extern__ Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
639 const UniChar *end = stringToMatch+length;
640 const UniChar *sPtr=stringToMatch;
641 stream->parserMark = dropMark(stream);
642 while (sPtr < end) {
643 UniChar ch;
644 if (!_inputStreamGetCharacter(stream, &ch)) break;
645 if (ch != *sPtr) break;
646 sPtr ++;
647 }
648 if (sPtr != end) {
649 restoreToMark(stream, stream->parserMark);
650 stream->parserMark = NULL;
651 return false;
652 } else {
653 stream->parserMark = NULL;
654 return true;
655 }
656 }
657
658 __private_extern__ Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
659 UniChar ch;
660 if (!_inputStreamPeekCharacter(stream, &ch)) return false;
661 if (ch != '\'' && ch != '\"') return false;
662
663 _inputStreamGetCharacter(stream, &ch);
664 if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
665 return false;
666 }
667 return true;
668 }
669
670 /*
671 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
672 [5] Name ::= (Letter | '_' | ':') (NameChar)*
673 [7] Nmtoken ::= (NameChar)+
674 [84] Letter ::= BaseChar | Ideographic
675
676 We don't do this quite right; we rely on the Unicode charsets to do this analysis. While
677 the productions in the XML spec are based on the Unicode character sets, the definitions
678 differ slightly to avoid those areas where the Unicode standard is still being resolved.
679 At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
680 the vast majority of parsers out there.
681
682 Letter == kCFUniCharLetterCharacterSet
683 Digit == kCFUniCharDecimalDigitCharacterSet
684 CombiningChar == kCFUniCharNonBaseCharacterSet
685 Extender - complex, and not represented by a uniform character set.
686 */
687 __private_extern__ Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
688 UniChar ch;
689 Boolean success = true;
690 stream->parserMark = dropMark(stream);
691 if (!isNMToken) {
692 // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
693 if (!getCharacter(stream, &ch, false)) {
694 success = false;
695 } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
696 success = false;
697 } else {
698 getCharacter(stream, &ch, true);
699 }
700 }
701 if (success) {
702 while (getCharacter(stream, &ch, true)) {
703 if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet) && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
704 _inputStreamReturnCharacter(stream, ch);
705 break;
706 }
707 }
708 if (stream->currentChar == stream->parserMark) {
709 success = false; // Must have processed at least one character
710 }
711 }
712 if (success) {
713 if (str) {
714 if (!stream->nameSet) {
715 stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
716 stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
717 }
718 CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
719 if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
720 *str = CFStringCreateCopy(stream->allocator, stream->tempString);
721 CFSetAddValue(stream->nameSet, *str);
722 CFRelease(*str);
723 }
724 }
725 } else {
726 restoreToMark(stream, stream->parserMark);
727 }
728 stream->parserMark = NULL;
729 return success;
730 }
731