]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2014 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | /* CFXMLInputStream.c | |
25 | Copyright (c) 1999-2014, Apple Inc. All rights reserved. | |
26 | Responsibility: David Smith | |
27 | */ | |
28 | ||
29 | #include "CFXMLInputStream.h" | |
30 | #include <CoreFoundation/CFCharacterSet.h> | |
31 | #include <string.h> | |
32 | #include "CFStringEncodingConverter.h" | |
33 | #include "CFUniChar.h" | |
34 | ||
35 | /* Utility functions used in parsing */ | |
36 | static Boolean determineEncoding(_CFXMLInputStream *stream) { | |
37 | const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data); | |
38 | UInt32 length = CFDataGetLength(stream->data); | |
39 | const uint8_t *idx = 0L, *end = 0L; | |
40 | const uint8_t *base = 0L; | |
41 | char quote = ' '; | |
42 | Boolean useUTF8 = false; | |
43 | ||
44 | // Check for the byte order mark first | |
45 | if (length > 2) { | |
46 | // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec | |
47 | if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) { | |
48 | #if __BIG_ENDIAN__ | |
49 | stream->flags |= ENCODING_IS_UNICODE_SWAPPED; | |
50 | #else | |
51 | stream->flags |= ENCODING_IS_UNICODE_NATURAL; | |
52 | #endif | |
53 | if (*bytes == 0xFF) { | |
54 | stream->currentByte = bytes + 2; | |
55 | } | |
56 | stream->encoding = kCFStringEncodingUnicode; | |
57 | return true; | |
58 | } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) { | |
59 | #if __BIG_ENDIAN__ | |
60 | stream->flags |= ENCODING_IS_UNICODE_NATURAL; | |
61 | #else | |
62 | stream->flags |= ENCODING_IS_UNICODE_SWAPPED; | |
63 | #endif | |
64 | if (*bytes == 0xFE) { | |
65 | stream->currentByte = bytes + 2; | |
66 | } | |
67 | stream->encoding = kCFStringEncodingUnicode; | |
68 | return true; | |
69 | } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) { | |
70 | if(*bytes == 0xEF) { | |
71 | stream->currentByte = bytes + 3; | |
72 | } | |
73 | stream->encoding = kCFStringEncodingUTF8; | |
74 | stream->flags |= ENCODING_MATCHES_ASCII; | |
75 | return true; | |
76 | } | |
77 | } | |
78 | // Scan for the <?xml.... ?> opening | |
79 | if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) { | |
80 | useUTF8 = true; | |
81 | } | |
82 | if (!useUTF8) { | |
83 | idx = bytes + 5; | |
84 | end = bytes + length; | |
85 | // Found "<?xml"; now we scan for "encoding" | |
86 | while (idx < end) { | |
87 | uint8_t ch = *idx; | |
88 | const uint8_t *scan; | |
89 | if ( ch == '?' || ch == '>') { | |
90 | useUTF8 = true; | |
91 | break; | |
92 | } | |
93 | idx ++; | |
94 | scan = idx; | |
95 | if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') { | |
96 | idx = scan; | |
97 | break; | |
98 | } | |
99 | } | |
100 | if (!useUTF8 && idx >= end) { | |
101 | useUTF8 = true; | |
102 | } | |
103 | } | |
104 | if (!useUTF8) { | |
105 | // Found "encoding="; see if we've got an honest-to-goodness encoding name | |
106 | quote = *idx; | |
107 | if (quote != '\'' && quote != '\"') { | |
108 | useUTF8 = true; | |
109 | } | |
110 | } | |
111 | if (!useUTF8) { | |
112 | base = idx + 1; // Move past the quote character | |
113 | idx ++; | |
114 | while (idx < end && *idx != quote) idx ++; | |
115 | if (idx >= end) { | |
116 | useUTF8 = true; | |
117 | } | |
118 | } | |
119 | if (!useUTF8) { | |
120 | UInt32 len = idx - base; | |
121 | if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) { | |
122 | useUTF8 = true; | |
123 | } else { | |
124 | CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false); | |
125 | stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName); | |
126 | CFRelease(encodingName); | |
127 | } | |
128 | } | |
129 | if (useUTF8) { | |
130 | stream->encoding = kCFStringEncodingUTF8; | |
131 | stream->flags |= ENCODING_MATCHES_ASCII; | |
132 | return true; | |
133 | } else if (stream->encoding == kCFStringEncodingInvalidId) { | |
134 | return false; | |
135 | } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) { | |
136 | stream->flags |= ENCODING_MATCHES_ASCII; | |
137 | } | |
138 | return true; | |
139 | } | |
140 | ||
141 | CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) { | |
142 | CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string))); | |
143 | if (numChars) { | |
144 | CFStringAppendCharacters(string, characters, numChars); | |
145 | } | |
146 | } | |
147 | ||
148 | CF_PRIVATE Boolean _openInputStream(_CFXMLInputStream *stream) { | |
149 | if (NULL == stream->data) { | |
150 | return false; | |
151 | } else { | |
152 | stream->currentByte = CFDataGetBytePtr(stream->data); | |
153 | if (determineEncoding(stream)) { | |
154 | stream->flags |= STREAM_OPEN; | |
155 | return true; | |
156 | } else { | |
157 | return false; | |
158 | } | |
159 | } | |
160 | } | |
161 | ||
162 | CF_PRIVATE void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) { | |
163 | stream->data = xmlData ? (CFDataRef)CFRetain(xmlData) : NULL; | |
164 | stream->url = dataSource ? (CFURLRef)CFRetain(dataSource) : NULL; | |
165 | stream->encoding = kCFStringEncodingInvalidId; | |
166 | stream->currentByte = NULL; | |
167 | ||
168 | stream->allocator = (CFAllocatorRef)CFRetain(alloc); | |
169 | stream->charBuffer = NULL; | |
170 | stream->currentChar = NULL; | |
171 | stream->mark = NULL; | |
172 | stream->parserMark = NULL; | |
173 | stream->bufferLength = 0; | |
174 | stream->bufferCapacity = 0; | |
175 | ||
176 | stream->charIndex = 1; | |
177 | stream->lineNum = 1; | |
178 | ||
179 | stream->flags = 0; | |
180 | stream->nameSet = NULL; | |
181 | stream->tempString = NULL; | |
182 | } | |
183 | ||
184 | ||
185 | CF_PRIVATE void _freeInputStream(_CFXMLInputStream *stream) { | |
186 | if (stream->data) CFRelease(stream->data); | |
187 | if (stream->url) CFRelease(stream->url); | |
188 | if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer); | |
189 | if (stream->nameSet) CFRelease(stream->nameSet); | |
190 | if (stream->tempString) CFRelease(stream->tempString); | |
191 | CFRelease(stream->allocator); | |
192 | } | |
193 | ||
194 | CF_PRIVATE CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) { | |
195 | return stream->encoding; | |
196 | } | |
197 | ||
198 | CF_PRIVATE CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) { | |
199 | return stream->charIndex; | |
200 | } | |
201 | ||
202 | CF_PRIVATE CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) { | |
203 | return stream->lineNum; | |
204 | } | |
205 | ||
206 | CF_PRIVATE Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) { | |
207 | if (!(stream->flags & STREAM_OPEN)) return false; | |
208 | if (stream->currentChar) return false; | |
209 | if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false; | |
210 | return true; | |
211 | } | |
212 | ||
213 | CF_PRIVATE Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) { | |
214 | return stream->flags & ENCODING_COMPOSITION_ERROR; | |
215 | } | |
216 | ||
217 | #define INITIAL_BUFFER_SIZE 64 | |
218 | static void growCharacterBuffer(_CFXMLInputStream *stream) { | |
219 | if (!stream->charBuffer) { | |
220 | stream->charBuffer = (UniChar *)CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0); | |
221 | stream->bufferCapacity = INITIAL_BUFFER_SIZE; | |
222 | } else { | |
223 | CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1; | |
224 | CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1; | |
225 | CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1; | |
226 | UniChar *newBuffer = (UniChar *)CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0); | |
227 | stream->bufferCapacity *= 2; | |
228 | if (newBuffer != stream->charBuffer) { | |
229 | stream->charBuffer = newBuffer; | |
230 | if (currCharDelta != -1) { | |
231 | stream->currentChar = newBuffer + currCharDelta; | |
232 | } | |
233 | if (markDelta != -1) { | |
234 | stream->mark = newBuffer + markDelta; | |
235 | } | |
236 | if (parserMarkDelta != -1) { | |
237 | stream->parserMark = newBuffer + parserMarkDelta; | |
238 | } | |
239 | } | |
240 | } | |
241 | } | |
242 | ||
243 | static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) { | |
244 | const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data); | |
245 | if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) { | |
246 | CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar); | |
247 | if (charsToTranslate > maxLength) { | |
248 | charsToTranslate = maxLength; | |
249 | } | |
250 | if (stream->flags & ENCODING_IS_UNICODE_NATURAL) { | |
251 | memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar)); | |
252 | stream->currentByte += (charsToTranslate * sizeof(UniChar)); | |
253 | } else { | |
254 | CFIndex i; | |
255 | uint8_t *baseBytePtr = (uint8_t *)base; | |
256 | for (i = 0; i < charsToTranslate; i ++) { | |
257 | *(baseBytePtr + 1) = *stream->currentByte; | |
258 | *baseBytePtr = *(stream->currentByte + 1); | |
259 | baseBytePtr += 2; | |
260 | stream->currentByte += 2; | |
261 | } | |
262 | } | |
263 | return charsToTranslate; | |
264 | } else { | |
265 | CFIndex lengthConsumed = 0; | |
266 | CFIndex usedByteLength, usedCharLength; | |
267 | UInt32 conversionResult; | |
268 | if (stream->flags & ENCODING_MATCHES_ASCII) { | |
269 | while (stream->currentByte < dataEnd && lengthConsumed < maxLength) { | |
270 | if (*stream->currentByte > 0x7f) break; | |
271 | *base = *stream->currentByte; | |
272 | base ++; | |
273 | stream->currentByte ++; | |
274 | lengthConsumed ++; | |
275 | } | |
276 | if (stream->currentByte == dataEnd || lengthConsumed == maxLength) { | |
277 | return lengthConsumed; | |
278 | } | |
279 | } | |
280 | conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength); | |
281 | if(kCFStringEncodingConversionSuccess != conversionResult) { | |
282 | switch(conversionResult) { | |
283 | case kCFStringEncodingConverterUnavailable: | |
284 | case kCFStringEncodingInvalidInputStream: | |
285 | stream->flags |= ENCODING_COMPOSITION_ERROR; | |
286 | break; | |
287 | case kCFStringEncodingInsufficientOutputBufferLength: | |
288 | default: | |
289 | break; | |
290 | } | |
291 | } | |
292 | if (usedByteLength > 0) { | |
293 | stream->currentByte += usedByteLength; | |
294 | lengthConsumed += usedCharLength; | |
295 | } | |
296 | return lengthConsumed; | |
297 | } | |
298 | } | |
299 | ||
300 | // returns number of characters filled | |
301 | CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) { | |
302 | CFIndex numFilled; | |
303 | if (stream->bufferLength >= stream->bufferCapacity) return 0; | |
304 | // Try and fill in the remaining characters | |
305 | numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream); | |
306 | if (numFilled != 0) { | |
307 | stream->currentChar = stream->charBuffer + stream->bufferLength; | |
308 | stream->bufferLength += numFilled; | |
309 | } | |
310 | return numFilled; | |
311 | } | |
312 | ||
313 | // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate. Does not check for EOF; it is the caller's responsibility to verify this. | |
314 | static void fillCharacterBuffer(_CFXMLInputStream *stream) { | |
315 | if (!stream->charBuffer) { | |
316 | growCharacterBuffer(stream); | |
317 | } | |
318 | if (!stream->mark && !stream->parserMark) { | |
319 | // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer | |
320 | CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer | |
321 | stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream); | |
322 | CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption"); | |
323 | stream->currentChar = stream->charBuffer; | |
324 | } else { | |
325 | // We do everything we can not to allocate; first we fill any remaining characters. If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters. | |
326 | Boolean done; | |
327 | ||
328 | // First try just filling the remaining capacity | |
329 | done = (fillToCapacity(stream) != 0); | |
330 | if (!done) { | |
331 | const UniChar *leftMostMark; | |
332 | if (stream->mark && !stream->parserMark) { | |
333 | leftMostMark = stream->mark; | |
334 | } else if (stream->parserMark && !stream->mark) { | |
335 | leftMostMark = stream->parserMark; | |
336 | } else if (stream->parserMark < stream->mark) { | |
337 | leftMostMark = stream->parserMark; | |
338 | } else { | |
339 | leftMostMark = stream->mark; | |
340 | } | |
341 | if (leftMostMark > stream->charBuffer) { | |
342 | CFIndex delta = leftMostMark - stream->charBuffer; | |
343 | memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar)); | |
344 | stream->bufferLength -= delta; | |
345 | if (stream->mark) { | |
346 | stream->mark -= delta; | |
347 | } | |
348 | if (stream->parserMark) { | |
349 | stream->parserMark -= delta; | |
350 | } | |
351 | // Now try to fill the newly-opened space | |
352 | done = (fillToCapacity(stream) != 0); | |
353 | delta = loadCharacters(stream->charBuffer + stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream); | |
354 | } | |
355 | } | |
356 | if (!done) { | |
357 | // No help for it; now we must allocate | |
358 | growCharacterBuffer(stream); | |
359 | fillToCapacity(stream); // If this doesn't work, we give up. | |
360 | } | |
361 | } | |
362 | } | |
363 | ||
364 | /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time. (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version. Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this. | |
365 | */ | |
366 | static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) { | |
367 | if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) { | |
368 | return false; // EOF | |
369 | } else if (!((stream->mark || stream->parserMark) && advanceStream) && | |
370 | (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) || | |
371 | (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) { | |
372 | // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character. | |
373 | if (stream->flags & ENCODING_MATCHES_ASCII) { | |
374 | *ch = (UniChar)*(stream->currentByte); | |
375 | if (advanceStream) { | |
376 | stream->currentByte ++; | |
377 | } | |
378 | } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) { | |
379 | *ch = *(UniChar *)(stream->currentByte); | |
380 | if (advanceStream) { | |
381 | stream->currentByte += 2; | |
382 | } | |
383 | } else { | |
384 | // Unicode with swapped bytes | |
385 | *ch = CFSwapInt16(*(UniChar *)(stream->currentByte)); | |
386 | if (advanceStream) { | |
387 | stream->currentByte += 2; | |
388 | } | |
389 | } | |
390 | } else { | |
391 | fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing | |
392 | if (!stream->charBuffer || !stream->currentChar) { | |
393 | return false; | |
394 | } else { | |
395 | *ch = *(stream->currentChar); | |
396 | if (advanceStream) { | |
397 | stream->currentChar ++; | |
398 | if (stream->currentChar == stream->charBuffer + stream->bufferLength) { | |
399 | stream->currentChar = NULL; | |
400 | } | |
401 | } | |
402 | } | |
403 | } | |
404 | return true; | |
405 | } | |
406 | ||
407 | /* See comments above getCharacterGuts() | |
408 | */ | |
409 | CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) { | |
410 | if (!(stream->flags & STREAM_OPEN)) { | |
411 | return false; | |
412 | } else if (stream->currentChar) { | |
413 | *ch = *stream->currentChar; | |
414 | if (advanceStream) { | |
415 | stream->currentChar ++; | |
416 | if (stream->currentChar == stream->charBuffer + stream->bufferLength) { | |
417 | stream->currentChar = NULL; | |
418 | } | |
419 | } | |
420 | } else { | |
421 | if (!getCharacterGuts(stream, ch, advanceStream)) return false; | |
422 | } | |
423 | if (advanceStream) { | |
424 | UniChar nextChar; | |
425 | stream->charIndex ++; | |
426 | if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++; | |
427 | } | |
428 | return true; | |
429 | } | |
430 | ||
431 | CF_PRIVATE Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) { | |
432 | return getCharacter(stream, ch, false); | |
433 | } | |
434 | ||
435 | CF_PRIVATE Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) { | |
436 | return getCharacter(stream, ch, true); | |
437 | } | |
438 | ||
439 | CF_PRIVATE Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) { | |
440 | Boolean decrementLineNum = false; | |
441 | if (ch == '\n') { | |
442 | decrementLineNum = true; | |
443 | } else if (ch == '\r') { | |
444 | UniChar nextChar; | |
445 | if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') { | |
446 | decrementLineNum = true; | |
447 | } | |
448 | } | |
449 | ||
450 | if (!(stream->flags & STREAM_OPEN)) { | |
451 | return false; | |
452 | } else if (stream->currentChar) { | |
453 | if (stream->currentChar != stream->charBuffer) { | |
454 | stream->currentChar --; | |
455 | } else { | |
456 | // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer | |
457 | if (stream->bufferLength >= stream->bufferCapacity) { | |
458 | growCharacterBuffer(stream); | |
459 | } | |
460 | memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar)); | |
461 | *stream->charBuffer = ch; | |
462 | stream->bufferLength ++; | |
463 | if (stream->mark) { | |
464 | stream->mark ++; | |
465 | } | |
466 | if (stream->parserMark) { | |
467 | stream->parserMark ++; | |
468 | } | |
469 | } | |
470 | } else if ((stream->mark || stream->parserMark) && stream->bufferLength) { | |
471 | // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data. That last character is the one being returned. | |
472 | stream->currentChar = stream->charBuffer + stream->bufferLength - 1; | |
473 | } else if (stream->charBuffer) { | |
474 | // We have processed all the meaningful characters from charBuffer and have no reason to preserve them. We use charBuffer to hold this one character that has been returned to us. | |
475 | *stream->charBuffer = ch; | |
476 | stream->currentChar = stream->charBuffer; | |
477 | stream->bufferLength = 1; | |
478 | if (stream->mark) { | |
479 | stream->mark ++; | |
480 | } | |
481 | if (stream->parserMark) { | |
482 | stream->parserMark ++; | |
483 | } | |
484 | } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) { | |
485 | // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character. The former means we can just back up the byte pointer; the latter means Bad Things have happened. | |
486 | if (stream->flags & ENCODING_MATCHES_ASCII) { | |
487 | stream->currentByte --; | |
488 | } else { // Must be Unicode | |
489 | stream->currentByte -= 2; | |
490 | } | |
491 | } else { | |
492 | return false; | |
493 | } | |
494 | stream->charIndex --; | |
495 | if (decrementLineNum) { | |
496 | stream->lineNum --; | |
497 | } | |
498 | return true; | |
499 | } | |
500 | ||
501 | // Returns the pointer to hold as the mark | |
502 | static UniChar *dropMark(_CFXMLInputStream *stream) { | |
503 | if (stream->currentChar) { | |
504 | return stream->currentChar; | |
505 | } else if (stream->mark || stream->parserMark) { | |
506 | return stream->charBuffer + stream->bufferLength; | |
507 | } else { | |
508 | if (!stream->charBuffer) { | |
509 | growCharacterBuffer(stream); | |
510 | } | |
511 | stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested | |
512 | return stream->charBuffer; | |
513 | } | |
514 | ||
515 | } | |
516 | ||
517 | CF_PRIVATE void _inputStreamSetMark(_CFXMLInputStream *stream) { | |
518 | CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed"); | |
519 | stream->mark = dropMark(stream); | |
520 | } | |
521 | ||
522 | CF_PRIVATE void _inputStreamClearMark(_CFXMLInputStream *stream) { | |
523 | CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed"); | |
524 | stream->mark = NULL; | |
525 | } | |
526 | ||
527 | CF_PRIVATE void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) { | |
528 | UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength; | |
529 | CFIndex numChars = end - stream->mark; | |
530 | CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream"); | |
531 | _fillStringWithCharacters(string, stream->mark, numChars); | |
532 | } | |
533 | ||
534 | static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) { | |
535 | UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength; | |
536 | if (end > mark) { | |
537 | CFIndex numChars = end - mark; | |
538 | stream->charIndex -= numChars; | |
539 | stream->currentChar = mark; | |
540 | ||
541 | // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF | |
542 | if (*(end - 1) == '\r') { | |
543 | UniChar nextChar; | |
544 | if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') { | |
545 | end --; | |
546 | } | |
547 | } | |
548 | while (end != mark) { | |
549 | end --; | |
550 | if (*end == '\r') { | |
551 | stream->lineNum --; | |
552 | } else if (*end == '\n') { | |
553 | stream->lineNum --; | |
554 | if (end != mark && *(end - 1) == '\r') { | |
555 | end --; | |
556 | } | |
557 | } | |
558 | } | |
559 | } | |
560 | } | |
561 | ||
562 | CF_PRIVATE void _inputStreamBackUpToMark(_CFXMLInputStream *stream) { | |
563 | CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream"); | |
564 | restoreToMark(stream, stream->mark); | |
565 | } | |
566 | ||
567 | CF_INLINE Boolean isWhitespaceChar(UniChar ch) { | |
568 | return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t'); | |
569 | } | |
570 | ||
571 | CF_PRIVATE CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) { | |
572 | UniChar ch; | |
573 | CFIndex len = 0; | |
574 | if (str) { | |
575 | stream->parserMark = dropMark(stream); | |
576 | } | |
577 | while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) { | |
578 | len ++; | |
579 | } | |
580 | if (!isWhitespaceChar(ch)) { | |
581 | _inputStreamReturnCharacter(stream, ch); | |
582 | } | |
583 | if (str) { | |
584 | _fillStringWithCharacters(str, stream->parserMark, len); | |
585 | stream->parserMark = NULL; | |
586 | } | |
587 | return len; | |
588 | } | |
589 | ||
590 | // false return means EOF was encountered without finding scanChars | |
591 | CF_PRIVATE Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) { | |
592 | Boolean done = false; | |
593 | CFIndex firstRepeatIndex = -1; | |
594 | CFIndex len = 0; | |
595 | stream->parserMark = dropMark(stream); | |
596 | do { | |
597 | UniChar ch; | |
598 | while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) { | |
599 | len ++; | |
600 | } | |
601 | if (ch != scanChars[0]) { | |
602 | restoreToMark(stream, stream->parserMark); | |
603 | stream->parserMark = NULL; | |
604 | return false; | |
605 | } else { | |
606 | CFIndex i; | |
607 | for (i = 1; i < numChars; i ++) { | |
608 | if (!_inputStreamGetCharacter(stream, &ch)) break; | |
609 | if (ch != scanChars[i]) break; | |
610 | } | |
611 | if (i == numChars) { | |
612 | done = true; | |
613 | } else { | |
614 | if (firstRepeatIndex == -1) { | |
615 | CFIndex j; | |
616 | for (j = 1; j < numChars; j ++) { | |
617 | if (scanChars[0] == scanChars[j]) { | |
618 | break; | |
619 | } | |
620 | } | |
621 | firstRepeatIndex = j; | |
622 | } | |
623 | _inputStreamReturnCharacter(stream, ch); | |
624 | while (i > firstRepeatIndex) { | |
625 | i --; | |
626 | _inputStreamReturnCharacter(stream, scanChars[i]); | |
627 | } | |
628 | len += i; | |
629 | } | |
630 | } | |
631 | } while (!done); | |
632 | if (str) { | |
633 | _fillStringWithCharacters(str, stream->parserMark, len); | |
634 | } | |
635 | stream->parserMark = NULL; | |
636 | return true; | |
637 | } | |
638 | ||
639 | CF_PRIVATE Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) { | |
640 | const UniChar *end = stringToMatch+length; | |
641 | const UniChar *sPtr=stringToMatch; | |
642 | stream->parserMark = dropMark(stream); | |
643 | while (sPtr < end) { | |
644 | UniChar ch; | |
645 | if (!_inputStreamGetCharacter(stream, &ch)) break; | |
646 | if (ch != *sPtr) break; | |
647 | sPtr ++; | |
648 | } | |
649 | if (sPtr != end) { | |
650 | restoreToMark(stream, stream->parserMark); | |
651 | stream->parserMark = NULL; | |
652 | return false; | |
653 | } else { | |
654 | stream->parserMark = NULL; | |
655 | return true; | |
656 | } | |
657 | } | |
658 | ||
659 | CF_PRIVATE Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) { | |
660 | UniChar ch; | |
661 | if (!_inputStreamPeekCharacter(stream, &ch)) return false; | |
662 | if (ch != '\'' && ch != '\"') return false; | |
663 | ||
664 | _inputStreamGetCharacter(stream, &ch); | |
665 | if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) { | |
666 | return false; | |
667 | } | |
668 | return true; | |
669 | } | |
670 | ||
671 | /* | |
672 | [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender | |
673 | [5] Name ::= (Letter | '_' | ':') (NameChar)* | |
674 | [7] Nmtoken ::= (NameChar)+ | |
675 | [84] Letter ::= BaseChar | Ideographic | |
676 | ||
677 | We don't do this quite right; we rely on the Unicode charsets to do this analysis. While | |
678 | the productions in the XML spec are based on the Unicode character sets, the definitions | |
679 | differ slightly to avoid those areas where the Unicode standard is still being resolved. | |
680 | At any rate, I'd lay money that using the Unicode charsets, we will be more correct than | |
681 | the vast majority of parsers out there. | |
682 | ||
683 | Letter == kCFUniCharLetterCharacterSet | |
684 | Digit == kCFUniCharDecimalDigitCharacterSet | |
685 | CombiningChar == kCFUniCharNonBaseCharacterSet | |
686 | Extender - complex, and not represented by a uniform character set. | |
687 | */ | |
688 | CF_PRIVATE Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) { | |
689 | UniChar ch; | |
690 | Boolean success = true; | |
691 | stream->parserMark = dropMark(stream); | |
692 | if (!isNMToken) { | |
693 | // Only difference between an NMToken and a Name is Names have a stricter condition on the first character | |
694 | if (!getCharacter(stream, &ch, false)) { | |
695 | success = false; | |
696 | } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') { | |
697 | success = false; | |
698 | } else { | |
699 | getCharacter(stream, &ch, true); | |
700 | } | |
701 | } | |
702 | if (success) { | |
703 | while (getCharacter(stream, &ch, true)) { | |
704 | if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet) && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) { | |
705 | _inputStreamReturnCharacter(stream, ch); | |
706 | break; | |
707 | } | |
708 | } | |
709 | if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) { | |
710 | success = false; // Must have processed at least one character | |
711 | } | |
712 | } | |
713 | if (success) { | |
714 | if (str) { | |
715 | if (!stream->nameSet) { | |
716 | stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks); | |
717 | stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull); | |
718 | } | |
719 | CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark); | |
720 | if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) { | |
721 | *str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString); | |
722 | CFSetAddValue(stream->nameSet, *str); | |
723 | CFRelease(*str); | |
724 | } | |
725 | } | |
726 | } else { | |
727 | restoreToMark(stream, stream->parserMark); | |
728 | } | |
729 | stream->parserMark = NULL; | |
730 | return success; | |
731 | } | |
732 | ||
733 |