X-Git-Url: https://git.saurik.com/apple/javascriptcore.git/blobdiff_plain/1df5f87f1309a8daa30dabdee855f48ae40d14ab..6fe7ccc865dc7d7541b93c5bcaf6368d2c98a174:/wtf/unicode/UTF8.cpp?ds=inline diff --git a/wtf/unicode/UTF8.cpp b/wtf/unicode/UTF8.cpp deleted file mode 100644 index 3a88906..0000000 --- a/wtf/unicode/UTF8.cpp +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Copyright (C) 2007 Apple Inc. All rights reserved. - * Copyright (C) 2010 Patrick Gansterer - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "UTF8.h" - -#include "ASCIICType.h" -#include -#include - -namespace WTF { -namespace Unicode { - -inline int inlineUTF8SequenceLengthNonASCII(char b0) -{ - if ((b0 & 0xC0) != 0xC0) - return 0; - if ((b0 & 0xE0) == 0xC0) - return 2; - if ((b0 & 0xF0) == 0xE0) - return 3; - if ((b0 & 0xF8) == 0xF0) - return 4; - return 0; -} - -inline int inlineUTF8SequenceLength(char b0) -{ - return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); -} - -int UTF8SequenceLength(char b0) -{ - return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); -} - -int decodeUTF8Sequence(const char* sequence) -{ - // Handle 0-byte sequences (never valid). - const unsigned char b0 = sequence[0]; - const int length = inlineUTF8SequenceLength(b0); - if (length == 0) - return -1; - - // Handle 1-byte sequences (plain ASCII). - const unsigned char b1 = sequence[1]; - if (length == 1) { - if (b1) - return -1; - return b0; - } - - // Handle 2-byte sequences. - if ((b1 & 0xC0) != 0x80) - return -1; - const unsigned char b2 = sequence[2]; - if (length == 2) { - if (b2) - return -1; - const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); - if (c < 0x80) - return -1; - return c; - } - - // Handle 3-byte sequences. - if ((b2 & 0xC0) != 0x80) - return -1; - const unsigned char b3 = sequence[3]; - if (length == 3) { - if (b3) - return -1; - const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); - if (c < 0x800) - return -1; - // UTF-16 surrogates should never appear in UTF-8 data. - if (c >= 0xD800 && c <= 0xDFFF) - return -1; - return c; - } - - // Handle 4-byte sequences. - if ((b3 & 0xC0) != 0x80) - return -1; - const unsigned char b4 = sequence[4]; - if (length == 4) { - if (b4) - return -1; - const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); - if (c < 0x10000 || c > 0x10FFFF) - return -1; - return c; - } - - return -1; -} - -// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed -// into the first byte, depending on how many bytes follow. There are -// as many entries in this table as there are UTF-8 sequence types. -// (I.e., one byte sequence, two byte... etc.). Remember that sequencs -// for *legal* UTF-8 will be 4 or fewer bytes total. -static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - -ConversionResult convertUTF16ToUTF8( - const UChar** sourceStart, const UChar* sourceEnd, - char** targetStart, char* targetEnd, bool strict) -{ - ConversionResult result = conversionOK; - const UChar* source = *sourceStart; - char* target = *targetStart; - while (source < sourceEnd) { - UChar32 ch; - unsigned short bytesToWrite = 0; - const UChar32 byteMask = 0xBF; - const UChar32 byteMark = 0x80; - const UChar* oldSource = source; // In case we have to back up because of target overflow. - ch = static_cast(*source++); - // If we have a surrogate pair, convert to UChar32 first. - if (ch >= 0xD800 && ch <= 0xDBFF) { - // If the 16 bits following the high surrogate are in the source buffer... - if (source < sourceEnd) { - UChar32 ch2 = static_cast(*source); - // If it's a low surrogate, convert to UChar32. - if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { - ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; - ++source; - } else if (strict) { // it's an unpaired high surrogate - --source; // return to the illegal value itself - result = sourceIllegal; - break; - } - } else { // We don't have the 16 bits following the high surrogate. - --source; // return to the high surrogate - result = sourceExhausted; - break; - } - } else if (strict) { - // UTF-16 surrogate values are illegal in UTF-32 - if (ch >= 0xDC00 && ch <= 0xDFFF) { - --source; // return to the illegal value itself - result = sourceIllegal; - break; - } - } - // Figure out how many bytes the result will require - if (ch < (UChar32)0x80) { - bytesToWrite = 1; - } else if (ch < (UChar32)0x800) { - bytesToWrite = 2; - } else if (ch < (UChar32)0x10000) { - bytesToWrite = 3; - } else if (ch < (UChar32)0x110000) { - bytesToWrite = 4; - } else { - bytesToWrite = 3; - ch = replacementCharacter; - } - - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; // Back up source pointer! - target -= bytesToWrite; - result = targetExhausted; - break; - } - switch (bytesToWrite) { // note: everything falls through. - case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; -} - -// This must be called with the length pre-determined by the first byte. -// If presented with a length > 4, this returns false. The Unicode -// definition of UTF-8 goes up to 4-byte sequences. -static bool isLegalUTF8(const unsigned char* source, int length) -{ - unsigned char a; - const unsigned char* srcptr = source + length; - switch (length) { - default: return false; - // Everything else falls through when "true"... - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; - - switch (*source) { - // no fall-through in this inner switch - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) - return false; - return true; -} - -// Magic values subtracted from a buffer value during UTF8 conversion. -// This table contains as many values as there might be trailing bytes -// in a UTF-8 sequence. -static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - -static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) -{ - UChar32 character = 0; - - // The cases all fall through. - switch (length) { - case 6: character += static_cast(*sequence++); character <<= 6; - case 5: character += static_cast(*sequence++); character <<= 6; - case 4: character += static_cast(*sequence++); character <<= 6; - case 3: character += static_cast(*sequence++); character <<= 6; - case 2: character += static_cast(*sequence++); character <<= 6; - case 1: character += static_cast(*sequence++); - } - - return character - offsetsFromUTF8[length - 1]; -} - -ConversionResult convertUTF8ToUTF16( - const char** sourceStart, const char* sourceEnd, - UChar** targetStart, UChar* targetEnd, bool strict) -{ - ConversionResult result = conversionOK; - const char* source = *sourceStart; - UChar* target = *targetStart; - while (source < sourceEnd) { - int utf8SequenceLength = inlineUTF8SequenceLength(*source); - if (sourceEnd - source < utf8SequenceLength) { - result = sourceExhausted; - break; - } - // Do this check whether lenient or strict - if (!isLegalUTF8(reinterpret_cast(source), utf8SequenceLength)) { - result = sourceIllegal; - break; - } - - UChar32 character = readUTF8Sequence(source, utf8SequenceLength); - - if (target >= targetEnd) { - source -= utf8SequenceLength; // Back up source pointer! - result = targetExhausted; - break; - } - - if (U_IS_BMP(character)) { - // UTF-16 surrogate values are illegal in UTF-32 - if (U_IS_SURROGATE(character)) { - if (strict) { - source -= utf8SequenceLength; // return to the illegal value itself - result = sourceIllegal; - break; - } else - *target++ = replacementCharacter; - } else - *target++ = character; // normal case - } else if (U_IS_SUPPLEMENTARY(character)) { - // target is a character in range 0xFFFF - 0x10FFFF - if (target + 1 >= targetEnd) { - source -= utf8SequenceLength; // Back up source pointer! - result = targetExhausted; - break; - } - *target++ = U16_LEAD(character); - *target++ = U16_TRAIL(character); - } else { - if (strict) { - source -= utf8SequenceLength; // return to the start - result = sourceIllegal; - break; // Bail out; shouldn't continue - } else - *target++ = replacementCharacter; - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -unsigned calculateStringHashAndLengthFromUTF8(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) -{ - if (!data) - return 0; - - StringHasher stringHasher; - dataLength = 0; - utf16Length = 0; - - while (data < dataEnd || (!dataEnd && *data)) { - if (isASCII(*data)) { - stringHasher.addCharacter(*data++); - dataLength++; - utf16Length++; - continue; - } - - int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); - dataLength += utf8SequenceLength; - - if (!dataEnd) { - for (int i = 1; i < utf8SequenceLength; ++i) { - if (!data[i]) - return 0; - } - } else if (dataEnd - data < utf8SequenceLength) - return 0; - - if (!isLegalUTF8(reinterpret_cast(data), utf8SequenceLength)) - return 0; - - UChar32 character = readUTF8Sequence(data, utf8SequenceLength); - ASSERT(!isASCII(character)); - - if (U_IS_BMP(character)) { - // UTF-16 surrogate values are illegal in UTF-32 - if (U_IS_SURROGATE(character)) - return 0; - stringHasher.addCharacter(static_cast(character)); // normal case - utf16Length++; - } else if (U_IS_SUPPLEMENTARY(character)) { - stringHasher.addCharacters(static_cast(U16_LEAD(character)), - static_cast(U16_TRAIL(character))); - utf16Length += 2; - } else - return 0; - } - - return stringHasher.hash(); -} - -bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd) -{ - while (b < bEnd) { - if (isASCII(*b)) { - if (*a++ != *b++) - return false; - continue; - } - - int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b); - - if (bEnd - b < utf8SequenceLength) - return false; - - if (!isLegalUTF8(reinterpret_cast(b), utf8SequenceLength)) - return 0; - - UChar32 character = readUTF8Sequence(b, utf8SequenceLength); - ASSERT(!isASCII(character)); - - if (U_IS_BMP(character)) { - // UTF-16 surrogate values are illegal in UTF-32 - if (U_IS_SURROGATE(character)) - return false; - if (*a++ != character) - return false; - } else if (U_IS_SUPPLEMENTARY(character)) { - if (*a++ != U16_LEAD(character)) - return false; - if (*a++ != U16_TRAIL(character)) - return false; - } else - return false; - } - - return a == aEnd; -} - -} // namespace Unicode -} // namespace WTF