X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/9a83f860948059b0273b5cc6d9e43fadad3ebfca..801423ee3454d200581cd51d35fbcdad19f2208a:/src/common/convauto.cpp diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index f4e394d04d..8620d4e02e 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -23,11 +23,6 @@ #pragma hdrstop #endif -#if wxUSE_WCHAR_T - -#ifndef WX_PRECOMP -#endif //WX_PRECOMP - #include "wx/convauto.h" // we use latin1 by default as it seems the least bad choice: the files we need @@ -52,55 +47,88 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) /* static */ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) { - if ( srcLen < 2 ) - { - // minimal BOM is 2 bytes so bail out immediately and simplify the code - // below which wouldn't need to check for length for UTF-16 cases - return BOM_None; - } - // examine the buffer for BOM presence // - // see http://www.unicode.org/faq/utf_bom.html#BOM - switch ( *src++ ) + // quoting from http://www.unicode.org/faq/utf_bom.html#BOM: + // + // Bytes Encoding Form + // + // 00 00 FE FF UTF-32, big-endian + // FF FE 00 00 UTF-32, little-endian + // FE FF UTF-16, big-endian + // FF FE UTF-16, little-endian + // EF BB BF UTF-8 + // + // as some BOMs are prefixes of other ones we may need to read more bytes + // to disambiguate them + + switch ( srcLen ) { - case '\0': - // could only be big endian UTF-32 (00 00 FE FF) - if ( srcLen >= 4 && - src[0] == '\0' && - src[1] == '\xfe' && - src[2] == '\xff' ) + case 0: + return BOM_Unknown; + + case 1: + if ( src[0] == '\x00' || src[0] == '\xFF' || + src[0] == '\xFE' || src[0] == '\xEF') { - return BOM_UTF32BE; + // this could be a BOM but we don't know yet + return BOM_Unknown; } break; - case '\xfe': - // could only be big endian UTF-16 (FE FF) - if ( *src++ == '\xff' ) + case 2: + case 3: + if ( src[0] == '\xEF' && src[1] == '\xBB' ) { - return BOM_UTF16BE; + if ( srcLen == 3 ) + return src[2] == '\xBF' ? BOM_UTF8 : BOM_None; + + return BOM_Unknown; } - break; - case '\xff': - // could be either little endian UTF-16 or UTF-32, both start - // with FF FE - if ( *src++ == '\xfe' ) + if ( src[0] == '\xFE' && src[1] == '\xFF' ) + return BOM_UTF16BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' ) { - return srcLen >= 4 && src[0] == '\0' && src[1] == '\0' - ? BOM_UTF32LE - : BOM_UTF16LE; + // if the next byte is 0, it could be an UTF-32LE BOM but if it + // isn't we can be sure it's UTF-16LE + if ( srcLen == 3 && src[2] != '\x00' ) + return BOM_UTF16LE; + + return BOM_Unknown; } - break; - case '\xef': - // is this UTF-8 BOM (EF BB BF)? - if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' ) + if ( src[0] == '\x00' && src[1] == '\x00' ) { - return BOM_UTF8; + // this could only be UTF-32BE, check that the data we have so + // far allows for it + if ( srcLen == 3 && src[2] != '\xFE' ) + return BOM_None; + + return BOM_Unknown; } break; + + default: + // we have at least 4 characters so we may finally decide whether + // we have a BOM or not + if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' ) + return BOM_UTF8; + + if ( src[0] == '\x00' && src[1] == '\x00' && + src[2] == '\xFE' && src[3] == '\xFF' ) + return BOM_UTF32BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' && + src[2] == '\x00' && src[3] == '\x00' ) + return BOM_UTF32LE; + + if ( src[0] == '\xFE' && src[1] == '\xFF' ) + return BOM_UTF16BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' ) + return BOM_UTF16LE; } return BOM_None; @@ -112,6 +140,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType) switch ( bomType ) { + case BOM_Unknown: + wxFAIL_MSG( "shouldn't be called for this BOM type" ); + break; + + case BOM_None: + // use the default + break; + case BOM_UTF32BE: m_conv = new wxMBConvUTF32BE; m_ownsConv = true; @@ -137,12 +173,16 @@ void wxConvAuto::InitFromBOM(BOMType bomType) break; default: - wxFAIL_MSG( wxT("unexpected BOM type") ); - // fall through: still need to create something + wxFAIL_MSG( "unknown BOM type" ); + } - case BOM_None: - InitWithUTF8(); - m_consumedBOM = true; // as there is nothing to consume + if ( !m_conv ) + { + // we end up here if there is no BOM or we didn't recognize it somehow + // (this shouldn't happen but still don't crash if it does), so use the + // default encoding + InitWithUTF8(); + m_consumedBOM = true; // as there is nothing to consume } } @@ -151,6 +191,14 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const int ofs; switch ( m_bomType ) { + case BOM_Unknown: + wxFAIL_MSG( "shouldn't be called for this BOM type" ); + return; + + case BOM_None: + ofs = 0; + break; + case BOM_UTF32BE: case BOM_UTF32LE: ofs = 4; @@ -166,11 +214,8 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const break; default: - wxFAIL_MSG( wxT("unexpected BOM type") ); - // fall through: still need to create something - - case BOM_None: - ofs = 0; + wxFAIL_MSG( "unknown BOM type" ); + return; } *src += ofs; @@ -178,11 +223,15 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const *len -= ofs; } -void wxConvAuto::InitFromInput(const char **src, size_t *len) +bool wxConvAuto::InitFromInput(const char *src, size_t len) { - m_bomType = DetectBOM(*src, *len); + m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len); + if ( m_bomType == BOM_Unknown ) + return false; + InitFromBOM(m_bomType); - SkipBOM(src, len); + + return true; } size_t @@ -195,17 +244,33 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // dst as typically we're first called with NULL dst to calculate the // needed buffer size wxConvAuto *self = const_cast(this); + + if ( !m_conv ) { - self->InitFromInput(&src, &srcLen); - if ( dst ) - self->m_consumedBOM = true; + if ( !self->InitFromInput(src, srcLen) ) + { + // there is not enough data to determine whether we have a BOM or + // not, so fail for now -- the caller is supposed to call us again + // with more data + return wxCONV_FAILED; + } } - if ( !m_consumedBOM && dst ) + if ( !m_consumedBOM ) { - self->m_consumedBOM = true; SkipBOM(&src, &srcLen); + if ( srcLen == 0 ) + { + // there is nothing left except the BOM so we'd return 0 below but + // this is unexpected: decoding a non-empty string must either fail + // or return something non-empty, in particular this would break + // the code in wxTextInputStream::NextChar() + // + // so still return an error as we need some more data to be able to + // decode it + return wxCONV_FAILED; + } } // try to convert using the auto-detected encoding @@ -228,6 +293,10 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, } } + // don't skip the BOM again the next time if we really consumed it + if ( rc != wxCONV_FAILED && dst && !m_consumedBOM ) + self->m_consumedBOM = true; + return rc; } @@ -243,6 +312,3 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen, return m_conv->FromWChar(dst, dstLen, src, srcLen); } - -#endif // wxUSE_WCHAR_T -