X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/830f8f11bca5c0892ae767ba14790c8b5b59011f..ab9893576c877f5691023315c23c8348d9d6affd:/src/common/convauto.cpp diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index d43bb6d1ba..8d8c24c0a3 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -23,69 +23,116 @@ #pragma hdrstop #endif -#if wxUSE_WCHAR_T - #ifndef WX_PRECOMP + #include "wx/wx.h" #endif //WX_PRECOMP #include "wx/convauto.h" +// we use latin1 by default as it seems the least bad choice: the files we need +// to detect input of don't always come from the user system (they are often +// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't +// seem to be a good idea and there is no other reasonable alternative +wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1; + // ============================================================================ // implementation // ============================================================================ /* static */ -wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) +void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) { - if ( srcLen < 2 ) - { - // minimal BOM is 2 bytes so bail out immediately and simplify the code - // below which wouldn't need to check for length for UTF-16 cases - return BOM_None; - } + wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT, + wxT("wxFONTENCODING_DEFAULT doesn't make sense here") ); + + ms_defaultMBEncoding = enc; +} +/* static */ +wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) +{ // examine the buffer for BOM presence // - // see http://www.unicode.org/faq/utf_bom.html#BOM - switch ( *src++ ) + // quoting from http://www.unicode.org/faq/utf_bom.html#BOM: + // + // Bytes Encoding Form + // + // 00 00 FE FF UTF-32, big-endian + // FF FE 00 00 UTF-32, little-endian + // FE FF UTF-16, big-endian + // FF FE UTF-16, little-endian + // EF BB BF UTF-8 + // + // as 
some BOMs are prefixes of other ones we may need to read more bytes + // to disambiguate them + + switch ( srcLen ) { - case '\0': - // could only be big endian UTF-32 (00 00 FE FF) - if ( srcLen >= 4 && - src[0] == '\0' && - src[1] == '\xfe' && - src[2] == '\xff' ) + case 0: + return BOM_Unknown; + + case 1: + if ( src[0] == '\x00' || src[0] == '\xFF' || + src[0] == '\xFE' || src[0] == '\xEF') { - return BOM_UTF32BE; + // this could be a BOM but we don't know yet + return BOM_Unknown; } break; - case '\xfe': - // could only be big endian UTF-16 (FE FF) - if ( *src++ == '\xff' ) + case 2: + case 3: + if ( src[0] == '\xEF' && src[1] == '\xBB' ) { - return BOM_UTF16BE; + if ( srcLen == 3 ) + return src[2] == '\xBF' ? BOM_UTF8 : BOM_None; + + return BOM_Unknown; } - break; - case '\xff': - // could be either little endian UTF-16 or UTF-32, both start - // with FF FE - if ( *src++ == '\xfe' ) + if ( src[0] == '\xFE' && src[1] == '\xFF' ) + return BOM_UTF16BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' ) { - return srcLen >= 4 && src[0] == '\0' && src[1] == '\0' - ? BOM_UTF32LE - : BOM_UTF16LE; + // if the next byte is 0, it could be an UTF-32LE BOM but if it + // isn't we can be sure it's UTF-16LE + if ( srcLen == 3 && src[2] != '\x00' ) + return BOM_UTF16LE; + + return BOM_Unknown; } - break; - case '\xef': - // is this UTF-8 BOM (EF BB BF)? 
- if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' ) + if ( src[0] == '\x00' && src[1] == '\x00' ) { - return BOM_UTF8; + // this could only be UTF-32BE, check that the data we have so + // far allows for it + if ( srcLen == 3 && src[2] != '\xFE' ) + return BOM_None; + + return BOM_Unknown; } break; + + default: + // we have at least 4 characters so we may finally decide whether + // we have a BOM or not + if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' ) + return BOM_UTF8; + + if ( src[0] == '\x00' && src[1] == '\x00' && + src[2] == '\xFE' && src[3] == '\xFF' ) + return BOM_UTF32BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' && + src[2] == '\x00' && src[3] == '\x00' ) + return BOM_UTF32LE; + + if ( src[0] == '\xFE' && src[1] == '\xFF' ) + return BOM_UTF16BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' ) + return BOM_UTF16LE; } return BOM_None; @@ -97,6 +144,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType) switch ( bomType ) { + case BOM_Unknown: + wxFAIL_MSG( "shouldn't be called for this BOM type" ); + break; + + case BOM_None: + // use the default + break; + case BOM_UTF32BE: m_conv = new wxMBConvUTF32BE; m_ownsConv = true; @@ -118,17 +173,20 @@ void wxConvAuto::InitFromBOM(BOMType bomType) break; case BOM_UTF8: - m_conv = &wxConvUTF8; - m_ownsConv = false; + InitWithUTF8(); break; default: - wxFAIL_MSG( _T("unexpected BOM type") ); - // fall through: still need to create something + wxFAIL_MSG( "unknown BOM type" ); + } - case BOM_None: - InitWithDefault(); - m_consumedBOM = true; // as there is nothing to consume + if ( !m_conv ) + { + // we end up here if there is no BOM or we didn't recognize it somehow + // (this shouldn't happen but still don't crash if it does), so use the + // default encoding + InitWithUTF8(); + m_consumedBOM = true; // as there is nothing to consume } } @@ -137,6 +195,14 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const int ofs; switch ( m_bomType ) { + case BOM_Unknown: + wxFAIL_MSG( 
"shouldn't be called for this BOM type" ); + return; + + case BOM_None: + ofs = 0; + break; + case BOM_UTF32BE: case BOM_UTF32LE: ofs = 4; @@ -152,11 +218,8 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const break; default: - wxFAIL_MSG( _T("unexpected BOM type") ); - // fall through: still need to create something - - case BOM_None: - ofs = 0; + wxFAIL_MSG( "unknown BOM type" ); + return; } *src += ofs; @@ -164,11 +227,15 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const *len -= ofs; } -void wxConvAuto::InitFromInput(const char **src, size_t *len) +bool wxConvAuto::InitFromInput(const char *src, size_t len) { - m_bomType = DetectBOM(*src, *len); + m_bomType = DetectBOM(src, len); + if ( m_bomType == BOM_Unknown ) + return false; + InitFromBOM(m_bomType); - SkipBOM(src, len); + + return true; } size_t @@ -180,21 +247,61 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // during this initial call but also during the first call with non-NULL // dst as typically we're first called with NULL dst to calculate the // needed buffer size - wxConvAuto *self = wx_const_cast(wxConvAuto *, this); + wxConvAuto *self = const_cast<wxConvAuto *>(this); + + if ( !m_conv ) { - self->InitFromInput(&src, &srcLen); - if ( dst ) - self->m_consumedBOM = true; + if ( !self->InitFromInput(src, srcLen) ) + { + // there is not enough data to determine whether we have a BOM or + // not, so fail for now -- the caller is supposed to call us again + // with more data + return wxCONV_FAILED; + } } - if ( !m_consumedBOM && dst ) + if ( !m_consumedBOM ) { - self->m_consumedBOM = true; SkipBOM(&src, &srcLen); + if ( srcLen == 0 ) + { + // there is nothing left except the BOM so we'd return 0 below but + // this is unexpected: decoding a non-empty string must either fail + // or return something non-empty, in particular this would break + // the code in wxTextInputStream::NextChar() + // + // so still return an error as we need some more data to be able to + // decode it + return 
wxCONV_FAILED; + } } - return m_conv->ToWChar(dst, dstLen, src, srcLen); + // try to convert using the auto-detected encoding + size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); + if ( rc == wxCONV_FAILED && m_bomType == BOM_None ) + { + // if the conversion failed but we didn't really detect anything and + // simply tried UTF-8 by default, retry it using the fall-back + if ( m_encDefault != wxFONTENCODING_MAX ) + { + if ( m_ownsConv ) + delete m_conv; + + self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT + ? GetFallbackEncoding() + : m_encDefault); + self->m_ownsConv = true; + + rc = m_conv->ToWChar(dst, dstLen, src, srcLen); + } + } + + // don't skip the BOM again the next time if we really consumed it + if ( rc != wxCONV_FAILED && dst && !m_consumedBOM ) + self->m_consumedBOM = true; + + return rc; } size_t @@ -204,11 +311,8 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen, if ( !m_conv ) { // default to UTF-8 for the multibyte output - wx_const_cast(wxConvAuto *, this)->InitWithDefault(); + const_cast<wxConvAuto *>(this)->InitWithUTF8(); } return m_conv->FromWChar(dst, dstLen, src, srcLen); } - -#endif // wxUSE_WCHAR_T -