X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/830f8f11bca5c0892ae767ba14790c8b5b59011f..535a0e088970edfcd385cd16d1c96958a317349e:/src/common/convauto.cpp diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index d43bb6d1ba..3fcccd849c 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -23,112 +23,199 @@ #pragma hdrstop #endif -#if wxUSE_WCHAR_T +#include "wx/convauto.h" -#ifndef WX_PRECOMP -#endif //WX_PRECOMP +// we use latin1 by default as it seems the least bad choice: the files we need +// to detect input of don't always come from the user system (they are often +// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't +// seem to be a good idea and there is no other reasonable alternative +wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1; -#include "wx/convauto.h" +namespace +{ + +const char BOM_UTF32BE[] = { '\x00', '\x00', '\xFE', '\xFF' }; +const char BOM_UTF32LE[] = { '\xFF', '\xFE', '\x00', '\x00' }; +const char BOM_UTF16BE[] = { '\xFE', '\xFF' }; +const char BOM_UTF16LE[] = { '\xFF', '\xFE' }; +const char BOM_UTF8[] = { '\xEF', '\xBB', '\xBF' }; + +} // anonymous namespace // ============================================================================ // implementation // ============================================================================ /* static */ -wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) +void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) { - if ( srcLen < 2 ) + wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT, + wxT("wxFONTENCODING_DEFAULT doesn't make sense here") ); + + ms_defaultMBEncoding = enc; +} + +/* static */ +const char* wxConvAuto::GetBOMChars(wxBOM bom, size_t* count) +{ + wxCHECK_MSG( count , NULL, wxS("count pointer must be provided") ); + + switch ( bom ) { - // minimal BOM is 2 bytes so bail out immediately and simplify the code - // below which wouldn't need to check for length for UTF-16 cases - return BOM_None; + case wxBOM_UTF32BE: *count = WXSIZEOF(BOM_UTF32BE); return BOM_UTF32BE; + case wxBOM_UTF32LE: *count = WXSIZEOF(BOM_UTF32LE); return BOM_UTF32LE; + case wxBOM_UTF16BE: *count = WXSIZEOF(BOM_UTF16BE); return BOM_UTF16BE; + case wxBOM_UTF16LE: *count = WXSIZEOF(BOM_UTF16LE); return BOM_UTF16LE; + case wxBOM_UTF8 : *count = WXSIZEOF(BOM_UTF8 ); return BOM_UTF8; + case wxBOM_Unknown: + case wxBOM_None: + wxFAIL_MSG( wxS("Invalid BOM type") ); + return NULL; } + wxFAIL_MSG( wxS("Unknown BOM type") ); + return NULL; +} + +/* static */ +wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen) +{ // examine the buffer for BOM presence // - // see http://www.unicode.org/faq/utf_bom.html#BOM - switch ( *src++ ) + // quoting from http://www.unicode.org/faq/utf_bom.html#BOM: + // + // Bytes Encoding Form + // + // 00 00 FE FF UTF-32, big-endian + // FF FE 00 00 UTF-32, little-endian + // FE FF UTF-16, big-endian + // FF FE UTF-16, little-endian + // EF BB BF UTF-8 + // + // as some BOMs are prefixes of other ones we may need to read more bytes + // to disambiguate them + + switch ( srcLen ) { - case '\0': - // could only be big endian UTF-32 (00 00 FE FF) - if ( srcLen >= 4 && - src[0] == '\0' && - src[1] == '\xfe' && - src[2] == '\xff' ) + case 0: + return wxBOM_Unknown; + + case 1: + if ( src[0] == '\x00' || src[0] == '\xFF' || + src[0] == '\xFE' || src[0] == '\xEF') { - return BOM_UTF32BE; + // this could be a BOM but we don't know yet + return wxBOM_Unknown; } break; - case '\xfe': - // could only be big endian UTF-16 (FE FF) - if ( *src++ == '\xff' ) + case 2: + case 3: + if ( src[0] == '\xEF' && src[1] == '\xBB' ) { - return BOM_UTF16BE; + if ( srcLen == 3 ) + return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None; + + return wxBOM_Unknown; } - break; - case '\xff': - // could be either little endian UTF-16 or UTF-32, both start - // with FF FE - if ( *src++ == '\xfe' ) + if ( src[0] == '\xFE' && src[1] == '\xFF' ) + return wxBOM_UTF16BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' ) { - return srcLen >= 4 && src[0] == '\0' && src[1] == '\0' - ? BOM_UTF32LE - : BOM_UTF16LE; + // if the next byte is 0, it could be an UTF-32LE BOM but if it + // isn't we can be sure it's UTF-16LE + if ( srcLen == 3 && src[2] != '\x00' ) + return wxBOM_UTF16LE; + + return wxBOM_Unknown; } - break; - case '\xef': - // is this UTF-8 BOM (EF BB BF)? - if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' ) + if ( src[0] == '\x00' && src[1] == '\x00' ) { - return BOM_UTF8; + // this could only be UTF-32BE, check that the data we have so + // far allows for it + if ( srcLen == 3 && src[2] != '\xFE' ) + return wxBOM_None; + + return wxBOM_Unknown; } break; + + default: + // we have at least 4 characters so we may finally decide whether + // we have a BOM or not + if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' ) + return wxBOM_UTF8; + + if ( src[0] == '\x00' && src[1] == '\x00' && + src[2] == '\xFE' && src[3] == '\xFF' ) + return wxBOM_UTF32BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' && + src[2] == '\x00' && src[3] == '\x00' ) + return wxBOM_UTF32LE; + + if ( src[0] == '\xFE' && src[1] == '\xFF' ) + return wxBOM_UTF16BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' ) + return wxBOM_UTF16LE; } - return BOM_None; + return wxBOM_None; } -void wxConvAuto::InitFromBOM(BOMType bomType) +void wxConvAuto::InitFromBOM(wxBOM bomType) { m_consumedBOM = false; switch ( bomType ) { - case BOM_UTF32BE: + case wxBOM_Unknown: + wxFAIL_MSG( "shouldn't be called for this BOM type" ); + break; + + case wxBOM_None: + // use the default + break; + + case wxBOM_UTF32BE: m_conv = new wxMBConvUTF32BE; m_ownsConv = true; break; - case BOM_UTF32LE: + case wxBOM_UTF32LE: m_conv = new wxMBConvUTF32LE; m_ownsConv = true; break; - case BOM_UTF16BE: + case wxBOM_UTF16BE: m_conv = new wxMBConvUTF16BE; m_ownsConv = true; break; - case BOM_UTF16LE: + case wxBOM_UTF16LE: m_conv = new wxMBConvUTF16LE; m_ownsConv = true; break; - case BOM_UTF8: - m_conv = &wxConvUTF8; - m_ownsConv = false; + case wxBOM_UTF8: + InitWithUTF8(); break; default: - wxFAIL_MSG( _T("unexpected BOM type") ); - // fall through: still need to create something + wxFAIL_MSG( "unknown BOM type" ); + } - case BOM_None: - InitWithDefault(); - m_consumedBOM = true; // as there is nothing to consume + if ( !m_conv ) + { + // we end up here if there is no BOM or we didn't recognize it somehow + // (this shouldn't happen but still don't crash if it does), so use the + // default encoding + InitWithUTF8(); + m_consumedBOM = true; // as there is nothing to consume } } @@ -137,26 +224,31 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const int ofs; switch ( m_bomType ) { - case BOM_UTF32BE: - case BOM_UTF32LE: + case wxBOM_Unknown: + wxFAIL_MSG( "shouldn't be called for this BOM type" ); + return; + + case wxBOM_None: + ofs = 0; + break; + + case wxBOM_UTF32BE: + case wxBOM_UTF32LE: ofs = 4; break; - case BOM_UTF16BE: - case BOM_UTF16LE: + case wxBOM_UTF16BE: + case wxBOM_UTF16LE: ofs = 2; break; - case BOM_UTF8: + case wxBOM_UTF8: ofs = 3; break; default: - wxFAIL_MSG( _T("unexpected BOM type") ); - // fall through: still need to create something - - case BOM_None: - ofs = 0; + wxFAIL_MSG( "unknown BOM type" ); + return; } *src += ofs; @@ -164,11 +256,15 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const *len -= ofs; } -void wxConvAuto::InitFromInput(const char **src, size_t *len) +bool wxConvAuto::InitFromInput(const char *src, size_t len) { - m_bomType = DetectBOM(*src, *len); + m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len); + if ( m_bomType == wxBOM_Unknown ) + return false; + InitFromBOM(m_bomType); - SkipBOM(src, len); + + return true; } size_t @@ -180,21 +276,61 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // during this initial call but also during the first call with non-NULL // dst as typically we're first called with NULL dst to calculate the // needed buffer size - wxConvAuto *self = wx_const_cast(wxConvAuto *, this); + wxConvAuto *self = const_cast(this); + + if ( !m_conv ) { - self->InitFromInput(&src, &srcLen); - if ( dst ) - self->m_consumedBOM = true; + if ( !self->InitFromInput(src, srcLen) ) + { + // there is not enough data to determine whether we have a BOM or + // not, so fail for now -- the caller is supposed to call us again + // with more data + return wxCONV_FAILED; + } } - if ( !m_consumedBOM && dst ) + if ( !m_consumedBOM ) { - self->m_consumedBOM = true; SkipBOM(&src, &srcLen); + if ( srcLen == 0 ) + { + // there is nothing left except the BOM so we'd return 0 below but + // this is unexpected: decoding a non-empty string must either fail + // or return something non-empty, in particular this would break + // the code in wxTextInputStream::NextChar() + // + // so still return an error as we need some more data to be able to + // decode it + return wxCONV_FAILED; + } } - return m_conv->ToWChar(dst, dstLen, src, srcLen); + // try to convert using the auto-detected encoding + size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); + if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None ) + { + // if the conversion failed but we didn't really detect anything and + // simply tried UTF-8 by default, retry it using the fall-back + if ( m_encDefault != wxFONTENCODING_MAX ) + { + if ( m_ownsConv ) + delete m_conv; + + self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT + ? GetFallbackEncoding() + : m_encDefault); + self->m_ownsConv = true; + + rc = m_conv->ToWChar(dst, dstLen, src, srcLen); + } + } + + // don't skip the BOM again the next time if we really consumed it + if ( rc != wxCONV_FAILED && dst && !m_consumedBOM ) + self->m_consumedBOM = true; + + return rc; } size_t @@ -204,11 +340,8 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen, if ( !m_conv ) { // default to UTF-8 for the multibyte output - wx_const_cast(wxConvAuto *, this)->InitWithDefault(); + const_cast(this)->InitWithUTF8(); } return m_conv->FromWChar(dst, dstLen, src, srcLen); } - -#endif // wxUSE_WCHAR_T -