X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/4cb0e8d05cadea6be3a7bd93f1fea9a9e0df95f0..931d6a47c32a5b4c283243cb553ce71ee2b535d5:/src/common/convauto.cpp diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index c9ff7df9f6..6a5fba4ecb 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -3,7 +3,6 @@ // Purpose: implementation of wxConvAuto // Author: Vadim Zeitlin // Created: 2006-04-04 -// RCS-ID: $Id$ // Copyright: (c) 2006 Vadim Zeitlin // Licence: wxWindows licence /////////////////////////////////////////////////////////////////////////////// @@ -23,12 +22,6 @@ #pragma hdrstop #endif -#if wxUSE_WCHAR_T - -#ifndef WX_PRECOMP - #include "wx/wx.h" -#endif //WX_PRECOMP - #include "wx/convauto.h" // we use latin1 by default as it seems the least bad choice: the files we need @@ -37,6 +30,17 @@ // seem to be a good idea and there is no other reasonable alternative wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1; +namespace +{ + +const char BOM_UTF32BE[] = { '\x00', '\x00', '\xFE', '\xFF' }; +const char BOM_UTF32LE[] = { '\xFF', '\xFE', '\x00', '\x00' }; +const char BOM_UTF16BE[] = { '\xFE', '\xFF' }; +const char BOM_UTF16LE[] = { '\xFF', '\xFE' }; +const char BOM_UTF8[] = { '\xEF', '\xBB', '\xBF' }; + +} // anonymous namespace + // ============================================================================ // implementation // ============================================================================ @@ -51,7 +55,29 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) } /* static */ -wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) +const char* wxConvAuto::GetBOMChars(wxBOM bom, size_t* count) +{ + wxCHECK_MSG( count , NULL, wxS("count pointer must be provided") ); + + switch ( bom ) + { + case wxBOM_UTF32BE: *count = WXSIZEOF(BOM_UTF32BE); return BOM_UTF32BE; + case wxBOM_UTF32LE: *count = WXSIZEOF(BOM_UTF32LE); return BOM_UTF32LE; + case wxBOM_UTF16BE: *count = WXSIZEOF(BOM_UTF16BE); return BOM_UTF16BE; + case wxBOM_UTF16LE: *count = WXSIZEOF(BOM_UTF16LE); return BOM_UTF16LE; + case wxBOM_UTF8 : *count = WXSIZEOF(BOM_UTF8 ); return BOM_UTF8; + case wxBOM_Unknown: + case wxBOM_None: + wxFAIL_MSG( wxS("Invalid BOM type") ); + return NULL; + } + + wxFAIL_MSG( wxS("Unknown BOM type") ); + return NULL; +} + +/* static */ +wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen) { // examine the buffer for BOM presence // @@ -71,14 +97,14 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) switch ( srcLen ) { case 0: - return BOM_Unknown; + return wxBOM_Unknown; case 1: if ( src[0] == '\x00' || src[0] == '\xFF' || src[0] == '\xFE' || src[0] == '\xEF') { // this could be a BOM but we don't know yet - return BOM_Unknown; + return wxBOM_Unknown; } break; @@ -87,92 +113,94 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) if ( src[0] == '\xEF' && src[1] == '\xBB' ) { if ( srcLen == 3 ) - return src[2] == '\xBF' ? BOM_UTF8 : BOM_None; + return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None; - return BOM_Unknown; + return wxBOM_Unknown; } if ( src[0] == '\xFE' && src[1] == '\xFF' ) - return BOM_UTF16BE; + return wxBOM_UTF16BE; if ( src[0] == '\xFF' && src[1] == '\xFE' ) { // if the next byte is 0, it could be an UTF-32LE BOM but if it // isn't we can be sure it's UTF-16LE if ( srcLen == 3 && src[2] != '\x00' ) - return BOM_UTF16LE; + return wxBOM_UTF16LE; - return BOM_Unknown; + return wxBOM_Unknown; } if ( src[0] == '\x00' && src[1] == '\x00' ) { - // this could only be UTF-32BE - if ( srcLen == 3 && src[2] == '\xFE' ) - return BOM_Unknown; - } + // this could only be UTF-32BE, check that the data we have so + // far allows for it + if ( srcLen == 3 && src[2] != '\xFE' ) + return wxBOM_None; + return wxBOM_Unknown; + } break; default: // we have at least 4 characters so we may finally decide whether // we have a BOM or not if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' ) - return BOM_UTF8; + return wxBOM_UTF8; if ( src[0] == '\x00' && src[1] == '\x00' && src[2] == '\xFE' && src[3] == '\xFF' ) - return BOM_UTF32BE; + return wxBOM_UTF32BE; if ( src[0] == '\xFF' && src[1] == '\xFE' && src[2] == '\x00' && src[3] == '\x00' ) - return BOM_UTF32LE; + return wxBOM_UTF32LE; if ( src[0] == '\xFE' && src[1] == '\xFF' ) - return BOM_UTF16BE; + return wxBOM_UTF16BE; if ( src[0] == '\xFF' && src[1] == '\xFE' ) - return BOM_UTF16LE; + return wxBOM_UTF16LE; } - return BOM_None; + return wxBOM_None; } -void wxConvAuto::InitFromBOM(BOMType bomType) +void wxConvAuto::InitFromBOM(wxBOM bomType) { m_consumedBOM = false; switch ( bomType ) { - case BOM_Unknown: + case wxBOM_Unknown: wxFAIL_MSG( "shouldn't be called for this BOM type" ); break; - case BOM_None: + case wxBOM_None: // use the default break; - case BOM_UTF32BE: + case wxBOM_UTF32BE: m_conv = new wxMBConvUTF32BE; m_ownsConv = true; break; - case BOM_UTF32LE: + case wxBOM_UTF32LE: m_conv = new wxMBConvUTF32LE; m_ownsConv = true; break; - case BOM_UTF16BE: + case wxBOM_UTF16BE: m_conv = new wxMBConvUTF16BE; m_ownsConv = true; break; - case BOM_UTF16LE: + case wxBOM_UTF16LE: m_conv = new wxMBConvUTF16LE; m_ownsConv = true; break; - case BOM_UTF8: + case wxBOM_UTF8: InitWithUTF8(); break; @@ -195,25 +223,25 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const int ofs; switch ( m_bomType ) { - case BOM_Unknown: + case wxBOM_Unknown: wxFAIL_MSG( "shouldn't be called for this BOM type" ); return; - case BOM_None: + case wxBOM_None: ofs = 0; break; - case BOM_UTF32BE: - case BOM_UTF32LE: + case wxBOM_UTF32BE: + case wxBOM_UTF32LE: ofs = 4; break; - case BOM_UTF16BE: - case BOM_UTF16LE: + case wxBOM_UTF16BE: + case wxBOM_UTF16LE: ofs = 2; break; - case BOM_UTF8: + case wxBOM_UTF8: ofs = 3; break; @@ -227,14 +255,13 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const *len -= ofs; } -bool wxConvAuto::InitFromInput(const char **src, size_t *len) +bool wxConvAuto::InitFromInput(const char *src, size_t len) { - m_bomType = DetectBOM(*src, *len); - if ( m_bomType == BOM_Unknown ) + m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len); + if ( m_bomType == wxBOM_Unknown ) return false; InitFromBOM(m_bomType); - SkipBOM(src, len); return true; } @@ -253,7 +280,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, if ( !m_conv ) { - if ( !self->InitFromInput(&src, &srcLen) ) + if ( !self->InitFromInput(src, srcLen) ) { // there is not enough data to determine whether we have a BOM or // not, so fail for now -- the caller is supposed to call us again @@ -261,14 +288,26 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, return wxCONV_FAILED; } } - else if ( !m_consumedBOM && dst ) + + if ( !m_consumedBOM ) { SkipBOM(&src, &srcLen); + if ( srcLen == 0 ) + { + // there is nothing left except the BOM so we'd return 0 below but + // this is unexpected: decoding a non-empty string must either fail + // or return something non-empty, in particular this would break + // the code in wxTextInputStream::NextChar() + // + // so still return an error as we need some more data to be able to + // decode it + return wxCONV_FAILED; + } } // try to convert using the auto-detected encoding size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); - if ( rc == wxCONV_FAILED && m_bomType == BOM_None ) + if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None ) { // if the conversion failed but we didn't really detect anything and // simply tried UTF-8 by default, retry it using the fall-back @@ -286,8 +325,10 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, } } - if (rc != wxCONV_FAILED && dst && !m_consumedBOM) + // don't skip the BOM again the next time if we really consumed it + if ( rc != wxCONV_FAILED && dst && !m_consumedBOM ) self->m_consumedBOM = true; + return rc; } @@ -303,5 +344,3 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen, return m_conv->FromWChar(dst, dstLen, src, srcLen); } - -#endif // wxUSE_WCHAR_T