]>
git.saurik.com Git - wxWidgets.git/blob - src/common/convauto.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/convauto.cpp
3 // Purpose: implementation of wxConvAuto
4 // Author: Vadim Zeitlin
7 // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
11 // ============================================================================
13 // ============================================================================
15 // ----------------------------------------------------------------------------
17 // ----------------------------------------------------------------------------
19 // for compilers that support precompilation, includes "wx.h".
20 #include "wx/wxprec.h"
32 #include "wx/convauto.h"
34 // we use latin1 by default as it seems the least bad choice: the files whose
35 // encoding we need to detect don't always come from the user's system (they
36 // are often received from other machines), so using wxFONTENCODING_SYSTEM
37 // doesn't seem to be a good idea and there is no other reasonable alternative
38 wxFontEncoding
wxConvAuto::ms_defaultMBEncoding
= wxFONTENCODING_ISO8859_1
;
40 // ============================================================================
42 // ============================================================================
// Replaces the class-wide fallback multibyte encoding, stored in the static
// member ms_defaultMBEncoding, with enc.
//
// enc must be a concrete encoding: passing wxFONTENCODING_DEFAULT trips the
// assertion below, since "default" is exactly what this value is meant to
// define.
45 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc
)
47 wxASSERT_MSG( enc
!= wxFONTENCODING_DEFAULT
,
48 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
50 ms_defaultMBEncoding
= enc
;
// Examines the first srcLen bytes of src for a Unicode byte order mark and
// returns the corresponding BOMType value.
//
// The checks are grouped by how many bytes are available: with fewer than 4
// bytes some BOMs cannot be told apart yet (FF FE is both the UTF-16LE BOM and
// the beginning of the UTF-32LE one), so the short-input checks have to be
// more conservative than the final ones.
//
// NOTE(review): most of the return statements of this function are not
// visible in this listing; the comments below describe what each test looks
// for, not what is returned when it matches -- confirm against the full body.
54 wxConvAuto::BOMType
wxConvAuto::DetectBOM(const char *src
, size_t srcLen
)
56 // examine the buffer for BOM presence
58 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
60 // Bytes Encoding Form
62 // 00 00 FE FF UTF-32, big-endian
63 // FF FE 00 00 UTF-32, little-endian
64 // FE FF UTF-16, big-endian
65 // FF FE UTF-16, little-endian
68 // as some BOMs are prefixes of other ones we may need to read more bytes
69 // to disambiguate them
// only the first byte is looked at here: 00, FF, FE and EF are the possible
// first bytes of the BOMs tested for in this function
77 if ( src
[0] == '\x00' || src
[0] == '\xFF' ||
78 src
[0] == '\xFE' || src
[0] == '\xEF')
80 // this could be a BOM but we don't know yet
// EF BB are the first two bytes of the UTF-8 BOM (EF BB BF)
87 if ( src
[0] == '\xEF' && src
[1] == '\xBB' )
// the third byte decides: BF completes the UTF-8 BOM, anything else means
// there is no BOM at all
90 return src
[2] == '\xBF' ? BOM_UTF8
: BOM_None
;
// FE FF: the UTF-16 big-endian BOM, which is not a prefix of any other BOM in
// the table above
95 if ( src
[0] == '\xFE' && src
[1] == '\xFF' )
// FF FE: either the UTF-16 little-endian BOM or the first half of the UTF-32
// little-endian one
98 if ( src
[0] == '\xFF' && src
[1] == '\xFE' )
100 // if the next byte is 0, it could be an UTF-32LE BOM but if it
101 // isn't we can be sure it's UTF-16LE
102 if ( srcLen
== 3 && src
[2] != '\x00' )
// 00 00: the first half of the UTF-32 big-endian BOM
108 if ( src
[0] == '\x00' && src
[1] == '\x00' )
110 // this could only be UTF-32BE
111 if ( srcLen
== 3 && src
[2] == '\xFE' )
118 // we have at least 4 characters so we may finally decide whether
119 // we have a BOM or not
// full UTF-8 BOM
120 if ( src
[0] == '\xEF' && src
[1] == '\xBB' && src
[2] == '\xBF' )
// full UTF-32BE BOM
123 if ( src
[0] == '\x00' && src
[1] == '\x00' &&
124 src
[2] == '\xFE' && src
[3] == '\xFF' )
// full UTF-32LE BOM; this is tested before the 2-byte FF FE check below
// because FF FE is a prefix of it
127 if ( src
[0] == '\xFF' && src
[1] == '\xFE' &&
128 src
[2] == '\x00' && src
[3] == '\x00' )
// UTF-16BE BOM
131 if ( src
[0] == '\xFE' && src
[1] == '\xFF' )
// UTF-16LE BOM (the UTF-32LE byte pattern has already been tested for above)
134 if ( src
[0] == '\xFF' && src
[1] == '\xFE' )
// Selects the conversion object stored in m_conv according to bomType and
// resets the "BOM already consumed" state.
//
// m_consumedBOM starts out as false and is set to true on the path for input
// without a (recognized) BOM, where there is nothing to skip. For the UTF-32
// and UTF-16 cases a new wxMBConvUTF32BE/UTF32LE/UTF16BE/UTF16LE object is
// allocated.
//
// NOTE(review): the statements selecting between these branches are not
// visible in this listing; presumably each converter is chosen for the BOM
// type of the same name -- confirm. Also confirm that m_ownsConv is updated
// alongside each "new" so that the allocated converter is released.
141 void wxConvAuto::InitFromBOM(BOMType bomType
)
143 m_consumedBOM
= false;
// reached for a BOM type this function must not be called with
148 wxFAIL_MSG( "shouldn't be called for this BOM type" );
156 m_conv
= new wxMBConvUTF32BE
;
161 m_conv
= new wxMBConvUTF32LE
;
166 m_conv
= new wxMBConvUTF16BE
;
171 m_conv
= new wxMBConvUTF16LE
;
// reached for a value that matches none of the handled BOM types
180 wxFAIL_MSG( "unknown BOM type" );
185 // we end up here if there is no BOM or we didn't recognize it somehow
186 // (this shouldn't happen but still don't crash if it does), so use the
189 m_consumedBOM
= true; // as there is nothing to consume
// Adjusts the caller's buffer pointer and length, passed by address as src
// and len, to account for the BOM at the start of the input.
//
// NOTE(review): almost all of the body is missing from this listing.
// Presumably *src is advanced past the BOM bytes and *len reduced by the same
// amount -- confirm. The visible test shows that *len is only touched when it
// is not (size_t)-1, which looks like the "length not specified" sentinel.
193 void wxConvAuto::SkipBOM(const char **src
, size_t *len
) const
199 wxFAIL_MSG( "shouldn't be called for this BOM type" );
221 wxFAIL_MSG( "unknown BOM type" );
226 if ( *len
!= (size_t)-1 )
// Detects the BOM at the start of the input, records it in m_bomType and
// sets up the conversion by calling InitFromBOM().
//
// src and len point to the caller's buffer pointer and length.
//
// ToWChar() treats a false return value as "not enough data to decide yet".
// NOTE(review): presumably false is returned exactly when DetectBOM() yields
// BOM_Unknown, and the BOM is skipped in *src / *len on success -- the
// corresponding statements are not visible in this listing; confirm.
230 bool wxConvAuto::InitFromInput(const char **src
, size_t *len
)
232 m_bomType
= DetectBOM(*src
, *len
);
233 if ( m_bomType
== BOM_Unknown
)
236 InitFromBOM(m_bomType
);
// Converts the multibyte input src (srcLen bytes) to wide characters in dst
// (capacity dstLen) using the conversion selected from the input itself.
//
// Returns the value produced by the underlying m_conv->ToWChar() call, or
// wxCONV_FAILED if the input is still too short to tell whether it starts
// with a BOM.
//
// If the conversion fails and no BOM was detected (m_bomType == BOM_None),
// it is retried once with a wxCSConv for the fallback encoding, unless
// m_encDefault is wxFONTENCODING_MAX.
//
// The function is const but updates the object's state (m_conv, m_ownsConv,
// m_consumedBOM) through a const_cast.
243 wxConvAuto::ToWChar(wchar_t *dst
, size_t dstLen
,
244 const char *src
, size_t srcLen
) const
246 // we check BOM and create the appropriate conversion the first time we're
247 // called but we also need to ensure that the BOM is skipped not only
248 // during this initial call but also during the first call with non-NULL
249 // dst as typically we're first called with NULL dst to calculate the
250 // needed buffer size
251 wxConvAuto
*self
= const_cast<wxConvAuto
*>(this);
// NOTE(review): the condition guarding this initialization is not visible in
// this listing; presumably it only runs while no conversion has been set up
// yet -- confirm
256 if ( !self
->InitFromInput(&src
, &srcLen
) )
258 // there is not enough data to determine whether we have a BOM or
259 // not, so fail for now -- the caller is supposed to call us again
261 return wxCONV_FAILED
;
// the BOM has not been consumed yet and this is a real conversion (non-NULL
// dst), so skip the BOM before converting
264 else if ( !m_consumedBOM
&& dst
)
266 SkipBOM(&src
, &srcLen
);
269 // try to convert using the auto-detected encoding
270 size_t rc
= m_conv
->ToWChar(dst
, dstLen
, src
, srcLen
);
271 if ( rc
== wxCONV_FAILED
&& m_bomType
== BOM_None
)
273 // if the conversion failed but we didn't really detect anything and
274 // simply tried UTF-8 by default, retry it using the fall-back
275 if ( m_encDefault
!= wxFONTENCODING_MAX
)
// wxFONTENCODING_DEFAULT stands for the class-wide fallback encoding
// NOTE(review): confirm that the previous m_conv is released before being
// replaced here; no such statement is visible in this listing
280 self
->m_conv
= new wxCSConv(m_encDefault
== wxFONTENCODING_DEFAULT
281 ? GetFallbackEncoding()
283 self
->m_ownsConv
= true;
285 rc
= m_conv
->ToWChar(dst
, dstLen
, src
, srcLen
);
// only mark the BOM as consumed after a successful conversion into a real
// buffer: a size-only call (NULL dst) must leave it to be skipped again
289 if (rc
!= wxCONV_FAILED
&& dst
&& !m_consumedBOM
)
290 self
->m_consumedBOM
= true;
// Converts the wide character input src (srcLen characters) to multibyte
// output in dst (capacity dstLen) by delegating to m_conv->FromWChar() and
// returning its result.
//
// UTF-8 is used as the output encoding by default, set up via InitWithUTF8().
// NOTE(review): the condition guarding the InitWithUTF8() call is not visible
// in this listing; presumably it is only made while no conversion has been
// chosen yet -- confirm.
295 wxConvAuto::FromWChar(char *dst
, size_t dstLen
,
296 const wchar_t *src
, size_t srcLen
) const
300 // default to UTF-8 for the multibyte output
301 const_cast<wxConvAuto
*>(this)->InitWithUTF8();
304 return m_conv
->FromWChar(dst
, dstLen
, src
, srcLen
);
307 #endif // wxUSE_WCHAR_T