]>
git.saurik.com Git - wxWidgets.git/blob - src/common/convauto.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/convauto.cpp
3 // Purpose: implementation of wxConvAuto
4 // Author: Vadim Zeitlin
6 // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
7 // Licence: wxWindows licence
8 ///////////////////////////////////////////////////////////////////////////////
10 // ============================================================================
12 // ============================================================================
14 // ----------------------------------------------------------------------------
16 // ----------------------------------------------------------------------------
18 // for compilers that support precompilation, includes "wx.h".
19 #include "wx/wxprec.h"
25 #include "wx/convauto.h"
27 // Latin-1 is used as the default fallback as it seems the least bad choice:
28 // the files whose encoding we need to detect don't always come from the
29 // user's system (they are often received from other machines), so using
30 // wxFONTENCODING_SYSTEM doesn't seem to be a good idea and there is no other
// reasonable alternative.
//
// This value can be changed with SetFallbackEncoding().
31 wxFontEncoding
wxConvAuto::ms_defaultMBEncoding
= wxFONTENCODING_ISO8859_1
;
// Byte sequences of the byte order marks (BOMs) handled by wxConvAuto, one
// array per encoding. GetBOMChars() returns pointers to these arrays together
// with their sizes.
36 const char BOM_UTF32BE
[] = { '\x00', '\x00', '\xFE', '\xFF' };
37 const char BOM_UTF32LE
[] = { '\xFF', '\xFE', '\x00', '\x00' };
38 const char BOM_UTF16BE
[] = { '\xFE', '\xFF' };
39 const char BOM_UTF16LE
[] = { '\xFF', '\xFE' };
40 const char BOM_UTF8
[] = { '\xEF', '\xBB', '\xBF' };
42 } // anonymous namespace
44 // ============================================================================
46 // ============================================================================
// Stores enc in ms_defaultMBEncoding.
//
// enc must not be wxFONTENCODING_DEFAULT; this is verified by the assertion
// below before the new value is stored.
49 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc
)
51 wxASSERT_MSG( enc
!= wxFONTENCODING_DEFAULT
,
52 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
54 ms_defaultMBEncoding
= enc
;
// Returns a pointer to the bytes of the BOM identified by bom and stores the
// number of these bytes in *count.
//
// count must be a non-NULL pointer, this is checked first.
//
// NOTE(review): the switch header and the labels/returns surrounding the two
// wxFAIL_MSG() calls are not visible in this listing, so which bom values
// reach them (and what is returned then) should be confirmed against the
// complete source.
58 const char* wxConvAuto::GetBOMChars(wxBOM bom
, size_t* count
)
60 wxCHECK_MSG( count
, NULL
, wxS("count pointer must be provided") );
// each case stores the size of the corresponding BOM array in *count and
// returns the array itself
64 case wxBOM_UTF32BE
: *count
= WXSIZEOF(BOM_UTF32BE
); return BOM_UTF32BE
;
65 case wxBOM_UTF32LE
: *count
= WXSIZEOF(BOM_UTF32LE
); return BOM_UTF32LE
;
66 case wxBOM_UTF16BE
: *count
= WXSIZEOF(BOM_UTF16BE
); return BOM_UTF16BE
;
67 case wxBOM_UTF16LE
: *count
= WXSIZEOF(BOM_UTF16LE
); return BOM_UTF16LE
;
68 case wxBOM_UTF8
: *count
= WXSIZEOF(BOM_UTF8
); return BOM_UTF8
;
71 wxFAIL_MSG( wxS("Invalid BOM type") );
75 wxFAIL_MSG( wxS("Unknown BOM type") );
// Examines the first srcLen bytes of src for a byte order mark and returns
// the corresponding wxBOM value.
//
// As explained in the comments below, inputs shorter than 4 bytes can be
// ambiguous; wxBOM_Unknown is returned when the decision can't be made yet.
//
// NOTE(review): the statements dispatching on srcLen and several return
// statements are not visible in this listing, so the exact result for every
// input should be confirmed against the complete source.
80 wxBOM
wxConvAuto::DetectBOM(const char *src
, size_t srcLen
)
82 // examine the buffer for BOM presence
84 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
86 // Bytes Encoding Form
88 // 00 00 FE FF UTF-32, big-endian
89 // FF FE 00 00 UTF-32, little-endian
90 // FE FF UTF-16, big-endian
91 // FF FE UTF-16, little-endian
// EF BB BF UTF-8
94 // as some BOMs are prefixes of other ones we may need to read more bytes
95 // to disambiguate them
100 return wxBOM_Unknown
;
// a single byte equal to the first byte of any of the BOMs above
103 if ( src
[0] == '\x00' || src
[0] == '\xFF' ||
104 src
[0] == '\xFE' || src
[0] == '\xEF')
106 // this could be a BOM but we don't know yet
107 return wxBOM_Unknown
;
// first two bytes of the UTF-8 BOM: the third one decides
113 if ( src
[0] == '\xEF' && src
[1] == '\xBB' )
116 return src
[2] == '\xBF' ? wxBOM_UTF8
: wxBOM_None
;
118 return wxBOM_Unknown
;
121 if ( src
[0] == '\xFE' && src
[1] == '\xFF' )
122 return wxBOM_UTF16BE
;
124 if ( src
[0] == '\xFF' && src
[1] == '\xFE' )
126 // if the next byte is 0, it could be a UTF-32LE BOM but if it
127 // isn't we can be sure it's UTF-16LE
128 if ( srcLen
== 3 && src
[2] != '\x00' )
129 return wxBOM_UTF16LE
;
131 return wxBOM_Unknown
;
134 if ( src
[0] == '\x00' && src
[1] == '\x00' )
136 // this could only be UTF-32BE, check that the data we have so
// far is compatible with it
138 if ( srcLen
== 3 && src
[2] != '\xFE' )
141 return wxBOM_Unknown
;
146 // we have at least 4 characters so we may finally decide whether
147 // we have a BOM or not
148 if ( src
[0] == '\xEF' && src
[1] == '\xBB' && src
[2] == '\xBF' )
// the 4 byte UTF-32 BOMs are tested before the 2 byte UTF-16 ones because
// the UTF-16LE BOM is a prefix of the UTF-32LE one
151 if ( src
[0] == '\x00' && src
[1] == '\x00' &&
152 src
[2] == '\xFE' && src
[3] == '\xFF' )
153 return wxBOM_UTF32BE
;
155 if ( src
[0] == '\xFF' && src
[1] == '\xFE' &&
156 src
[2] == '\x00' && src
[3] == '\x00' )
157 return wxBOM_UTF32LE
;
159 if ( src
[0] == '\xFE' && src
[1] == '\xFF' )
160 return wxBOM_UTF16BE
;
162 if ( src
[0] == '\xFF' && src
[1] == '\xFE' )
163 return wxBOM_UTF16LE
;
// Initializes the conversion object according to bomType.
//
// m_consumedBOM is reset to false first. For the UTF-32 and UTF-16 BOMs a new
// wxMBConvUTF32BE, wxMBConvUTF32LE, wxMBConvUTF16BE or wxMBConvUTF16LE object
// is assigned to m_conv. When there is no (recognized) BOM, m_consumedBOM is
// set to true as there is nothing to skip.
//
// NOTE(review): the case labels selecting each branch, and the statements
// handling the remaining BOM types, are not visible in this listing.
169 void wxConvAuto::InitFromBOM(wxBOM bomType
)
171 m_consumedBOM
= false;
176 wxFAIL_MSG( "shouldn't be called for this BOM type" );
184 m_conv
= new wxMBConvUTF32BE
;
189 m_conv
= new wxMBConvUTF32LE
;
194 m_conv
= new wxMBConvUTF16BE
;
199 m_conv
= new wxMBConvUTF16LE
;
208 wxFAIL_MSG( "unknown BOM type" );
213 // we end up here if there is no BOM or we didn't recognize it somehow
214 // (this shouldn't happen but still don't crash if it does), so use the
// default conversion (NOTE(review): the statement selecting it is not
// visible in this listing)
217 m_consumedBOM
= true; // as there is nothing to consume
// Skips the BOM at the start of the input described by *src and *len.
//
// NOTE(review): only the signature, the two failure assertions and the test
// of *len against (size_t)-1 are visible in this listing. Presumably *src is
// advanced past the BOM bytes and *len is decreased accordingly unless it is
// (size_t)-1 -- confirm against the complete source.
221 void wxConvAuto::SkipBOM(const char **src
, size_t *len
) const
227 wxFAIL_MSG( "shouldn't be called for this BOM type" );
249 wxFAIL_MSG( "unknown BOM type" );
254 if ( *len
!= (size_t)-1 )
// Detects the BOM at the start of src, stores its type in m_bomType and
// initializes the conversion from it by calling InitFromBOM().
//
// If len is wxNO_LEN, strlen(src) is used as the input length.
//
// NOTE(review): the return statements are not visible in this listing;
// ToWChar() treats a false return value as "not enough data to decide", which
// presumably corresponds to the wxBOM_Unknown check below -- confirm.
258 bool wxConvAuto::InitFromInput(const char *src
, size_t len
)
260 m_bomType
= DetectBOM(src
, len
== wxNO_LEN
? strlen(src
) : len
);
261 if ( m_bomType
== wxBOM_Unknown
)
264 InitFromBOM(m_bomType
);
// Converts the multibyte input src to wide characters using the conversion
// selected from the BOM found at the start of the input.
//
// wxCONV_FAILED is returned when InitFromInput() fails, i.e. when there is
// not enough data to decide whether a BOM is present, and also in the case
// described below where nothing but the BOM is left to convert.
//
// If the conversion with m_conv fails while m_bomType is wxBOM_None, a
// wxCSConv is installed as m_conv (unless m_encDefault is wxFONTENCODING_MAX)
// and the conversion is retried with it.
//
// NOTE(review): the return type line, the conditions guarding some of the
// statements below and the final return are not visible in this listing.
270 wxConvAuto::ToWChar(wchar_t *dst
, size_t dstLen
,
271 const char *src
, size_t srcLen
) const
273 // we check BOM and create the appropriate conversion the first time we're
274 // called but we also need to ensure that the BOM is skipped not only
275 // during this initial call but also during the first call with non-NULL
276 // dst as typically we're first called with NULL dst to calculate the
277 // needed buffer size
// this method is const, so a non-const pointer to this object is needed to
// update its state
278 wxConvAuto
*self
= const_cast<wxConvAuto
*>(this);
283 if ( !self
->InitFromInput(src
, srcLen
) )
285 // there is not enough data to determine whether we have a BOM or
286 // not, so fail for now -- the caller is supposed to call us again
// with more data
288 return wxCONV_FAILED
;
292 if ( !m_consumedBOM
)
294 SkipBOM(&src
, &srcLen
);
297 // there is nothing left except the BOM so we'd return 0 below but
298 // this is unexpected: decoding a non-empty string must either fail
299 // or return something non-empty, in particular this would break
300 // the code in wxTextInputStream::NextChar()
302 // so still return an error as we need some more data to be able to
// produce any output
304 return wxCONV_FAILED
;
308 // try to convert using the auto-detected encoding
309 size_t rc
= m_conv
->ToWChar(dst
, dstLen
, src
, srcLen
);
310 if ( rc
== wxCONV_FAILED
&& m_bomType
== wxBOM_None
)
312 // if the conversion failed but we didn't really detect anything and
313 // simply tried UTF-8 by default, retry it using the fall-back
// encoding
314 if ( m_encDefault
!= wxFONTENCODING_MAX
)
319 self
->m_conv
= new wxCSConv(m_encDefault
== wxFONTENCODING_DEFAULT
320 ? GetFallbackEncoding()
322 self
->m_ownsConv
= true;
324 rc
= m_conv
->ToWChar(dst
, dstLen
, src
, srcLen
);
328 // don't skip the BOM again the next time if we really consumed it
329 if ( rc
!= wxCONV_FAILED
&& dst
&& !m_consumedBOM
)
330 self
->m_consumedBOM
= true;
// Converts the wide character string src to multibyte output by forwarding
// to m_conv->FromWChar() and returns its result.
//
// InitWithUTF8() is called through a non-const pointer to this object so
// that UTF-8 is used for the output by default.
//
// NOTE(review): the return type line and the condition (if any) guarding the
// InitWithUTF8() call are not visible in this listing.
336 wxConvAuto::FromWChar(char *dst
, size_t dstLen
,
337 const wchar_t *src
, size_t srcLen
) const
341 // default to UTF-8 for the multibyte output
342 const_cast<wxConvAuto
*>(this)->InitWithUTF8();
345 return m_conv
->FromWChar(dst
, dstLen
, src
, srcLen
);