]> git.saurik.com Git - wxWidgets.git/blame - src/common/convauto.cpp
No real changes, just fix a typo in comments and documentation.
[wxWidgets.git] / src / common / convauto.cpp
CommitLineData
830f8f11
VZ
1///////////////////////////////////////////////////////////////////////////////
2// Name: src/common/convauto.cpp
3// Purpose: implementation of wxConvAuto
4// Author: Vadim Zeitlin
5// Created: 2006-04-04
6// RCS-ID: $Id$
7// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8// Licence: wxWindows licence
9///////////////////////////////////////////////////////////////////////////////
10
11// ============================================================================
12// declarations
13// ============================================================================
14
15// ----------------------------------------------------------------------------
16// headers
17// ----------------------------------------------------------------------------
18
19// for compilers that support precompilation, includes "wx.h".
20#include "wx/wxprec.h"
21
22#ifdef __BORLANDC__
23 #pragma hdrstop
24#endif
25
26#if wxUSE_WCHAR_T
27
28#ifndef WX_PRECOMP
4cb0e8d0 29 #include "wx/wx.h"
830f8f11
VZ
30#endif //WX_PRECOMP
31
32#include "wx/convauto.h"
33
01a9232b
VZ
34// we use latin1 by default as it seems the least bad choice: the files we need
35// to detect input of don't always come from the user system (they are often
36// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
37// seem to be a good idea and there is no other reasonable alternative
38wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
39
830f8f11
VZ
40// ============================================================================
41// implementation
42// ============================================================================
43
01a9232b
VZ
44/* static */
45void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
46{
47 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
9a83f860 48 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
01a9232b
VZ
49
50 ms_defaultMBEncoding = enc;
51}
52
830f8f11
VZ
53/* static */
54wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
55{
830f8f11
VZ
56 // examine the buffer for BOM presence
57 //
4cb0e8d0
VZ
58 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
59 //
60 // Bytes Encoding Form
61 //
62 // 00 00 FE FF UTF-32, big-endian
63 // FF FE 00 00 UTF-32, little-endian
64 // FE FF UTF-16, big-endian
65 // FF FE UTF-16, little-endian
66 // EF BB BF UTF-8
67 //
68 // as some BOMs are prefixes of other ones we may need to read more bytes
69 // to disambiguate them
70
71 switch ( srcLen )
830f8f11 72 {
4cb0e8d0
VZ
73 case 0:
74 return BOM_Unknown;
75
76 case 1:
77 if ( src[0] == '\x00' || src[0] == '\xFF' ||
78 src[0] == '\xFE' || src[0] == '\xEF')
830f8f11 79 {
4cb0e8d0
VZ
80 // this could be a BOM but we don't know yet
81 return BOM_Unknown;
830f8f11
VZ
82 }
83 break;
84
4cb0e8d0
VZ
85 case 2:
86 case 3:
87 if ( src[0] == '\xEF' && src[1] == '\xBB' )
830f8f11 88 {
4cb0e8d0
VZ
89 if ( srcLen == 3 )
90 return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
91
92 return BOM_Unknown;
830f8f11 93 }
830f8f11 94
4cb0e8d0
VZ
95 if ( src[0] == '\xFE' && src[1] == '\xFF' )
96 return BOM_UTF16BE;
97
98 if ( src[0] == '\xFF' && src[1] == '\xFE' )
830f8f11 99 {
4cb0e8d0
VZ
100 // if the next byte is 0, it could be an UTF-32LE BOM but if it
101 // isn't we can be sure it's UTF-16LE
102 if ( srcLen == 3 && src[2] != '\x00' )
103 return BOM_UTF16LE;
104
105 return BOM_Unknown;
830f8f11 106 }
830f8f11 107
4cb0e8d0 108 if ( src[0] == '\x00' && src[1] == '\x00' )
830f8f11 109 {
4cb0e8d0
VZ
110 // this could only be UTF-32BE
111 if ( srcLen == 3 && src[2] == '\xFE' )
112 return BOM_Unknown;
830f8f11 113 }
4cb0e8d0 114
830f8f11 115 break;
4cb0e8d0
VZ
116
117 default:
118 // we have at least 4 characters so we may finally decide whether
119 // we have a BOM or not
120 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
121 return BOM_UTF8;
122
123 if ( src[0] == '\x00' && src[1] == '\x00' &&
124 src[2] == '\xFE' && src[3] == '\xFF' )
125 return BOM_UTF32BE;
126
127 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
128 src[2] == '\x00' && src[3] == '\x00' )
129 return BOM_UTF32LE;
130
131 if ( src[0] == '\xFE' && src[1] == '\xFF' )
132 return BOM_UTF16BE;
133
134 if ( src[0] == '\xFF' && src[1] == '\xFE' )
135 return BOM_UTF16LE;
830f8f11
VZ
136 }
137
138 return BOM_None;
139}
140
141void wxConvAuto::InitFromBOM(BOMType bomType)
142{
143 m_consumedBOM = false;
144
145 switch ( bomType )
146 {
4cb0e8d0
VZ
147 case BOM_Unknown:
148 wxFAIL_MSG( "shouldn't be called for this BOM type" );
149 break;
150
151 case BOM_None:
152 // use the default
153 break;
154
830f8f11
VZ
155 case BOM_UTF32BE:
156 m_conv = new wxMBConvUTF32BE;
157 m_ownsConv = true;
158 break;
159
160 case BOM_UTF32LE:
161 m_conv = new wxMBConvUTF32LE;
162 m_ownsConv = true;
163 break;
164
165 case BOM_UTF16BE:
166 m_conv = new wxMBConvUTF16BE;
167 m_ownsConv = true;
168 break;
169
170 case BOM_UTF16LE:
171 m_conv = new wxMBConvUTF16LE;
172 m_ownsConv = true;
173 break;
174
175 case BOM_UTF8:
01a9232b 176 InitWithUTF8();
830f8f11
VZ
177 break;
178
179 default:
4cb0e8d0
VZ
180 wxFAIL_MSG( "unknown BOM type" );
181 }
830f8f11 182
4cb0e8d0
VZ
183 if ( !m_conv )
184 {
185 // we end up here if there is no BOM or we didn't recognize it somehow
186 // (this shouldn't happen but still don't crash if it does), so use the
187 // default encoding
188 InitWithUTF8();
189 m_consumedBOM = true; // as there is nothing to consume
830f8f11
VZ
190 }
191}
192
193void wxConvAuto::SkipBOM(const char **src, size_t *len) const
194{
195 int ofs;
196 switch ( m_bomType )
197 {
4cb0e8d0
VZ
198 case BOM_Unknown:
199 wxFAIL_MSG( "shouldn't be called for this BOM type" );
200 return;
201
202 case BOM_None:
203 ofs = 0;
204 break;
205
830f8f11
VZ
206 case BOM_UTF32BE:
207 case BOM_UTF32LE:
208 ofs = 4;
209 break;
210
211 case BOM_UTF16BE:
212 case BOM_UTF16LE:
213 ofs = 2;
214 break;
215
216 case BOM_UTF8:
217 ofs = 3;
218 break;
219
220 default:
4cb0e8d0
VZ
221 wxFAIL_MSG( "unknown BOM type" );
222 return;
830f8f11
VZ
223 }
224
225 *src += ofs;
226 if ( *len != (size_t)-1 )
227 *len -= ofs;
228}
229
4cb0e8d0 230bool wxConvAuto::InitFromInput(const char **src, size_t *len)
830f8f11
VZ
231{
232 m_bomType = DetectBOM(*src, *len);
4cb0e8d0
VZ
233 if ( m_bomType == BOM_Unknown )
234 return false;
235
830f8f11
VZ
236 InitFromBOM(m_bomType);
237 SkipBOM(src, len);
4cb0e8d0
VZ
238
239 return true;
830f8f11
VZ
240}
241
242size_t
243wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
244 const char *src, size_t srcLen) const
245{
246 // we check BOM and create the appropriate conversion the first time we're
247 // called but we also need to ensure that the BOM is skipped not only
248 // during this initial call but also during the first call with non-NULL
249 // dst as typically we're first called with NULL dst to calculate the
250 // needed buffer size
5c33522f 251 wxConvAuto *self = const_cast<wxConvAuto *>(this);
4cb0e8d0
VZ
252
253
830f8f11
VZ
254 if ( !m_conv )
255 {
4cb0e8d0
VZ
256 if ( !self->InitFromInput(&src, &srcLen) )
257 {
258 // there is not enough data to determine whether we have a BOM or
259 // not, so fail for now -- the caller is supposed to call us again
260 // with more data
261 return wxCONV_FAILED;
262 }
830f8f11 263 }
4cb0e8d0 264 else if ( !m_consumedBOM && dst )
830f8f11 265 {
830f8f11
VZ
266 SkipBOM(&src, &srcLen);
267 }
268
01a9232b
VZ
269 // try to convert using the auto-detected encoding
270 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
271 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
272 {
273 // if the conversion failed but we didn't really detect anything and
274 // simply tried UTF-8 by default, retry it using the fall-back
275 if ( m_encDefault != wxFONTENCODING_MAX )
276 {
277 if ( m_ownsConv )
278 delete m_conv;
279
280 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
281 ? GetFallbackEncoding()
282 : m_encDefault);
283 self->m_ownsConv = true;
284
285 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
286 }
287 }
288
4cb0e8d0
VZ
289 if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
290 self->m_consumedBOM = true;
01a9232b 291 return rc;
830f8f11
VZ
292}
293
294size_t
295wxConvAuto::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
297{
298 if ( !m_conv )
299 {
300 // default to UTF-8 for the multibyte output
5c33522f 301 const_cast<wxConvAuto *>(this)->InitWithUTF8();
830f8f11
VZ
302 }
303
304 return m_conv->FromWChar(dst, dstLen, src, srcLen);
305}
306
307#endif // wxUSE_WCHAR_T