]> git.saurik.com Git - wxWidgets.git/blame - src/common/convauto.cpp
Don't crash on malformed HTML in wxHTML font tag handler.
[wxWidgets.git] / src / common / convauto.cpp
CommitLineData
830f8f11
VZ
1///////////////////////////////////////////////////////////////////////////////
2// Name: src/common/convauto.cpp
3// Purpose: implementation of wxConvAuto
4// Author: Vadim Zeitlin
5// Created: 2006-04-04
6// RCS-ID: $Id$
7// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8// Licence: wxWindows licence
9///////////////////////////////////////////////////////////////////////////////
10
11// ============================================================================
12// declarations
13// ============================================================================
14
15// ----------------------------------------------------------------------------
16// headers
17// ----------------------------------------------------------------------------
18
19// for compilers that support precompilation, includes "wx.h".
20#include "wx/wxprec.h"
21
22#ifdef __BORLANDC__
23 #pragma hdrstop
24#endif
25
830f8f11 26#ifndef WX_PRECOMP
4cb0e8d0 27 #include "wx/wx.h"
830f8f11
VZ
28#endif //WX_PRECOMP
29
30#include "wx/convauto.h"
31
01a9232b
VZ
// Latin-1 is used as the default fallback because it seems to be the least bad
// choice: the files whose encoding we need to detect don't always come from
// the user's system (they are often received from other machines), so using
// wxFONTENCODING_SYSTEM doesn't seem to be a good idea, and there is no other
// reasonable alternative.
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
37
830f8f11
VZ
38// ============================================================================
39// implementation
40// ============================================================================
41
01a9232b
VZ
42/* static */
43void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
44{
45 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
9a83f860 46 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
01a9232b
VZ
47
48 ms_defaultMBEncoding = enc;
49}
50
830f8f11
VZ
51/* static */
52wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
53{
830f8f11
VZ
54 // examine the buffer for BOM presence
55 //
4cb0e8d0
VZ
56 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
57 //
58 // Bytes Encoding Form
59 //
60 // 00 00 FE FF UTF-32, big-endian
61 // FF FE 00 00 UTF-32, little-endian
62 // FE FF UTF-16, big-endian
63 // FF FE UTF-16, little-endian
64 // EF BB BF UTF-8
65 //
66 // as some BOMs are prefixes of other ones we may need to read more bytes
67 // to disambiguate them
68
69 switch ( srcLen )
830f8f11 70 {
4cb0e8d0
VZ
71 case 0:
72 return BOM_Unknown;
73
74 case 1:
75 if ( src[0] == '\x00' || src[0] == '\xFF' ||
76 src[0] == '\xFE' || src[0] == '\xEF')
830f8f11 77 {
4cb0e8d0
VZ
78 // this could be a BOM but we don't know yet
79 return BOM_Unknown;
830f8f11
VZ
80 }
81 break;
82
4cb0e8d0
VZ
83 case 2:
84 case 3:
85 if ( src[0] == '\xEF' && src[1] == '\xBB' )
830f8f11 86 {
4cb0e8d0
VZ
87 if ( srcLen == 3 )
88 return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
89
90 return BOM_Unknown;
830f8f11 91 }
830f8f11 92
4cb0e8d0
VZ
93 if ( src[0] == '\xFE' && src[1] == '\xFF' )
94 return BOM_UTF16BE;
95
96 if ( src[0] == '\xFF' && src[1] == '\xFE' )
830f8f11 97 {
4cb0e8d0
VZ
98 // if the next byte is 0, it could be an UTF-32LE BOM but if it
99 // isn't we can be sure it's UTF-16LE
100 if ( srcLen == 3 && src[2] != '\x00' )
101 return BOM_UTF16LE;
102
103 return BOM_Unknown;
830f8f11 104 }
830f8f11 105
4cb0e8d0 106 if ( src[0] == '\x00' && src[1] == '\x00' )
830f8f11 107 {
823e82e2
VZ
108 // this could only be UTF-32BE, check that the data we have so
109 // far allows for it
110 if ( srcLen == 3 && src[2] != '\xFE' )
111 return BOM_None;
4cb0e8d0 112
823e82e2
VZ
113 return BOM_Unknown;
114 }
830f8f11 115 break;
4cb0e8d0
VZ
116
117 default:
118 // we have at least 4 characters so we may finally decide whether
119 // we have a BOM or not
120 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
121 return BOM_UTF8;
122
123 if ( src[0] == '\x00' && src[1] == '\x00' &&
124 src[2] == '\xFE' && src[3] == '\xFF' )
125 return BOM_UTF32BE;
126
127 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
128 src[2] == '\x00' && src[3] == '\x00' )
129 return BOM_UTF32LE;
130
131 if ( src[0] == '\xFE' && src[1] == '\xFF' )
132 return BOM_UTF16BE;
133
134 if ( src[0] == '\xFF' && src[1] == '\xFE' )
135 return BOM_UTF16LE;
830f8f11
VZ
136 }
137
138 return BOM_None;
139}
140
141void wxConvAuto::InitFromBOM(BOMType bomType)
142{
143 m_consumedBOM = false;
144
145 switch ( bomType )
146 {
4cb0e8d0
VZ
147 case BOM_Unknown:
148 wxFAIL_MSG( "shouldn't be called for this BOM type" );
149 break;
150
151 case BOM_None:
152 // use the default
153 break;
154
830f8f11
VZ
155 case BOM_UTF32BE:
156 m_conv = new wxMBConvUTF32BE;
157 m_ownsConv = true;
158 break;
159
160 case BOM_UTF32LE:
161 m_conv = new wxMBConvUTF32LE;
162 m_ownsConv = true;
163 break;
164
165 case BOM_UTF16BE:
166 m_conv = new wxMBConvUTF16BE;
167 m_ownsConv = true;
168 break;
169
170 case BOM_UTF16LE:
171 m_conv = new wxMBConvUTF16LE;
172 m_ownsConv = true;
173 break;
174
175 case BOM_UTF8:
01a9232b 176 InitWithUTF8();
830f8f11
VZ
177 break;
178
179 default:
4cb0e8d0
VZ
180 wxFAIL_MSG( "unknown BOM type" );
181 }
830f8f11 182
4cb0e8d0
VZ
183 if ( !m_conv )
184 {
185 // we end up here if there is no BOM or we didn't recognize it somehow
186 // (this shouldn't happen but still don't crash if it does), so use the
187 // default encoding
188 InitWithUTF8();
189 m_consumedBOM = true; // as there is nothing to consume
830f8f11
VZ
190 }
191}
192
193void wxConvAuto::SkipBOM(const char **src, size_t *len) const
194{
195 int ofs;
196 switch ( m_bomType )
197 {
4cb0e8d0
VZ
198 case BOM_Unknown:
199 wxFAIL_MSG( "shouldn't be called for this BOM type" );
200 return;
201
202 case BOM_None:
203 ofs = 0;
204 break;
205
830f8f11
VZ
206 case BOM_UTF32BE:
207 case BOM_UTF32LE:
208 ofs = 4;
209 break;
210
211 case BOM_UTF16BE:
212 case BOM_UTF16LE:
213 ofs = 2;
214 break;
215
216 case BOM_UTF8:
217 ofs = 3;
218 break;
219
220 default:
4cb0e8d0
VZ
221 wxFAIL_MSG( "unknown BOM type" );
222 return;
830f8f11
VZ
223 }
224
225 *src += ofs;
226 if ( *len != (size_t)-1 )
227 *len -= ofs;
228}
229
4ca97396 230bool wxConvAuto::InitFromInput(const char *src, size_t len)
830f8f11 231{
9334ad17 232 m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
4cb0e8d0
VZ
233 if ( m_bomType == BOM_Unknown )
234 return false;
235
830f8f11 236 InitFromBOM(m_bomType);
4cb0e8d0
VZ
237
238 return true;
830f8f11
VZ
239}
240
241size_t
242wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
243 const char *src, size_t srcLen) const
244{
245 // we check BOM and create the appropriate conversion the first time we're
246 // called but we also need to ensure that the BOM is skipped not only
247 // during this initial call but also during the first call with non-NULL
248 // dst as typically we're first called with NULL dst to calculate the
249 // needed buffer size
5c33522f 250 wxConvAuto *self = const_cast<wxConvAuto *>(this);
4cb0e8d0
VZ
251
252
830f8f11
VZ
253 if ( !m_conv )
254 {
4ca97396 255 if ( !self->InitFromInput(src, srcLen) )
4cb0e8d0
VZ
256 {
257 // there is not enough data to determine whether we have a BOM or
258 // not, so fail for now -- the caller is supposed to call us again
259 // with more data
260 return wxCONV_FAILED;
261 }
830f8f11 262 }
4ca97396
VZ
263
264 if ( !m_consumedBOM )
830f8f11 265 {
830f8f11 266 SkipBOM(&src, &srcLen);
4ca97396
VZ
267 if ( srcLen == 0 )
268 {
269 // there is nothing left except the BOM so we'd return 0 below but
270 // this is unexpected: decoding a non-empty string must either fail
271 // or return something non-empty, in particular this would break
272 // the code in wxTextInputStream::NextChar()
273 //
274 // so still return an error as we need some more data to be able to
275 // decode it
276 return wxCONV_FAILED;
277 }
830f8f11
VZ
278 }
279
01a9232b
VZ
280 // try to convert using the auto-detected encoding
281 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
282 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
283 {
284 // if the conversion failed but we didn't really detect anything and
285 // simply tried UTF-8 by default, retry it using the fall-back
286 if ( m_encDefault != wxFONTENCODING_MAX )
287 {
288 if ( m_ownsConv )
289 delete m_conv;
290
291 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
292 ? GetFallbackEncoding()
293 : m_encDefault);
294 self->m_ownsConv = true;
295
296 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
297 }
298 }
299
4ca97396
VZ
300 // don't skip the BOM again the next time if we really consumed it
301 if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
4cb0e8d0 302 self->m_consumedBOM = true;
4ca97396 303
01a9232b 304 return rc;
830f8f11
VZ
305}
306
307size_t
308wxConvAuto::FromWChar(char *dst, size_t dstLen,
309 const wchar_t *src, size_t srcLen) const
310{
311 if ( !m_conv )
312 {
313 // default to UTF-8 for the multibyte output
5c33522f 314 const_cast<wxConvAuto *>(this)->InitWithUTF8();
830f8f11
VZ
315 }
316
317 return m_conv->FromWChar(dst, dstLen, src, srcLen);
318}