]> git.saurik.com Git - wxWidgets.git/blame - src/common/convauto.cpp
Increase the number of index items shown by default in wxHTML.
[wxWidgets.git] / src / common / convauto.cpp
CommitLineData
830f8f11
VZ
1///////////////////////////////////////////////////////////////////////////////
2// Name: src/common/convauto.cpp
3// Purpose: implementation of wxConvAuto
4// Author: Vadim Zeitlin
5// Created: 2006-04-04
6// RCS-ID: $Id$
7// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8// Licence: wxWindows licence
9///////////////////////////////////////////////////////////////////////////////
10
11// ============================================================================
12// declarations
13// ============================================================================
14
15// ----------------------------------------------------------------------------
16// headers
17// ----------------------------------------------------------------------------
18
19// for compilers that support precompilation, includes "wx.h".
20#include "wx/wxprec.h"
21
22#ifdef __BORLANDC__
23 #pragma hdrstop
24#endif
25
830f8f11
VZ
26#include "wx/convauto.h"
27
01a9232b
VZ
28// we use latin1 by default as it seems the least bad choice: the files we need
29// to detect input of don't always come from the user system (they are often
30// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
31// seem to be a good idea and there is no other reasonable alternative
32wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
33
830f8f11
VZ
34// ============================================================================
35// implementation
36// ============================================================================
37
01a9232b
VZ
38/* static */
39void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
40{
41 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
9a83f860 42 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
01a9232b
VZ
43
44 ms_defaultMBEncoding = enc;
45}
46
830f8f11
VZ
47/* static */
48wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
49{
830f8f11
VZ
50 // examine the buffer for BOM presence
51 //
4cb0e8d0
VZ
52 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
53 //
54 // Bytes Encoding Form
55 //
56 // 00 00 FE FF UTF-32, big-endian
57 // FF FE 00 00 UTF-32, little-endian
58 // FE FF UTF-16, big-endian
59 // FF FE UTF-16, little-endian
60 // EF BB BF UTF-8
61 //
62 // as some BOMs are prefixes of other ones we may need to read more bytes
63 // to disambiguate them
64
65 switch ( srcLen )
830f8f11 66 {
4cb0e8d0
VZ
67 case 0:
68 return BOM_Unknown;
69
70 case 1:
71 if ( src[0] == '\x00' || src[0] == '\xFF' ||
72 src[0] == '\xFE' || src[0] == '\xEF')
830f8f11 73 {
4cb0e8d0
VZ
74 // this could be a BOM but we don't know yet
75 return BOM_Unknown;
830f8f11
VZ
76 }
77 break;
78
4cb0e8d0
VZ
79 case 2:
80 case 3:
81 if ( src[0] == '\xEF' && src[1] == '\xBB' )
830f8f11 82 {
4cb0e8d0
VZ
83 if ( srcLen == 3 )
84 return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
85
86 return BOM_Unknown;
830f8f11 87 }
830f8f11 88
4cb0e8d0
VZ
89 if ( src[0] == '\xFE' && src[1] == '\xFF' )
90 return BOM_UTF16BE;
91
92 if ( src[0] == '\xFF' && src[1] == '\xFE' )
830f8f11 93 {
4cb0e8d0
VZ
94 // if the next byte is 0, it could be an UTF-32LE BOM but if it
95 // isn't we can be sure it's UTF-16LE
96 if ( srcLen == 3 && src[2] != '\x00' )
97 return BOM_UTF16LE;
98
99 return BOM_Unknown;
830f8f11 100 }
830f8f11 101
4cb0e8d0 102 if ( src[0] == '\x00' && src[1] == '\x00' )
830f8f11 103 {
823e82e2
VZ
104 // this could only be UTF-32BE, check that the data we have so
105 // far allows for it
106 if ( srcLen == 3 && src[2] != '\xFE' )
107 return BOM_None;
4cb0e8d0 108
823e82e2
VZ
109 return BOM_Unknown;
110 }
830f8f11 111 break;
4cb0e8d0
VZ
112
113 default:
114 // we have at least 4 characters so we may finally decide whether
115 // we have a BOM or not
116 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
117 return BOM_UTF8;
118
119 if ( src[0] == '\x00' && src[1] == '\x00' &&
120 src[2] == '\xFE' && src[3] == '\xFF' )
121 return BOM_UTF32BE;
122
123 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
124 src[2] == '\x00' && src[3] == '\x00' )
125 return BOM_UTF32LE;
126
127 if ( src[0] == '\xFE' && src[1] == '\xFF' )
128 return BOM_UTF16BE;
129
130 if ( src[0] == '\xFF' && src[1] == '\xFE' )
131 return BOM_UTF16LE;
830f8f11
VZ
132 }
133
134 return BOM_None;
135}
136
137void wxConvAuto::InitFromBOM(BOMType bomType)
138{
139 m_consumedBOM = false;
140
141 switch ( bomType )
142 {
4cb0e8d0
VZ
143 case BOM_Unknown:
144 wxFAIL_MSG( "shouldn't be called for this BOM type" );
145 break;
146
147 case BOM_None:
148 // use the default
149 break;
150
830f8f11
VZ
151 case BOM_UTF32BE:
152 m_conv = new wxMBConvUTF32BE;
153 m_ownsConv = true;
154 break;
155
156 case BOM_UTF32LE:
157 m_conv = new wxMBConvUTF32LE;
158 m_ownsConv = true;
159 break;
160
161 case BOM_UTF16BE:
162 m_conv = new wxMBConvUTF16BE;
163 m_ownsConv = true;
164 break;
165
166 case BOM_UTF16LE:
167 m_conv = new wxMBConvUTF16LE;
168 m_ownsConv = true;
169 break;
170
171 case BOM_UTF8:
01a9232b 172 InitWithUTF8();
830f8f11
VZ
173 break;
174
175 default:
4cb0e8d0
VZ
176 wxFAIL_MSG( "unknown BOM type" );
177 }
830f8f11 178
4cb0e8d0
VZ
179 if ( !m_conv )
180 {
181 // we end up here if there is no BOM or we didn't recognize it somehow
182 // (this shouldn't happen but still don't crash if it does), so use the
183 // default encoding
184 InitWithUTF8();
185 m_consumedBOM = true; // as there is nothing to consume
830f8f11
VZ
186 }
187}
188
189void wxConvAuto::SkipBOM(const char **src, size_t *len) const
190{
191 int ofs;
192 switch ( m_bomType )
193 {
4cb0e8d0
VZ
194 case BOM_Unknown:
195 wxFAIL_MSG( "shouldn't be called for this BOM type" );
196 return;
197
198 case BOM_None:
199 ofs = 0;
200 break;
201
830f8f11
VZ
202 case BOM_UTF32BE:
203 case BOM_UTF32LE:
204 ofs = 4;
205 break;
206
207 case BOM_UTF16BE:
208 case BOM_UTF16LE:
209 ofs = 2;
210 break;
211
212 case BOM_UTF8:
213 ofs = 3;
214 break;
215
216 default:
4cb0e8d0
VZ
217 wxFAIL_MSG( "unknown BOM type" );
218 return;
830f8f11
VZ
219 }
220
221 *src += ofs;
222 if ( *len != (size_t)-1 )
223 *len -= ofs;
224}
225
4ca97396 226bool wxConvAuto::InitFromInput(const char *src, size_t len)
830f8f11 227{
9334ad17 228 m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
4cb0e8d0
VZ
229 if ( m_bomType == BOM_Unknown )
230 return false;
231
830f8f11 232 InitFromBOM(m_bomType);
4cb0e8d0
VZ
233
234 return true;
830f8f11
VZ
235}
236
237size_t
238wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
239 const char *src, size_t srcLen) const
240{
241 // we check BOM and create the appropriate conversion the first time we're
242 // called but we also need to ensure that the BOM is skipped not only
243 // during this initial call but also during the first call with non-NULL
244 // dst as typically we're first called with NULL dst to calculate the
245 // needed buffer size
5c33522f 246 wxConvAuto *self = const_cast<wxConvAuto *>(this);
4cb0e8d0
VZ
247
248
830f8f11
VZ
249 if ( !m_conv )
250 {
4ca97396 251 if ( !self->InitFromInput(src, srcLen) )
4cb0e8d0
VZ
252 {
253 // there is not enough data to determine whether we have a BOM or
254 // not, so fail for now -- the caller is supposed to call us again
255 // with more data
256 return wxCONV_FAILED;
257 }
830f8f11 258 }
4ca97396
VZ
259
260 if ( !m_consumedBOM )
830f8f11 261 {
830f8f11 262 SkipBOM(&src, &srcLen);
4ca97396
VZ
263 if ( srcLen == 0 )
264 {
265 // there is nothing left except the BOM so we'd return 0 below but
266 // this is unexpected: decoding a non-empty string must either fail
267 // or return something non-empty, in particular this would break
268 // the code in wxTextInputStream::NextChar()
269 //
270 // so still return an error as we need some more data to be able to
271 // decode it
272 return wxCONV_FAILED;
273 }
830f8f11
VZ
274 }
275
01a9232b
VZ
276 // try to convert using the auto-detected encoding
277 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
278 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
279 {
280 // if the conversion failed but we didn't really detect anything and
281 // simply tried UTF-8 by default, retry it using the fall-back
282 if ( m_encDefault != wxFONTENCODING_MAX )
283 {
284 if ( m_ownsConv )
285 delete m_conv;
286
287 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
288 ? GetFallbackEncoding()
289 : m_encDefault);
290 self->m_ownsConv = true;
291
292 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
293 }
294 }
295
4ca97396
VZ
296 // don't skip the BOM again the next time if we really consumed it
297 if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
4cb0e8d0 298 self->m_consumedBOM = true;
4ca97396 299
01a9232b 300 return rc;
830f8f11
VZ
301}
302
303size_t
304wxConvAuto::FromWChar(char *dst, size_t dstLen,
305 const wchar_t *src, size_t srcLen) const
306{
307 if ( !m_conv )
308 {
309 // default to UTF-8 for the multibyte output
5c33522f 310 const_cast<wxConvAuto *>(this)->InitWithUTF8();
830f8f11
VZ
311 }
312
313 return m_conv->FromWChar(dst, dstLen, src, srcLen);
314}