]> git.saurik.com Git - wxWidgets.git/blame - src/common/convauto.cpp
Fix wxHtmlHelpData::SetTempDir() to behave correctly without trailing slash.
[wxWidgets.git] / src / common / convauto.cpp
CommitLineData
830f8f11
VZ
1///////////////////////////////////////////////////////////////////////////////
2// Name: src/common/convauto.cpp
3// Purpose: implementation of wxConvAuto
4// Author: Vadim Zeitlin
5// Created: 2006-04-04
830f8f11
VZ
6// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
7// Licence: wxWindows licence
8///////////////////////////////////////////////////////////////////////////////
9
10// ============================================================================
11// declarations
12// ============================================================================
13
14// ----------------------------------------------------------------------------
15// headers
16// ----------------------------------------------------------------------------
17
18// for compilers that support precompilation, includes "wx.h".
19#include "wx/wxprec.h"
20
21#ifdef __BORLANDC__
22 #pragma hdrstop
23#endif
24
830f8f11
VZ
25#include "wx/convauto.h"
26
01a9232b
VZ
27// we use latin1 by default as it seems the least bad choice: the files we need
28// to detect input of don't always come from the user system (they are often
29// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
30// seem to be a good idea and there is no other reasonable alternative
31wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
32
64b91e2d
VZ
namespace
{

// Raw byte sequences of the byte order marks this file knows about. None of
// the arrays is NUL-terminated: their sizes are exactly the BOM lengths.

// UTF-8 (3 bytes)
const char BOM_UTF8[3] = { '\xEF', '\xBB', '\xBF' };

// UTF-16 (2 bytes), big- and little-endian
const char BOM_UTF16BE[2] = { '\xFE', '\xFF' };
const char BOM_UTF16LE[2] = { '\xFF', '\xFE' };

// UTF-32 (4 bytes), big- and little-endian
const char BOM_UTF32BE[4] = { '\x00', '\x00', '\xFE', '\xFF' };
const char BOM_UTF32LE[4] = { '\xFF', '\xFE', '\x00', '\x00' };

} // anonymous namespace
43
830f8f11
VZ
44// ============================================================================
45// implementation
46// ============================================================================
47
01a9232b
VZ
48/* static */
49void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
50{
51 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
9a83f860 52 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
01a9232b
VZ
53
54 ms_defaultMBEncoding = enc;
55}
56
64b91e2d
VZ
57/* static */
58const char* wxConvAuto::GetBOMChars(wxBOM bom, size_t* count)
59{
60 wxCHECK_MSG( count , NULL, wxS("count pointer must be provided") );
61
62 switch ( bom )
63 {
64 case wxBOM_UTF32BE: *count = WXSIZEOF(BOM_UTF32BE); return BOM_UTF32BE;
65 case wxBOM_UTF32LE: *count = WXSIZEOF(BOM_UTF32LE); return BOM_UTF32LE;
66 case wxBOM_UTF16BE: *count = WXSIZEOF(BOM_UTF16BE); return BOM_UTF16BE;
67 case wxBOM_UTF16LE: *count = WXSIZEOF(BOM_UTF16LE); return BOM_UTF16LE;
68 case wxBOM_UTF8 : *count = WXSIZEOF(BOM_UTF8 ); return BOM_UTF8;
69 case wxBOM_Unknown:
70 case wxBOM_None:
71 wxFAIL_MSG( wxS("Invalid BOM type") );
72 return NULL;
73 }
74
75 wxFAIL_MSG( wxS("Unknown BOM type") );
76 return NULL;
77}
78
830f8f11 79/* static */
038809c2 80wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
830f8f11 81{
830f8f11
VZ
82 // examine the buffer for BOM presence
83 //
4cb0e8d0
VZ
84 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
85 //
86 // Bytes Encoding Form
87 //
88 // 00 00 FE FF UTF-32, big-endian
89 // FF FE 00 00 UTF-32, little-endian
90 // FE FF UTF-16, big-endian
91 // FF FE UTF-16, little-endian
92 // EF BB BF UTF-8
93 //
94 // as some BOMs are prefixes of other ones we may need to read more bytes
95 // to disambiguate them
96
97 switch ( srcLen )
830f8f11 98 {
4cb0e8d0 99 case 0:
038809c2 100 return wxBOM_Unknown;
4cb0e8d0
VZ
101
102 case 1:
103 if ( src[0] == '\x00' || src[0] == '\xFF' ||
104 src[0] == '\xFE' || src[0] == '\xEF')
830f8f11 105 {
4cb0e8d0 106 // this could be a BOM but we don't know yet
038809c2 107 return wxBOM_Unknown;
830f8f11
VZ
108 }
109 break;
110
4cb0e8d0
VZ
111 case 2:
112 case 3:
113 if ( src[0] == '\xEF' && src[1] == '\xBB' )
830f8f11 114 {
4cb0e8d0 115 if ( srcLen == 3 )
038809c2 116 return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
4cb0e8d0 117
038809c2 118 return wxBOM_Unknown;
830f8f11 119 }
830f8f11 120
4cb0e8d0 121 if ( src[0] == '\xFE' && src[1] == '\xFF' )
038809c2 122 return wxBOM_UTF16BE;
4cb0e8d0
VZ
123
124 if ( src[0] == '\xFF' && src[1] == '\xFE' )
830f8f11 125 {
4cb0e8d0
VZ
126 // if the next byte is 0, it could be an UTF-32LE BOM but if it
127 // isn't we can be sure it's UTF-16LE
128 if ( srcLen == 3 && src[2] != '\x00' )
038809c2 129 return wxBOM_UTF16LE;
4cb0e8d0 130
038809c2 131 return wxBOM_Unknown;
830f8f11 132 }
830f8f11 133
4cb0e8d0 134 if ( src[0] == '\x00' && src[1] == '\x00' )
830f8f11 135 {
823e82e2
VZ
136 // this could only be UTF-32BE, check that the data we have so
137 // far allows for it
138 if ( srcLen == 3 && src[2] != '\xFE' )
038809c2 139 return wxBOM_None;
4cb0e8d0 140
038809c2 141 return wxBOM_Unknown;
823e82e2 142 }
830f8f11 143 break;
4cb0e8d0
VZ
144
145 default:
146 // we have at least 4 characters so we may finally decide whether
147 // we have a BOM or not
148 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
038809c2 149 return wxBOM_UTF8;
4cb0e8d0
VZ
150
151 if ( src[0] == '\x00' && src[1] == '\x00' &&
152 src[2] == '\xFE' && src[3] == '\xFF' )
038809c2 153 return wxBOM_UTF32BE;
4cb0e8d0
VZ
154
155 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
156 src[2] == '\x00' && src[3] == '\x00' )
038809c2 157 return wxBOM_UTF32LE;
4cb0e8d0
VZ
158
159 if ( src[0] == '\xFE' && src[1] == '\xFF' )
038809c2 160 return wxBOM_UTF16BE;
4cb0e8d0
VZ
161
162 if ( src[0] == '\xFF' && src[1] == '\xFE' )
038809c2 163 return wxBOM_UTF16LE;
830f8f11
VZ
164 }
165
038809c2 166 return wxBOM_None;
830f8f11
VZ
167}
168
038809c2 169void wxConvAuto::InitFromBOM(wxBOM bomType)
830f8f11
VZ
170{
171 m_consumedBOM = false;
172
173 switch ( bomType )
174 {
038809c2 175 case wxBOM_Unknown:
4cb0e8d0
VZ
176 wxFAIL_MSG( "shouldn't be called for this BOM type" );
177 break;
178
038809c2 179 case wxBOM_None:
4cb0e8d0
VZ
180 // use the default
181 break;
182
038809c2 183 case wxBOM_UTF32BE:
830f8f11
VZ
184 m_conv = new wxMBConvUTF32BE;
185 m_ownsConv = true;
186 break;
187
038809c2 188 case wxBOM_UTF32LE:
830f8f11
VZ
189 m_conv = new wxMBConvUTF32LE;
190 m_ownsConv = true;
191 break;
192
038809c2 193 case wxBOM_UTF16BE:
830f8f11
VZ
194 m_conv = new wxMBConvUTF16BE;
195 m_ownsConv = true;
196 break;
197
038809c2 198 case wxBOM_UTF16LE:
830f8f11
VZ
199 m_conv = new wxMBConvUTF16LE;
200 m_ownsConv = true;
201 break;
202
038809c2 203 case wxBOM_UTF8:
01a9232b 204 InitWithUTF8();
830f8f11
VZ
205 break;
206
207 default:
4cb0e8d0
VZ
208 wxFAIL_MSG( "unknown BOM type" );
209 }
830f8f11 210
4cb0e8d0
VZ
211 if ( !m_conv )
212 {
213 // we end up here if there is no BOM or we didn't recognize it somehow
214 // (this shouldn't happen but still don't crash if it does), so use the
215 // default encoding
216 InitWithUTF8();
217 m_consumedBOM = true; // as there is nothing to consume
830f8f11
VZ
218 }
219}
220
221void wxConvAuto::SkipBOM(const char **src, size_t *len) const
222{
223 int ofs;
224 switch ( m_bomType )
225 {
038809c2 226 case wxBOM_Unknown:
4cb0e8d0
VZ
227 wxFAIL_MSG( "shouldn't be called for this BOM type" );
228 return;
229
038809c2 230 case wxBOM_None:
4cb0e8d0
VZ
231 ofs = 0;
232 break;
233
038809c2
VZ
234 case wxBOM_UTF32BE:
235 case wxBOM_UTF32LE:
830f8f11
VZ
236 ofs = 4;
237 break;
238
038809c2
VZ
239 case wxBOM_UTF16BE:
240 case wxBOM_UTF16LE:
830f8f11
VZ
241 ofs = 2;
242 break;
243
038809c2 244 case wxBOM_UTF8:
830f8f11
VZ
245 ofs = 3;
246 break;
247
248 default:
4cb0e8d0
VZ
249 wxFAIL_MSG( "unknown BOM type" );
250 return;
830f8f11
VZ
251 }
252
253 *src += ofs;
254 if ( *len != (size_t)-1 )
255 *len -= ofs;
256}
257
4ca97396 258bool wxConvAuto::InitFromInput(const char *src, size_t len)
830f8f11 259{
9334ad17 260 m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
038809c2 261 if ( m_bomType == wxBOM_Unknown )
4cb0e8d0
VZ
262 return false;
263
830f8f11 264 InitFromBOM(m_bomType);
4cb0e8d0
VZ
265
266 return true;
830f8f11
VZ
267}
268
269size_t
270wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
271 const char *src, size_t srcLen) const
272{
273 // we check BOM and create the appropriate conversion the first time we're
274 // called but we also need to ensure that the BOM is skipped not only
275 // during this initial call but also during the first call with non-NULL
276 // dst as typically we're first called with NULL dst to calculate the
277 // needed buffer size
5c33522f 278 wxConvAuto *self = const_cast<wxConvAuto *>(this);
4cb0e8d0
VZ
279
280
830f8f11
VZ
281 if ( !m_conv )
282 {
4ca97396 283 if ( !self->InitFromInput(src, srcLen) )
4cb0e8d0
VZ
284 {
285 // there is not enough data to determine whether we have a BOM or
286 // not, so fail for now -- the caller is supposed to call us again
287 // with more data
288 return wxCONV_FAILED;
289 }
830f8f11 290 }
4ca97396
VZ
291
292 if ( !m_consumedBOM )
830f8f11 293 {
830f8f11 294 SkipBOM(&src, &srcLen);
4ca97396
VZ
295 if ( srcLen == 0 )
296 {
297 // there is nothing left except the BOM so we'd return 0 below but
298 // this is unexpected: decoding a non-empty string must either fail
299 // or return something non-empty, in particular this would break
300 // the code in wxTextInputStream::NextChar()
301 //
302 // so still return an error as we need some more data to be able to
303 // decode it
304 return wxCONV_FAILED;
305 }
830f8f11
VZ
306 }
307
01a9232b
VZ
308 // try to convert using the auto-detected encoding
309 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
038809c2 310 if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
01a9232b
VZ
311 {
312 // if the conversion failed but we didn't really detect anything and
313 // simply tried UTF-8 by default, retry it using the fall-back
314 if ( m_encDefault != wxFONTENCODING_MAX )
315 {
316 if ( m_ownsConv )
317 delete m_conv;
318
319 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
320 ? GetFallbackEncoding()
321 : m_encDefault);
322 self->m_ownsConv = true;
323
324 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
325 }
326 }
327
4ca97396
VZ
328 // don't skip the BOM again the next time if we really consumed it
329 if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
4cb0e8d0 330 self->m_consumedBOM = true;
4ca97396 331
01a9232b 332 return rc;
830f8f11
VZ
333}
334
335size_t
336wxConvAuto::FromWChar(char *dst, size_t dstLen,
337 const wchar_t *src, size_t srcLen) const
338{
339 if ( !m_conv )
340 {
341 // default to UTF-8 for the multibyte output
5c33522f 342 const_cast<wxConvAuto *>(this)->InitWithUTF8();
830f8f11
VZ
343 }
344
345 return m_conv->FromWChar(dst, dstLen, src, srcLen);
346}