]> git.saurik.com Git - wxWidgets.git/blame - src/common/convauto.cpp
Fix wxFileSystem::FileNameToURL() for Unicode file names.
[wxWidgets.git] / src / common / convauto.cpp
CommitLineData
830f8f11
VZ
1///////////////////////////////////////////////////////////////////////////////
2// Name: src/common/convauto.cpp
3// Purpose: implementation of wxConvAuto
4// Author: Vadim Zeitlin
5// Created: 2006-04-04
6// RCS-ID: $Id$
7// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8// Licence: wxWindows licence
9///////////////////////////////////////////////////////////////////////////////
10
11// ============================================================================
12// declarations
13// ============================================================================
14
15// ----------------------------------------------------------------------------
16// headers
17// ----------------------------------------------------------------------------
18
19// for compilers that support precompilation, includes "wx.h".
20#include "wx/wxprec.h"
21
22#ifdef __BORLANDC__
23 #pragma hdrstop
24#endif
25
26#if wxUSE_WCHAR_T
27
28#ifndef WX_PRECOMP
4cb0e8d0 29 #include "wx/wx.h"
830f8f11
VZ
30#endif //WX_PRECOMP
31
32#include "wx/convauto.h"
33
01a9232b
VZ
34// we use latin1 by default as it seems the least bad choice: the files we need
35// to detect input of don't always come from the user system (they are often
36// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
37// seem to be a good idea and there is no other reasonable alternative
38wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
39
830f8f11
VZ
40// ============================================================================
41// implementation
42// ============================================================================
43
01a9232b
VZ
44/* static */
45void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
46{
47 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
9a83f860 48 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
01a9232b
VZ
49
50 ms_defaultMBEncoding = enc;
51}
52
830f8f11
VZ
53/* static */
54wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
55{
830f8f11
VZ
56 // examine the buffer for BOM presence
57 //
4cb0e8d0
VZ
58 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
59 //
60 // Bytes Encoding Form
61 //
62 // 00 00 FE FF UTF-32, big-endian
63 // FF FE 00 00 UTF-32, little-endian
64 // FE FF UTF-16, big-endian
65 // FF FE UTF-16, little-endian
66 // EF BB BF UTF-8
67 //
68 // as some BOMs are prefixes of other ones we may need to read more bytes
69 // to disambiguate them
70
71 switch ( srcLen )
830f8f11 72 {
4cb0e8d0
VZ
73 case 0:
74 return BOM_Unknown;
75
76 case 1:
77 if ( src[0] == '\x00' || src[0] == '\xFF' ||
78 src[0] == '\xFE' || src[0] == '\xEF')
830f8f11 79 {
4cb0e8d0
VZ
80 // this could be a BOM but we don't know yet
81 return BOM_Unknown;
830f8f11
VZ
82 }
83 break;
84
4cb0e8d0
VZ
85 case 2:
86 case 3:
87 if ( src[0] == '\xEF' && src[1] == '\xBB' )
830f8f11 88 {
4cb0e8d0
VZ
89 if ( srcLen == 3 )
90 return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
91
92 return BOM_Unknown;
830f8f11 93 }
830f8f11 94
4cb0e8d0
VZ
95 if ( src[0] == '\xFE' && src[1] == '\xFF' )
96 return BOM_UTF16BE;
97
98 if ( src[0] == '\xFF' && src[1] == '\xFE' )
830f8f11 99 {
4cb0e8d0
VZ
100 // if the next byte is 0, it could be an UTF-32LE BOM but if it
101 // isn't we can be sure it's UTF-16LE
102 if ( srcLen == 3 && src[2] != '\x00' )
103 return BOM_UTF16LE;
104
105 return BOM_Unknown;
830f8f11 106 }
830f8f11 107
4cb0e8d0 108 if ( src[0] == '\x00' && src[1] == '\x00' )
830f8f11 109 {
823e82e2
VZ
110 // this could only be UTF-32BE, check that the data we have so
111 // far allows for it
112 if ( srcLen == 3 && src[2] != '\xFE' )
113 return BOM_None;
4cb0e8d0 114
823e82e2
VZ
115 return BOM_Unknown;
116 }
830f8f11 117 break;
4cb0e8d0
VZ
118
119 default:
120 // we have at least 4 characters so we may finally decide whether
121 // we have a BOM or not
122 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
123 return BOM_UTF8;
124
125 if ( src[0] == '\x00' && src[1] == '\x00' &&
126 src[2] == '\xFE' && src[3] == '\xFF' )
127 return BOM_UTF32BE;
128
129 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
130 src[2] == '\x00' && src[3] == '\x00' )
131 return BOM_UTF32LE;
132
133 if ( src[0] == '\xFE' && src[1] == '\xFF' )
134 return BOM_UTF16BE;
135
136 if ( src[0] == '\xFF' && src[1] == '\xFE' )
137 return BOM_UTF16LE;
830f8f11
VZ
138 }
139
140 return BOM_None;
141}
142
143void wxConvAuto::InitFromBOM(BOMType bomType)
144{
145 m_consumedBOM = false;
146
147 switch ( bomType )
148 {
4cb0e8d0
VZ
149 case BOM_Unknown:
150 wxFAIL_MSG( "shouldn't be called for this BOM type" );
151 break;
152
153 case BOM_None:
154 // use the default
155 break;
156
830f8f11
VZ
157 case BOM_UTF32BE:
158 m_conv = new wxMBConvUTF32BE;
159 m_ownsConv = true;
160 break;
161
162 case BOM_UTF32LE:
163 m_conv = new wxMBConvUTF32LE;
164 m_ownsConv = true;
165 break;
166
167 case BOM_UTF16BE:
168 m_conv = new wxMBConvUTF16BE;
169 m_ownsConv = true;
170 break;
171
172 case BOM_UTF16LE:
173 m_conv = new wxMBConvUTF16LE;
174 m_ownsConv = true;
175 break;
176
177 case BOM_UTF8:
01a9232b 178 InitWithUTF8();
830f8f11
VZ
179 break;
180
181 default:
4cb0e8d0
VZ
182 wxFAIL_MSG( "unknown BOM type" );
183 }
830f8f11 184
4cb0e8d0
VZ
185 if ( !m_conv )
186 {
187 // we end up here if there is no BOM or we didn't recognize it somehow
188 // (this shouldn't happen but still don't crash if it does), so use the
189 // default encoding
190 InitWithUTF8();
191 m_consumedBOM = true; // as there is nothing to consume
830f8f11
VZ
192 }
193}
194
195void wxConvAuto::SkipBOM(const char **src, size_t *len) const
196{
197 int ofs;
198 switch ( m_bomType )
199 {
4cb0e8d0
VZ
200 case BOM_Unknown:
201 wxFAIL_MSG( "shouldn't be called for this BOM type" );
202 return;
203
204 case BOM_None:
205 ofs = 0;
206 break;
207
830f8f11
VZ
208 case BOM_UTF32BE:
209 case BOM_UTF32LE:
210 ofs = 4;
211 break;
212
213 case BOM_UTF16BE:
214 case BOM_UTF16LE:
215 ofs = 2;
216 break;
217
218 case BOM_UTF8:
219 ofs = 3;
220 break;
221
222 default:
4cb0e8d0
VZ
223 wxFAIL_MSG( "unknown BOM type" );
224 return;
830f8f11
VZ
225 }
226
227 *src += ofs;
228 if ( *len != (size_t)-1 )
229 *len -= ofs;
230}
231
4ca97396 232bool wxConvAuto::InitFromInput(const char *src, size_t len)
830f8f11 233{
4ca97396 234 m_bomType = DetectBOM(src, len);
4cb0e8d0
VZ
235 if ( m_bomType == BOM_Unknown )
236 return false;
237
830f8f11 238 InitFromBOM(m_bomType);
4cb0e8d0
VZ
239
240 return true;
830f8f11
VZ
241}
242
243size_t
244wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
245 const char *src, size_t srcLen) const
246{
247 // we check BOM and create the appropriate conversion the first time we're
248 // called but we also need to ensure that the BOM is skipped not only
249 // during this initial call but also during the first call with non-NULL
250 // dst as typically we're first called with NULL dst to calculate the
251 // needed buffer size
5c33522f 252 wxConvAuto *self = const_cast<wxConvAuto *>(this);
4cb0e8d0
VZ
253
254
830f8f11
VZ
255 if ( !m_conv )
256 {
4ca97396 257 if ( !self->InitFromInput(src, srcLen) )
4cb0e8d0
VZ
258 {
259 // there is not enough data to determine whether we have a BOM or
260 // not, so fail for now -- the caller is supposed to call us again
261 // with more data
262 return wxCONV_FAILED;
263 }
830f8f11 264 }
4ca97396
VZ
265
266 if ( !m_consumedBOM )
830f8f11 267 {
830f8f11 268 SkipBOM(&src, &srcLen);
4ca97396
VZ
269 if ( srcLen == 0 )
270 {
271 // there is nothing left except the BOM so we'd return 0 below but
272 // this is unexpected: decoding a non-empty string must either fail
273 // or return something non-empty, in particular this would break
274 // the code in wxTextInputStream::NextChar()
275 //
276 // so still return an error as we need some more data to be able to
277 // decode it
278 return wxCONV_FAILED;
279 }
830f8f11
VZ
280 }
281
01a9232b
VZ
282 // try to convert using the auto-detected encoding
283 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
284 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
285 {
286 // if the conversion failed but we didn't really detect anything and
287 // simply tried UTF-8 by default, retry it using the fall-back
288 if ( m_encDefault != wxFONTENCODING_MAX )
289 {
290 if ( m_ownsConv )
291 delete m_conv;
292
293 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
294 ? GetFallbackEncoding()
295 : m_encDefault);
296 self->m_ownsConv = true;
297
298 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
299 }
300 }
301
4ca97396
VZ
302 // don't skip the BOM again the next time if we really consumed it
303 if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
4cb0e8d0 304 self->m_consumedBOM = true;
4ca97396 305
01a9232b 306 return rc;
830f8f11
VZ
307}
308
309size_t
310wxConvAuto::FromWChar(char *dst, size_t dstLen,
311 const wchar_t *src, size_t srcLen) const
312{
313 if ( !m_conv )
314 {
315 // default to UTF-8 for the multibyte output
5c33522f 316 const_cast<wxConvAuto *>(this)->InitWithUTF8();
830f8f11
VZ
317 }
318
319 return m_conv->FromWChar(dst, dstLen, src, srcLen);
320}
321
322#endif // wxUSE_WCHAR_T