]> git.saurik.com Git - wxWidgets.git/blame - src/common/convauto.cpp
Allow retrieving the descent and external leading of empty strings.
[wxWidgets.git] / src / common / convauto.cpp
CommitLineData
830f8f11
VZ
1///////////////////////////////////////////////////////////////////////////////
2// Name: src/common/convauto.cpp
3// Purpose: implementation of wxConvAuto
4// Author: Vadim Zeitlin
5// Created: 2006-04-04
6// RCS-ID: $Id$
7// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8// Licence: wxWindows licence
9///////////////////////////////////////////////////////////////////////////////
10
11// ============================================================================
12// declarations
13// ============================================================================
14
15// ----------------------------------------------------------------------------
16// headers
17// ----------------------------------------------------------------------------
18
19// for compilers that support precompilation, includes "wx.h".
20#include "wx/wxprec.h"
21
22#ifdef __BORLANDC__
23 #pragma hdrstop
24#endif
25
830f8f11
VZ
26#include "wx/convauto.h"
27
01a9232b
VZ
28// we use latin1 by default as it seems the least bad choice: the files we need
29// to detect input of don't always come from the user system (they are often
30// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
31// seem to be a good idea and there is no other reasonable alternative
32wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
33
64b91e2d
VZ
namespace
{

// Raw byte sequences of the byte order marks we know about. They are plain
// char arrays and not string literals on purpose: they must not be
// NUL-terminated, as their size is taken with WXSIZEOF() and two of them
// contain embedded NUL bytes.

// UTF-32, big-endian: 00 00 FE FF
const char BOM_UTF32BE[4] = { '\x00', '\x00', '\xFE', '\xFF' };

// UTF-32, little-endian: FF FE 00 00
const char BOM_UTF32LE[4] = { '\xFF', '\xFE', '\x00', '\x00' };

// UTF-16, big-endian: FE FF
const char BOM_UTF16BE[2] = { '\xFE', '\xFF' };

// UTF-16, little-endian: FF FE
const char BOM_UTF16LE[2] = { '\xFF', '\xFE' };

// UTF-8: EF BB BF
const char BOM_UTF8[3]    = { '\xEF', '\xBB', '\xBF' };

} // anonymous namespace
44
830f8f11
VZ
45// ============================================================================
46// implementation
47// ============================================================================
48
01a9232b
VZ
49/* static */
50void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
51{
52 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
9a83f860 53 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
01a9232b
VZ
54
55 ms_defaultMBEncoding = enc;
56}
57
64b91e2d
VZ
58/* static */
59const char* wxConvAuto::GetBOMChars(wxBOM bom, size_t* count)
60{
61 wxCHECK_MSG( count , NULL, wxS("count pointer must be provided") );
62
63 switch ( bom )
64 {
65 case wxBOM_UTF32BE: *count = WXSIZEOF(BOM_UTF32BE); return BOM_UTF32BE;
66 case wxBOM_UTF32LE: *count = WXSIZEOF(BOM_UTF32LE); return BOM_UTF32LE;
67 case wxBOM_UTF16BE: *count = WXSIZEOF(BOM_UTF16BE); return BOM_UTF16BE;
68 case wxBOM_UTF16LE: *count = WXSIZEOF(BOM_UTF16LE); return BOM_UTF16LE;
69 case wxBOM_UTF8 : *count = WXSIZEOF(BOM_UTF8 ); return BOM_UTF8;
70 case wxBOM_Unknown:
71 case wxBOM_None:
72 wxFAIL_MSG( wxS("Invalid BOM type") );
73 return NULL;
74 }
75
76 wxFAIL_MSG( wxS("Unknown BOM type") );
77 return NULL;
78}
79
830f8f11 80/* static */
038809c2 81wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
830f8f11 82{
830f8f11
VZ
83 // examine the buffer for BOM presence
84 //
4cb0e8d0
VZ
85 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
86 //
87 // Bytes Encoding Form
88 //
89 // 00 00 FE FF UTF-32, big-endian
90 // FF FE 00 00 UTF-32, little-endian
91 // FE FF UTF-16, big-endian
92 // FF FE UTF-16, little-endian
93 // EF BB BF UTF-8
94 //
95 // as some BOMs are prefixes of other ones we may need to read more bytes
96 // to disambiguate them
97
98 switch ( srcLen )
830f8f11 99 {
4cb0e8d0 100 case 0:
038809c2 101 return wxBOM_Unknown;
4cb0e8d0
VZ
102
103 case 1:
104 if ( src[0] == '\x00' || src[0] == '\xFF' ||
105 src[0] == '\xFE' || src[0] == '\xEF')
830f8f11 106 {
4cb0e8d0 107 // this could be a BOM but we don't know yet
038809c2 108 return wxBOM_Unknown;
830f8f11
VZ
109 }
110 break;
111
4cb0e8d0
VZ
112 case 2:
113 case 3:
114 if ( src[0] == '\xEF' && src[1] == '\xBB' )
830f8f11 115 {
4cb0e8d0 116 if ( srcLen == 3 )
038809c2 117 return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
4cb0e8d0 118
038809c2 119 return wxBOM_Unknown;
830f8f11 120 }
830f8f11 121
4cb0e8d0 122 if ( src[0] == '\xFE' && src[1] == '\xFF' )
038809c2 123 return wxBOM_UTF16BE;
4cb0e8d0
VZ
124
125 if ( src[0] == '\xFF' && src[1] == '\xFE' )
830f8f11 126 {
4cb0e8d0
VZ
127 // if the next byte is 0, it could be an UTF-32LE BOM but if it
128 // isn't we can be sure it's UTF-16LE
129 if ( srcLen == 3 && src[2] != '\x00' )
038809c2 130 return wxBOM_UTF16LE;
4cb0e8d0 131
038809c2 132 return wxBOM_Unknown;
830f8f11 133 }
830f8f11 134
4cb0e8d0 135 if ( src[0] == '\x00' && src[1] == '\x00' )
830f8f11 136 {
823e82e2
VZ
137 // this could only be UTF-32BE, check that the data we have so
138 // far allows for it
139 if ( srcLen == 3 && src[2] != '\xFE' )
038809c2 140 return wxBOM_None;
4cb0e8d0 141
038809c2 142 return wxBOM_Unknown;
823e82e2 143 }
830f8f11 144 break;
4cb0e8d0
VZ
145
146 default:
147 // we have at least 4 characters so we may finally decide whether
148 // we have a BOM or not
149 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
038809c2 150 return wxBOM_UTF8;
4cb0e8d0
VZ
151
152 if ( src[0] == '\x00' && src[1] == '\x00' &&
153 src[2] == '\xFE' && src[3] == '\xFF' )
038809c2 154 return wxBOM_UTF32BE;
4cb0e8d0
VZ
155
156 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
157 src[2] == '\x00' && src[3] == '\x00' )
038809c2 158 return wxBOM_UTF32LE;
4cb0e8d0
VZ
159
160 if ( src[0] == '\xFE' && src[1] == '\xFF' )
038809c2 161 return wxBOM_UTF16BE;
4cb0e8d0
VZ
162
163 if ( src[0] == '\xFF' && src[1] == '\xFE' )
038809c2 164 return wxBOM_UTF16LE;
830f8f11
VZ
165 }
166
038809c2 167 return wxBOM_None;
830f8f11
VZ
168}
169
038809c2 170void wxConvAuto::InitFromBOM(wxBOM bomType)
830f8f11
VZ
171{
172 m_consumedBOM = false;
173
174 switch ( bomType )
175 {
038809c2 176 case wxBOM_Unknown:
4cb0e8d0
VZ
177 wxFAIL_MSG( "shouldn't be called for this BOM type" );
178 break;
179
038809c2 180 case wxBOM_None:
4cb0e8d0
VZ
181 // use the default
182 break;
183
038809c2 184 case wxBOM_UTF32BE:
830f8f11
VZ
185 m_conv = new wxMBConvUTF32BE;
186 m_ownsConv = true;
187 break;
188
038809c2 189 case wxBOM_UTF32LE:
830f8f11
VZ
190 m_conv = new wxMBConvUTF32LE;
191 m_ownsConv = true;
192 break;
193
038809c2 194 case wxBOM_UTF16BE:
830f8f11
VZ
195 m_conv = new wxMBConvUTF16BE;
196 m_ownsConv = true;
197 break;
198
038809c2 199 case wxBOM_UTF16LE:
830f8f11
VZ
200 m_conv = new wxMBConvUTF16LE;
201 m_ownsConv = true;
202 break;
203
038809c2 204 case wxBOM_UTF8:
01a9232b 205 InitWithUTF8();
830f8f11
VZ
206 break;
207
208 default:
4cb0e8d0
VZ
209 wxFAIL_MSG( "unknown BOM type" );
210 }
830f8f11 211
4cb0e8d0
VZ
212 if ( !m_conv )
213 {
214 // we end up here if there is no BOM or we didn't recognize it somehow
215 // (this shouldn't happen but still don't crash if it does), so use the
216 // default encoding
217 InitWithUTF8();
218 m_consumedBOM = true; // as there is nothing to consume
830f8f11
VZ
219 }
220}
221
222void wxConvAuto::SkipBOM(const char **src, size_t *len) const
223{
224 int ofs;
225 switch ( m_bomType )
226 {
038809c2 227 case wxBOM_Unknown:
4cb0e8d0
VZ
228 wxFAIL_MSG( "shouldn't be called for this BOM type" );
229 return;
230
038809c2 231 case wxBOM_None:
4cb0e8d0
VZ
232 ofs = 0;
233 break;
234
038809c2
VZ
235 case wxBOM_UTF32BE:
236 case wxBOM_UTF32LE:
830f8f11
VZ
237 ofs = 4;
238 break;
239
038809c2
VZ
240 case wxBOM_UTF16BE:
241 case wxBOM_UTF16LE:
830f8f11
VZ
242 ofs = 2;
243 break;
244
038809c2 245 case wxBOM_UTF8:
830f8f11
VZ
246 ofs = 3;
247 break;
248
249 default:
4cb0e8d0
VZ
250 wxFAIL_MSG( "unknown BOM type" );
251 return;
830f8f11
VZ
252 }
253
254 *src += ofs;
255 if ( *len != (size_t)-1 )
256 *len -= ofs;
257}
258
4ca97396 259bool wxConvAuto::InitFromInput(const char *src, size_t len)
830f8f11 260{
9334ad17 261 m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
038809c2 262 if ( m_bomType == wxBOM_Unknown )
4cb0e8d0
VZ
263 return false;
264
830f8f11 265 InitFromBOM(m_bomType);
4cb0e8d0
VZ
266
267 return true;
830f8f11
VZ
268}
269
270size_t
271wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
272 const char *src, size_t srcLen) const
273{
274 // we check BOM and create the appropriate conversion the first time we're
275 // called but we also need to ensure that the BOM is skipped not only
276 // during this initial call but also during the first call with non-NULL
277 // dst as typically we're first called with NULL dst to calculate the
278 // needed buffer size
5c33522f 279 wxConvAuto *self = const_cast<wxConvAuto *>(this);
4cb0e8d0
VZ
280
281
830f8f11
VZ
282 if ( !m_conv )
283 {
4ca97396 284 if ( !self->InitFromInput(src, srcLen) )
4cb0e8d0
VZ
285 {
286 // there is not enough data to determine whether we have a BOM or
287 // not, so fail for now -- the caller is supposed to call us again
288 // with more data
289 return wxCONV_FAILED;
290 }
830f8f11 291 }
4ca97396
VZ
292
293 if ( !m_consumedBOM )
830f8f11 294 {
830f8f11 295 SkipBOM(&src, &srcLen);
4ca97396
VZ
296 if ( srcLen == 0 )
297 {
298 // there is nothing left except the BOM so we'd return 0 below but
299 // this is unexpected: decoding a non-empty string must either fail
300 // or return something non-empty, in particular this would break
301 // the code in wxTextInputStream::NextChar()
302 //
303 // so still return an error as we need some more data to be able to
304 // decode it
305 return wxCONV_FAILED;
306 }
830f8f11
VZ
307 }
308
01a9232b
VZ
309 // try to convert using the auto-detected encoding
310 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
038809c2 311 if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
01a9232b
VZ
312 {
313 // if the conversion failed but we didn't really detect anything and
314 // simply tried UTF-8 by default, retry it using the fall-back
315 if ( m_encDefault != wxFONTENCODING_MAX )
316 {
317 if ( m_ownsConv )
318 delete m_conv;
319
320 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
321 ? GetFallbackEncoding()
322 : m_encDefault);
323 self->m_ownsConv = true;
324
325 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
326 }
327 }
328
4ca97396
VZ
329 // don't skip the BOM again the next time if we really consumed it
330 if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
4cb0e8d0 331 self->m_consumedBOM = true;
4ca97396 332
01a9232b 333 return rc;
830f8f11
VZ
334}
335
336size_t
337wxConvAuto::FromWChar(char *dst, size_t dstLen,
338 const wchar_t *src, size_t srcLen) const
339{
340 if ( !m_conv )
341 {
342 // default to UTF-8 for the multibyte output
5c33522f 343 const_cast<wxConvAuto *>(this)->InitWithUTF8();
830f8f11
VZ
344 }
345
346 return m_conv->FromWChar(dst, dstLen, src, srcLen);
347}