]> git.saurik.com Git - wxWidgets.git/blob - src/common/convauto.cpp
8d8c24c0a3e21ee2199f5b1a0b92d2d7c25869fb
[wxWidgets.git] / src / common / convauto.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/convauto.cpp
3 // Purpose: implementation of wxConvAuto
4 // Author: Vadim Zeitlin
5 // Created: 2006-04-04
6 // RCS-ID: $Id$
7 // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // ============================================================================
12 // declarations
13 // ============================================================================
14
15 // ----------------------------------------------------------------------------
16 // headers
17 // ----------------------------------------------------------------------------
18
19 // for compilers that support precompilation, includes "wx.h".
20 #include "wx/wxprec.h"
21
22 #ifdef __BORLANDC__
23 #pragma hdrstop
24 #endif
25
26 #ifndef WX_PRECOMP
27 #include "wx/wx.h"
28 #endif //WX_PRECOMP
29
30 #include "wx/convauto.h"
31
32 // we use latin1 by default as it seems the least bad choice: the files whose
33 // encoding we need to detect don't always come from the user's system (they
34 // are often received from other machines), so wxFONTENCODING_SYSTEM is not a
35 // good default either, and there is no other reasonable alternative
36 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
37
38 // ============================================================================
39 // implementation
40 // ============================================================================
41
42 /* static */
43 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
44 {
45 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
46 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
47
48 ms_defaultMBEncoding = enc;
49 }
50
51 /* static */
52 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
53 {
54 // examine the buffer for BOM presence
55 //
56 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
57 //
58 // Bytes Encoding Form
59 //
60 // 00 00 FE FF UTF-32, big-endian
61 // FF FE 00 00 UTF-32, little-endian
62 // FE FF UTF-16, big-endian
63 // FF FE UTF-16, little-endian
64 // EF BB BF UTF-8
65 //
66 // as some BOMs are prefixes of other ones we may need to read more bytes
67 // to disambiguate them
68
69 switch ( srcLen )
70 {
71 case 0:
72 return BOM_Unknown;
73
74 case 1:
75 if ( src[0] == '\x00' || src[0] == '\xFF' ||
76 src[0] == '\xFE' || src[0] == '\xEF')
77 {
78 // this could be a BOM but we don't know yet
79 return BOM_Unknown;
80 }
81 break;
82
83 case 2:
84 case 3:
85 if ( src[0] == '\xEF' && src[1] == '\xBB' )
86 {
87 if ( srcLen == 3 )
88 return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
89
90 return BOM_Unknown;
91 }
92
93 if ( src[0] == '\xFE' && src[1] == '\xFF' )
94 return BOM_UTF16BE;
95
96 if ( src[0] == '\xFF' && src[1] == '\xFE' )
97 {
98 // if the next byte is 0, it could be an UTF-32LE BOM but if it
99 // isn't we can be sure it's UTF-16LE
100 if ( srcLen == 3 && src[2] != '\x00' )
101 return BOM_UTF16LE;
102
103 return BOM_Unknown;
104 }
105
106 if ( src[0] == '\x00' && src[1] == '\x00' )
107 {
108 // this could only be UTF-32BE, check that the data we have so
109 // far allows for it
110 if ( srcLen == 3 && src[2] != '\xFE' )
111 return BOM_None;
112
113 return BOM_Unknown;
114 }
115 break;
116
117 default:
118 // we have at least 4 characters so we may finally decide whether
119 // we have a BOM or not
120 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
121 return BOM_UTF8;
122
123 if ( src[0] == '\x00' && src[1] == '\x00' &&
124 src[2] == '\xFE' && src[3] == '\xFF' )
125 return BOM_UTF32BE;
126
127 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
128 src[2] == '\x00' && src[3] == '\x00' )
129 return BOM_UTF32LE;
130
131 if ( src[0] == '\xFE' && src[1] == '\xFF' )
132 return BOM_UTF16BE;
133
134 if ( src[0] == '\xFF' && src[1] == '\xFE' )
135 return BOM_UTF16LE;
136 }
137
138 return BOM_None;
139 }
140
141 void wxConvAuto::InitFromBOM(BOMType bomType)
142 {
143 m_consumedBOM = false;
144
145 switch ( bomType )
146 {
147 case BOM_Unknown:
148 wxFAIL_MSG( "shouldn't be called for this BOM type" );
149 break;
150
151 case BOM_None:
152 // use the default
153 break;
154
155 case BOM_UTF32BE:
156 m_conv = new wxMBConvUTF32BE;
157 m_ownsConv = true;
158 break;
159
160 case BOM_UTF32LE:
161 m_conv = new wxMBConvUTF32LE;
162 m_ownsConv = true;
163 break;
164
165 case BOM_UTF16BE:
166 m_conv = new wxMBConvUTF16BE;
167 m_ownsConv = true;
168 break;
169
170 case BOM_UTF16LE:
171 m_conv = new wxMBConvUTF16LE;
172 m_ownsConv = true;
173 break;
174
175 case BOM_UTF8:
176 InitWithUTF8();
177 break;
178
179 default:
180 wxFAIL_MSG( "unknown BOM type" );
181 }
182
183 if ( !m_conv )
184 {
185 // we end up here if there is no BOM or we didn't recognize it somehow
186 // (this shouldn't happen but still don't crash if it does), so use the
187 // default encoding
188 InitWithUTF8();
189 m_consumedBOM = true; // as there is nothing to consume
190 }
191 }
192
193 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
194 {
195 int ofs;
196 switch ( m_bomType )
197 {
198 case BOM_Unknown:
199 wxFAIL_MSG( "shouldn't be called for this BOM type" );
200 return;
201
202 case BOM_None:
203 ofs = 0;
204 break;
205
206 case BOM_UTF32BE:
207 case BOM_UTF32LE:
208 ofs = 4;
209 break;
210
211 case BOM_UTF16BE:
212 case BOM_UTF16LE:
213 ofs = 2;
214 break;
215
216 case BOM_UTF8:
217 ofs = 3;
218 break;
219
220 default:
221 wxFAIL_MSG( "unknown BOM type" );
222 return;
223 }
224
225 *src += ofs;
226 if ( *len != (size_t)-1 )
227 *len -= ofs;
228 }
229
230 bool wxConvAuto::InitFromInput(const char *src, size_t len)
231 {
232 m_bomType = DetectBOM(src, len);
233 if ( m_bomType == BOM_Unknown )
234 return false;
235
236 InitFromBOM(m_bomType);
237
238 return true;
239 }
240
241 size_t
242 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
243 const char *src, size_t srcLen) const
244 {
245 // we check BOM and create the appropriate conversion the first time we're
246 // called but we also need to ensure that the BOM is skipped not only
247 // during this initial call but also during the first call with non-NULL
248 // dst as typically we're first called with NULL dst to calculate the
249 // needed buffer size
250 wxConvAuto *self = const_cast<wxConvAuto *>(this);
251
252
253 if ( !m_conv )
254 {
255 if ( !self->InitFromInput(src, srcLen) )
256 {
257 // there is not enough data to determine whether we have a BOM or
258 // not, so fail for now -- the caller is supposed to call us again
259 // with more data
260 return wxCONV_FAILED;
261 }
262 }
263
264 if ( !m_consumedBOM )
265 {
266 SkipBOM(&src, &srcLen);
267 if ( srcLen == 0 )
268 {
269 // there is nothing left except the BOM so we'd return 0 below but
270 // this is unexpected: decoding a non-empty string must either fail
271 // or return something non-empty, in particular this would break
272 // the code in wxTextInputStream::NextChar()
273 //
274 // so still return an error as we need some more data to be able to
275 // decode it
276 return wxCONV_FAILED;
277 }
278 }
279
280 // try to convert using the auto-detected encoding
281 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
282 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
283 {
284 // if the conversion failed but we didn't really detect anything and
285 // simply tried UTF-8 by default, retry it using the fall-back
286 if ( m_encDefault != wxFONTENCODING_MAX )
287 {
288 if ( m_ownsConv )
289 delete m_conv;
290
291 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
292 ? GetFallbackEncoding()
293 : m_encDefault);
294 self->m_ownsConv = true;
295
296 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
297 }
298 }
299
300 // don't skip the BOM again the next time if we really consumed it
301 if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
302 self->m_consumedBOM = true;
303
304 return rc;
305 }
306
307 size_t
308 wxConvAuto::FromWChar(char *dst, size_t dstLen,
309 const wchar_t *src, size_t srcLen) const
310 {
311 if ( !m_conv )
312 {
313 // default to UTF-8 for the multibyte output
314 const_cast<wxConvAuto *>(this)->InitWithUTF8();
315 }
316
317 return m_conv->FromWChar(dst, dstLen, src, srcLen);
318 }