]> git.saurik.com Git - wxWidgets.git/blob - src/common/convauto.cpp
XML import corrections
[wxWidgets.git] / src / common / convauto.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/convauto.cpp
3 // Purpose: implementation of wxConvAuto
4 // Author: Vadim Zeitlin
5 // Created: 2006-04-04
6 // RCS-ID: $Id$
7 // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // ============================================================================
12 // declarations
13 // ============================================================================
14
15 // ----------------------------------------------------------------------------
16 // headers
17 // ----------------------------------------------------------------------------
18
19 // for compilers that support precompilation, includes "wx.h".
20 #include "wx/wxprec.h"
21
22 #ifdef __BORLANDC__
23 #pragma hdrstop
24 #endif
25
26 #include "wx/convauto.h"
27
28 // Latin-1 is used as the default fallback as it seems the least bad choice: the
29 // files whose encoding we detect don't always come from the user's system (they
30 // are often received from other machines), so wxFONTENCODING_SYSTEM is not a good
31 // idea and there is no other reasonable alternative; see SetFallbackEncoding()
32 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
33
34 // ============================================================================
35 // implementation
36 // ============================================================================
37
38 /* static */
39 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
40 {
41 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
42 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
43
44 ms_defaultMBEncoding = enc;
45 }
46
47 /* static */
48 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
49 {
50 // examine the buffer for BOM presence
51 //
52 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
53 //
54 // Bytes Encoding Form
55 //
56 // 00 00 FE FF UTF-32, big-endian
57 // FF FE 00 00 UTF-32, little-endian
58 // FE FF UTF-16, big-endian
59 // FF FE UTF-16, little-endian
60 // EF BB BF UTF-8
61 //
62 // as some BOMs are prefixes of other ones we may need to read more bytes
63 // to disambiguate them
64
65 switch ( srcLen )
66 {
67 case 0:
68 return BOM_Unknown;
69
70 case 1:
71 if ( src[0] == '\x00' || src[0] == '\xFF' ||
72 src[0] == '\xFE' || src[0] == '\xEF')
73 {
74 // this could be a BOM but we don't know yet
75 return BOM_Unknown;
76 }
77 break;
78
79 case 2:
80 case 3:
81 if ( src[0] == '\xEF' && src[1] == '\xBB' )
82 {
83 if ( srcLen == 3 )
84 return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
85
86 return BOM_Unknown;
87 }
88
89 if ( src[0] == '\xFE' && src[1] == '\xFF' )
90 return BOM_UTF16BE;
91
92 if ( src[0] == '\xFF' && src[1] == '\xFE' )
93 {
94 // if the next byte is 0, it could be an UTF-32LE BOM but if it
95 // isn't we can be sure it's UTF-16LE
96 if ( srcLen == 3 && src[2] != '\x00' )
97 return BOM_UTF16LE;
98
99 return BOM_Unknown;
100 }
101
102 if ( src[0] == '\x00' && src[1] == '\x00' )
103 {
104 // this could only be UTF-32BE, check that the data we have so
105 // far allows for it
106 if ( srcLen == 3 && src[2] != '\xFE' )
107 return BOM_None;
108
109 return BOM_Unknown;
110 }
111 break;
112
113 default:
114 // we have at least 4 characters so we may finally decide whether
115 // we have a BOM or not
116 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
117 return BOM_UTF8;
118
119 if ( src[0] == '\x00' && src[1] == '\x00' &&
120 src[2] == '\xFE' && src[3] == '\xFF' )
121 return BOM_UTF32BE;
122
123 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
124 src[2] == '\x00' && src[3] == '\x00' )
125 return BOM_UTF32LE;
126
127 if ( src[0] == '\xFE' && src[1] == '\xFF' )
128 return BOM_UTF16BE;
129
130 if ( src[0] == '\xFF' && src[1] == '\xFE' )
131 return BOM_UTF16LE;
132 }
133
134 return BOM_None;
135 }
136
137 void wxConvAuto::InitFromBOM(BOMType bomType)
138 {
139 m_consumedBOM = false;
140
141 switch ( bomType )
142 {
143 case BOM_Unknown:
144 wxFAIL_MSG( "shouldn't be called for this BOM type" );
145 break;
146
147 case BOM_None:
148 // use the default
149 break;
150
151 case BOM_UTF32BE:
152 m_conv = new wxMBConvUTF32BE;
153 m_ownsConv = true;
154 break;
155
156 case BOM_UTF32LE:
157 m_conv = new wxMBConvUTF32LE;
158 m_ownsConv = true;
159 break;
160
161 case BOM_UTF16BE:
162 m_conv = new wxMBConvUTF16BE;
163 m_ownsConv = true;
164 break;
165
166 case BOM_UTF16LE:
167 m_conv = new wxMBConvUTF16LE;
168 m_ownsConv = true;
169 break;
170
171 case BOM_UTF8:
172 InitWithUTF8();
173 break;
174
175 default:
176 wxFAIL_MSG( "unknown BOM type" );
177 }
178
179 if ( !m_conv )
180 {
181 // we end up here if there is no BOM or we didn't recognize it somehow
182 // (this shouldn't happen but still don't crash if it does), so use the
183 // default encoding
184 InitWithUTF8();
185 m_consumedBOM = true; // as there is nothing to consume
186 }
187 }
188
189 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
190 {
191 int ofs;
192 switch ( m_bomType )
193 {
194 case BOM_Unknown:
195 wxFAIL_MSG( "shouldn't be called for this BOM type" );
196 return;
197
198 case BOM_None:
199 ofs = 0;
200 break;
201
202 case BOM_UTF32BE:
203 case BOM_UTF32LE:
204 ofs = 4;
205 break;
206
207 case BOM_UTF16BE:
208 case BOM_UTF16LE:
209 ofs = 2;
210 break;
211
212 case BOM_UTF8:
213 ofs = 3;
214 break;
215
216 default:
217 wxFAIL_MSG( "unknown BOM type" );
218 return;
219 }
220
221 *src += ofs;
222 if ( *len != (size_t)-1 )
223 *len -= ofs;
224 }
225
226 bool wxConvAuto::InitFromInput(const char *src, size_t len)
227 {
228 m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
229 if ( m_bomType == BOM_Unknown )
230 return false;
231
232 InitFromBOM(m_bomType);
233
234 return true;
235 }
236
237 size_t
238 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
239 const char *src, size_t srcLen) const
240 {
241 // we check BOM and create the appropriate conversion the first time we're
242 // called but we also need to ensure that the BOM is skipped not only
243 // during this initial call but also during the first call with non-NULL
244 // dst as typically we're first called with NULL dst to calculate the
245 // needed buffer size
246 wxConvAuto *self = const_cast<wxConvAuto *>(this);
247
248
249 if ( !m_conv )
250 {
251 if ( !self->InitFromInput(src, srcLen) )
252 {
253 // there is not enough data to determine whether we have a BOM or
254 // not, so fail for now -- the caller is supposed to call us again
255 // with more data
256 return wxCONV_FAILED;
257 }
258 }
259
260 if ( !m_consumedBOM )
261 {
262 SkipBOM(&src, &srcLen);
263 if ( srcLen == 0 )
264 {
265 // there is nothing left except the BOM so we'd return 0 below but
266 // this is unexpected: decoding a non-empty string must either fail
267 // or return something non-empty, in particular this would break
268 // the code in wxTextInputStream::NextChar()
269 //
270 // so still return an error as we need some more data to be able to
271 // decode it
272 return wxCONV_FAILED;
273 }
274 }
275
276 // try to convert using the auto-detected encoding
277 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
278 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
279 {
280 // if the conversion failed but we didn't really detect anything and
281 // simply tried UTF-8 by default, retry it using the fall-back
282 if ( m_encDefault != wxFONTENCODING_MAX )
283 {
284 if ( m_ownsConv )
285 delete m_conv;
286
287 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
288 ? GetFallbackEncoding()
289 : m_encDefault);
290 self->m_ownsConv = true;
291
292 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
293 }
294 }
295
296 // don't skip the BOM again the next time if we really consumed it
297 if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
298 self->m_consumedBOM = true;
299
300 return rc;
301 }
302
303 size_t
304 wxConvAuto::FromWChar(char *dst, size_t dstLen,
305 const wchar_t *src, size_t srcLen) const
306 {
307 if ( !m_conv )
308 {
309 // default to UTF-8 for the multibyte output
310 const_cast<wxConvAuto *>(this)->InitWithUTF8();
311 }
312
313 return m_conv->FromWChar(dst, dstLen, src, srcLen);
314 }