Allow overriding print preview frame creation in docview.
[wxWidgets.git] / src / common / convauto.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/convauto.cpp
3 // Purpose: implementation of wxConvAuto
4 // Author: Vadim Zeitlin
5 // Created: 2006-04-04
6 // RCS-ID: $Id$
7 // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // ============================================================================
12 // declarations
13 // ============================================================================
14
15 // ----------------------------------------------------------------------------
16 // headers
17 // ----------------------------------------------------------------------------
18
19 // for compilers that support precompilation, includes "wx.h".
20 #include "wx/wxprec.h"
21
22 #ifdef __BORLANDC__
23 #pragma hdrstop
24 #endif
25
26 #if wxUSE_WCHAR_T
27
28 #ifndef WX_PRECOMP
29 #include "wx/wx.h"
30 #endif //WX_PRECOMP
31
32 #include "wx/convauto.h"
33
34 // we use latin1 by default as it seems the least bad choice: the files we need
35 // to detect input of don't always come from the user system (they are often
36 // received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
37 // seem to be a good idea and there is no other reasonable alternative
38 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
39
40 // ============================================================================
41 // implementation
42 // ============================================================================
43
44 /* static */
45 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
46 {
47 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
48 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
49
50 ms_defaultMBEncoding = enc;
51 }
52
53 /* static */
54 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
55 {
56 // examine the buffer for BOM presence
57 //
58 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
59 //
60 // Bytes Encoding Form
61 //
62 // 00 00 FE FF UTF-32, big-endian
63 // FF FE 00 00 UTF-32, little-endian
64 // FE FF UTF-16, big-endian
65 // FF FE UTF-16, little-endian
66 // EF BB BF UTF-8
67 //
68 // as some BOMs are prefixes of other ones we may need to read more bytes
69 // to disambiguate them
70
71 switch ( srcLen )
72 {
73 case 0:
74 return BOM_Unknown;
75
76 case 1:
77 if ( src[0] == '\x00' || src[0] == '\xFF' ||
78 src[0] == '\xFE' || src[0] == '\xEF')
79 {
80 // this could be a BOM but we don't know yet
81 return BOM_Unknown;
82 }
83 break;
84
85 case 2:
86 case 3:
87 if ( src[0] == '\xEF' && src[1] == '\xBB' )
88 {
89 if ( srcLen == 3 )
90 return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
91
92 return BOM_Unknown;
93 }
94
95 if ( src[0] == '\xFE' && src[1] == '\xFF' )
96 return BOM_UTF16BE;
97
98 if ( src[0] == '\xFF' && src[1] == '\xFE' )
99 {
100 // if the next byte is 0, it could be an UTF-32LE BOM but if it
101 // isn't we can be sure it's UTF-16LE
102 if ( srcLen == 3 && src[2] != '\x00' )
103 return BOM_UTF16LE;
104
105 return BOM_Unknown;
106 }
107
108 if ( src[0] == '\x00' && src[1] == '\x00' )
109 {
110 // this could only be UTF-32BE, check that the data we have so
111 // far allows for it
112 if ( srcLen == 3 && src[2] != '\xFE' )
113 return BOM_None;
114
115 return BOM_Unknown;
116 }
117 break;
118
119 default:
120 // we have at least 4 characters so we may finally decide whether
121 // we have a BOM or not
122 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
123 return BOM_UTF8;
124
125 if ( src[0] == '\x00' && src[1] == '\x00' &&
126 src[2] == '\xFE' && src[3] == '\xFF' )
127 return BOM_UTF32BE;
128
129 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
130 src[2] == '\x00' && src[3] == '\x00' )
131 return BOM_UTF32LE;
132
133 if ( src[0] == '\xFE' && src[1] == '\xFF' )
134 return BOM_UTF16BE;
135
136 if ( src[0] == '\xFF' && src[1] == '\xFE' )
137 return BOM_UTF16LE;
138 }
139
140 return BOM_None;
141 }
142
143 void wxConvAuto::InitFromBOM(BOMType bomType)
144 {
145 m_consumedBOM = false;
146
147 switch ( bomType )
148 {
149 case BOM_Unknown:
150 wxFAIL_MSG( "shouldn't be called for this BOM type" );
151 break;
152
153 case BOM_None:
154 // use the default
155 break;
156
157 case BOM_UTF32BE:
158 m_conv = new wxMBConvUTF32BE;
159 m_ownsConv = true;
160 break;
161
162 case BOM_UTF32LE:
163 m_conv = new wxMBConvUTF32LE;
164 m_ownsConv = true;
165 break;
166
167 case BOM_UTF16BE:
168 m_conv = new wxMBConvUTF16BE;
169 m_ownsConv = true;
170 break;
171
172 case BOM_UTF16LE:
173 m_conv = new wxMBConvUTF16LE;
174 m_ownsConv = true;
175 break;
176
177 case BOM_UTF8:
178 InitWithUTF8();
179 break;
180
181 default:
182 wxFAIL_MSG( "unknown BOM type" );
183 }
184
185 if ( !m_conv )
186 {
187 // we end up here if there is no BOM or we didn't recognize it somehow
188 // (this shouldn't happen but still don't crash if it does), so use the
189 // default encoding
190 InitWithUTF8();
191 m_consumedBOM = true; // as there is nothing to consume
192 }
193 }
194
195 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
196 {
197 int ofs;
198 switch ( m_bomType )
199 {
200 case BOM_Unknown:
201 wxFAIL_MSG( "shouldn't be called for this BOM type" );
202 return;
203
204 case BOM_None:
205 ofs = 0;
206 break;
207
208 case BOM_UTF32BE:
209 case BOM_UTF32LE:
210 ofs = 4;
211 break;
212
213 case BOM_UTF16BE:
214 case BOM_UTF16LE:
215 ofs = 2;
216 break;
217
218 case BOM_UTF8:
219 ofs = 3;
220 break;
221
222 default:
223 wxFAIL_MSG( "unknown BOM type" );
224 return;
225 }
226
227 *src += ofs;
228 if ( *len != (size_t)-1 )
229 *len -= ofs;
230 }
231
232 bool wxConvAuto::InitFromInput(const char *src, size_t len)
233 {
234 m_bomType = DetectBOM(src, len);
235 if ( m_bomType == BOM_Unknown )
236 return false;
237
238 InitFromBOM(m_bomType);
239
240 return true;
241 }
242
243 size_t
244 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
245 const char *src, size_t srcLen) const
246 {
247 // we check BOM and create the appropriate conversion the first time we're
248 // called but we also need to ensure that the BOM is skipped not only
249 // during this initial call but also during the first call with non-NULL
250 // dst as typically we're first called with NULL dst to calculate the
251 // needed buffer size
252 wxConvAuto *self = const_cast<wxConvAuto *>(this);
253
254
255 if ( !m_conv )
256 {
257 if ( !self->InitFromInput(src, srcLen) )
258 {
259 // there is not enough data to determine whether we have a BOM or
260 // not, so fail for now -- the caller is supposed to call us again
261 // with more data
262 return wxCONV_FAILED;
263 }
264 }
265
266 if ( !m_consumedBOM )
267 {
268 SkipBOM(&src, &srcLen);
269 if ( srcLen == 0 )
270 {
271 // there is nothing left except the BOM so we'd return 0 below but
272 // this is unexpected: decoding a non-empty string must either fail
273 // or return something non-empty, in particular this would break
274 // the code in wxTextInputStream::NextChar()
275 //
276 // so still return an error as we need some more data to be able to
277 // decode it
278 return wxCONV_FAILED;
279 }
280 }
281
282 // try to convert using the auto-detected encoding
283 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
284 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
285 {
286 // if the conversion failed but we didn't really detect anything and
287 // simply tried UTF-8 by default, retry it using the fall-back
288 if ( m_encDefault != wxFONTENCODING_MAX )
289 {
290 if ( m_ownsConv )
291 delete m_conv;
292
293 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
294 ? GetFallbackEncoding()
295 : m_encDefault);
296 self->m_ownsConv = true;
297
298 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
299 }
300 }
301
302 // don't skip the BOM again the next time if we really consumed it
303 if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
304 self->m_consumedBOM = true;
305
306 return rc;
307 }
308
309 size_t
310 wxConvAuto::FromWChar(char *dst, size_t dstLen,
311 const wchar_t *src, size_t srcLen) const
312 {
313 if ( !m_conv )
314 {
315 // default to UTF-8 for the multibyte output
316 const_cast<wxConvAuto *>(this)->InitWithUTF8();
317 }
318
319 return m_conv->FromWChar(dst, dstLen, src, srcLen);
320 }
321
322 #endif // wxUSE_WCHAR_T