]> git.saurik.com Git - wxWidgets.git/blob - src/common/convauto.cpp
Fix for non-Mac builds.
[wxWidgets.git] / src / common / convauto.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/convauto.cpp
3 // Purpose: implementation of wxConvAuto
4 // Author: Vadim Zeitlin
5 // Created: 2006-04-04
6 // RCS-ID: $Id$
7 // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // ============================================================================
12 // declarations
13 // ============================================================================
14
15 // ----------------------------------------------------------------------------
16 // headers
17 // ----------------------------------------------------------------------------
18
19 // for compilers that support precompilation, includes "wx.h".
20 #include "wx/wxprec.h"
21
22 #ifdef __BORLANDC__
23 #pragma hdrstop
24 #endif
25
26 #if wxUSE_WCHAR_T
27
28 #ifndef WX_PRECOMP
29 #include "wx/wx.h"
30 #endif //WX_PRECOMP
31
32 #include "wx/convauto.h"
33
// we use latin1 by default as it seems the least bad choice: the files whose
// encoding we need to detect don't always come from the user's system (they
// are often received from other machines), so using wxFONTENCODING_SYSTEM
// doesn't seem to be a good idea and there is no other reasonable alternative
38 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
39
40 // ============================================================================
41 // implementation
42 // ============================================================================
43
44 /* static */
45 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
46 {
47 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
48 wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
49
50 ms_defaultMBEncoding = enc;
51 }
52
53 /* static */
54 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
55 {
56 // examine the buffer for BOM presence
57 //
58 // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
59 //
60 // Bytes Encoding Form
61 //
62 // 00 00 FE FF UTF-32, big-endian
63 // FF FE 00 00 UTF-32, little-endian
64 // FE FF UTF-16, big-endian
65 // FF FE UTF-16, little-endian
66 // EF BB BF UTF-8
67 //
68 // as some BOMs are prefixes of other ones we may need to read more bytes
69 // to disambiguate them
70
71 switch ( srcLen )
72 {
73 case 0:
74 return BOM_Unknown;
75
76 case 1:
77 if ( src[0] == '\x00' || src[0] == '\xFF' ||
78 src[0] == '\xFE' || src[0] == '\xEF')
79 {
80 // this could be a BOM but we don't know yet
81 return BOM_Unknown;
82 }
83 break;
84
85 case 2:
86 case 3:
87 if ( src[0] == '\xEF' && src[1] == '\xBB' )
88 {
89 if ( srcLen == 3 )
90 return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
91
92 return BOM_Unknown;
93 }
94
95 if ( src[0] == '\xFE' && src[1] == '\xFF' )
96 return BOM_UTF16BE;
97
98 if ( src[0] == '\xFF' && src[1] == '\xFE' )
99 {
100 // if the next byte is 0, it could be an UTF-32LE BOM but if it
101 // isn't we can be sure it's UTF-16LE
102 if ( srcLen == 3 && src[2] != '\x00' )
103 return BOM_UTF16LE;
104
105 return BOM_Unknown;
106 }
107
108 if ( src[0] == '\x00' && src[1] == '\x00' )
109 {
110 // this could only be UTF-32BE
111 if ( srcLen == 3 && src[2] == '\xFE' )
112 return BOM_Unknown;
113 }
114
115 break;
116
117 default:
118 // we have at least 4 characters so we may finally decide whether
119 // we have a BOM or not
120 if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
121 return BOM_UTF8;
122
123 if ( src[0] == '\x00' && src[1] == '\x00' &&
124 src[2] == '\xFE' && src[3] == '\xFF' )
125 return BOM_UTF32BE;
126
127 if ( src[0] == '\xFF' && src[1] == '\xFE' &&
128 src[2] == '\x00' && src[3] == '\x00' )
129 return BOM_UTF32LE;
130
131 if ( src[0] == '\xFE' && src[1] == '\xFF' )
132 return BOM_UTF16BE;
133
134 if ( src[0] == '\xFF' && src[1] == '\xFE' )
135 return BOM_UTF16LE;
136 }
137
138 return BOM_None;
139 }
140
141 void wxConvAuto::InitFromBOM(BOMType bomType)
142 {
143 m_consumedBOM = false;
144
145 switch ( bomType )
146 {
147 case BOM_Unknown:
148 wxFAIL_MSG( "shouldn't be called for this BOM type" );
149 break;
150
151 case BOM_None:
152 // use the default
153 break;
154
155 case BOM_UTF32BE:
156 m_conv = new wxMBConvUTF32BE;
157 m_ownsConv = true;
158 break;
159
160 case BOM_UTF32LE:
161 m_conv = new wxMBConvUTF32LE;
162 m_ownsConv = true;
163 break;
164
165 case BOM_UTF16BE:
166 m_conv = new wxMBConvUTF16BE;
167 m_ownsConv = true;
168 break;
169
170 case BOM_UTF16LE:
171 m_conv = new wxMBConvUTF16LE;
172 m_ownsConv = true;
173 break;
174
175 case BOM_UTF8:
176 InitWithUTF8();
177 break;
178
179 default:
180 wxFAIL_MSG( "unknown BOM type" );
181 }
182
183 if ( !m_conv )
184 {
185 // we end up here if there is no BOM or we didn't recognize it somehow
186 // (this shouldn't happen but still don't crash if it does), so use the
187 // default encoding
188 InitWithUTF8();
189 m_consumedBOM = true; // as there is nothing to consume
190 }
191 }
192
193 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
194 {
195 int ofs;
196 switch ( m_bomType )
197 {
198 case BOM_Unknown:
199 wxFAIL_MSG( "shouldn't be called for this BOM type" );
200 return;
201
202 case BOM_None:
203 ofs = 0;
204 break;
205
206 case BOM_UTF32BE:
207 case BOM_UTF32LE:
208 ofs = 4;
209 break;
210
211 case BOM_UTF16BE:
212 case BOM_UTF16LE:
213 ofs = 2;
214 break;
215
216 case BOM_UTF8:
217 ofs = 3;
218 break;
219
220 default:
221 wxFAIL_MSG( "unknown BOM type" );
222 return;
223 }
224
225 *src += ofs;
226 if ( *len != (size_t)-1 )
227 *len -= ofs;
228 }
229
230 bool wxConvAuto::InitFromInput(const char **src, size_t *len)
231 {
232 m_bomType = DetectBOM(*src, *len);
233 if ( m_bomType == BOM_Unknown )
234 return false;
235
236 InitFromBOM(m_bomType);
237 SkipBOM(src, len);
238
239 return true;
240 }
241
242 size_t
243 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
244 const char *src, size_t srcLen) const
245 {
246 // we check BOM and create the appropriate conversion the first time we're
247 // called but we also need to ensure that the BOM is skipped not only
248 // during this initial call but also during the first call with non-NULL
249 // dst as typically we're first called with NULL dst to calculate the
250 // needed buffer size
251 wxConvAuto *self = const_cast<wxConvAuto *>(this);
252
253
254 if ( !m_conv )
255 {
256 if ( !self->InitFromInput(&src, &srcLen) )
257 {
258 // there is not enough data to determine whether we have a BOM or
259 // not, so fail for now -- the caller is supposed to call us again
260 // with more data
261 return wxCONV_FAILED;
262 }
263 }
264 else if ( !m_consumedBOM && dst )
265 {
266 SkipBOM(&src, &srcLen);
267 }
268
269 // try to convert using the auto-detected encoding
270 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
271 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
272 {
273 // if the conversion failed but we didn't really detect anything and
274 // simply tried UTF-8 by default, retry it using the fall-back
275 if ( m_encDefault != wxFONTENCODING_MAX )
276 {
277 if ( m_ownsConv )
278 delete m_conv;
279
280 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
281 ? GetFallbackEncoding()
282 : m_encDefault);
283 self->m_ownsConv = true;
284
285 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
286 }
287 }
288
289 if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
290 self->m_consumedBOM = true;
291 return rc;
292 }
293
294 size_t
295 wxConvAuto::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
297 {
298 if ( !m_conv )
299 {
300 // default to UTF-8 for the multibyte output
301 const_cast<wxConvAuto *>(this)->InitWithUTF8();
302 }
303
304 return m_conv->FromWChar(dst, dstLen, src, srcLen);
305 }
306
307 #endif // wxUSE_WCHAR_T