]> git.saurik.com Git - wxWidgets.git/blob - src/common/convauto.cpp
Corrected bug in revision 47973
[wxWidgets.git] / src / common / convauto.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/convauto.cpp
3 // Purpose: implementation of wxConvAuto
4 // Author: Vadim Zeitlin
5 // Created: 2006-04-04
6 // RCS-ID: $Id$
7 // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // ============================================================================
12 // declarations
13 // ============================================================================
14
15 // ----------------------------------------------------------------------------
16 // headers
17 // ----------------------------------------------------------------------------
18
19 // for compilers that support precompilation, includes "wx.h".
20 #include "wx/wxprec.h"
21
22 #ifdef __BORLANDC__
23 #pragma hdrstop
24 #endif
25
26 #if wxUSE_WCHAR_T
27
28 #ifndef WX_PRECOMP
29 #endif //WX_PRECOMP
30
31 #include "wx/convauto.h"
32
33 // we use latin1 by default as it seems the least bad choice: the files whose
34 // encoding we need to detect don't always come from the user's system (they
35 // are often received from other machines), so wxFONTENCODING_SYSTEM doesn't
36 // seem to be a good idea and there is no other reasonable alternative
37 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
38
39 // ============================================================================
40 // implementation
41 // ============================================================================
42
43 /* static */
44 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
45 {
46 wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
47 _T("wxFONTENCODING_DEFAULT doesn't make sense here") );
48
49 ms_defaultMBEncoding = enc;
50 }
51
52 /* static */
53 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
54 {
55 if ( srcLen < 2 )
56 {
57 // minimal BOM is 2 bytes so bail out immediately and simplify the code
58 // below which wouldn't need to check for length for UTF-16 cases
59 return BOM_None;
60 }
61
62 // examine the buffer for BOM presence
63 //
64 // see http://www.unicode.org/faq/utf_bom.html#BOM
65 switch ( *src++ )
66 {
67 case '\0':
68 // could only be big endian UTF-32 (00 00 FE FF)
69 if ( srcLen >= 4 &&
70 src[0] == '\0' &&
71 src[1] == '\xfe' &&
72 src[2] == '\xff' )
73 {
74 return BOM_UTF32BE;
75 }
76 break;
77
78 case '\xfe':
79 // could only be big endian UTF-16 (FE FF)
80 if ( *src++ == '\xff' )
81 {
82 return BOM_UTF16BE;
83 }
84 break;
85
86 case '\xff':
87 // could be either little endian UTF-16 or UTF-32, both start
88 // with FF FE
89 if ( *src++ == '\xfe' )
90 {
91 return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
92 ? BOM_UTF32LE
93 : BOM_UTF16LE;
94 }
95 break;
96
97 case '\xef':
98 // is this UTF-8 BOM (EF BB BF)?
99 if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
100 {
101 return BOM_UTF8;
102 }
103 break;
104 }
105
106 return BOM_None;
107 }
108
109 void wxConvAuto::InitFromBOM(BOMType bomType)
110 {
111 m_consumedBOM = false;
112
113 switch ( bomType )
114 {
115 case BOM_UTF32BE:
116 m_conv = new wxMBConvUTF32BE;
117 m_ownsConv = true;
118 break;
119
120 case BOM_UTF32LE:
121 m_conv = new wxMBConvUTF32LE;
122 m_ownsConv = true;
123 break;
124
125 case BOM_UTF16BE:
126 m_conv = new wxMBConvUTF16BE;
127 m_ownsConv = true;
128 break;
129
130 case BOM_UTF16LE:
131 m_conv = new wxMBConvUTF16LE;
132 m_ownsConv = true;
133 break;
134
135 case BOM_UTF8:
136 InitWithUTF8();
137 break;
138
139 default:
140 wxFAIL_MSG( _T("unexpected BOM type") );
141 // fall through: still need to create something
142
143 case BOM_None:
144 InitWithUTF8();
145 m_consumedBOM = true; // as there is nothing to consume
146 }
147 }
148
149 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
150 {
151 int ofs;
152 switch ( m_bomType )
153 {
154 case BOM_UTF32BE:
155 case BOM_UTF32LE:
156 ofs = 4;
157 break;
158
159 case BOM_UTF16BE:
160 case BOM_UTF16LE:
161 ofs = 2;
162 break;
163
164 case BOM_UTF8:
165 ofs = 3;
166 break;
167
168 default:
169 wxFAIL_MSG( _T("unexpected BOM type") );
170 // fall through: still need to create something
171
172 case BOM_None:
173 ofs = 0;
174 }
175
176 *src += ofs;
177 if ( *len != (size_t)-1 )
178 *len -= ofs;
179 }
180
181 void wxConvAuto::InitFromInput(const char **src, size_t *len)
182 {
183 m_bomType = DetectBOM(*src, *len);
184 InitFromBOM(m_bomType);
185 SkipBOM(src, len);
186 }
187
188 size_t
189 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
190 const char *src, size_t srcLen) const
191 {
192 // we check BOM and create the appropriate conversion the first time we're
193 // called but we also need to ensure that the BOM is skipped not only
194 // during this initial call but also during the first call with non-NULL
195 // dst as typically we're first called with NULL dst to calculate the
196 // needed buffer size
197 wxConvAuto *self = wx_const_cast(wxConvAuto *, this);
198 if ( !m_conv )
199 {
200 self->InitFromInput(&src, &srcLen);
201 if ( dst )
202 self->m_consumedBOM = true;
203 }
204
205 if ( !m_consumedBOM && dst )
206 {
207 self->m_consumedBOM = true;
208 SkipBOM(&src, &srcLen);
209 }
210
211 // try to convert using the auto-detected encoding
212 size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
213 if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
214 {
215 // if the conversion failed but we didn't really detect anything and
216 // simply tried UTF-8 by default, retry it using the fall-back
217 if ( m_encDefault != wxFONTENCODING_MAX )
218 {
219 if ( m_ownsConv )
220 delete m_conv;
221
222 self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
223 ? GetFallbackEncoding()
224 : m_encDefault);
225 self->m_ownsConv = true;
226
227 rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
228 }
229 }
230
231 return rc;
232 }
233
234 size_t
235 wxConvAuto::FromWChar(char *dst, size_t dstLen,
236 const wchar_t *src, size_t srcLen) const
237 {
238 if ( !m_conv )
239 {
240 // default to UTF-8 for the multibyte output
241 wx_const_cast(wxConvAuto *, this)->InitWithUTF8();
242 }
243
244 return m_conv->FromWChar(dst, dstLen, src, srcLen);
245 }
246
247 #endif // wxUSE_WCHAR_T
248