]>
Commit | Line | Data |
---|---|---|
1 | /////////////////////////////////////////////////////////////////////////////// | |
2 | // Name: src/common/convauto.cpp | |
3 | // Purpose: implementation of wxConvAuto | |
4 | // Author: Vadim Zeitlin | |
5 | // Created: 2006-04-04 | |
6 | // RCS-ID: $Id$ | |
7 | // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org> | |
8 | // Licence: wxWindows licence | |
9 | /////////////////////////////////////////////////////////////////////////////// | |
10 | ||
11 | // ============================================================================ | |
12 | // declarations | |
13 | // ============================================================================ | |
14 | ||
15 | // ---------------------------------------------------------------------------- | |
16 | // headers | |
17 | // ---------------------------------------------------------------------------- | |
18 | ||
19 | // for compilers that support precompilation, includes "wx.h". | |
20 | #include "wx/wxprec.h" | |
21 | ||
22 | #ifdef __BORLANDC__ | |
23 | #pragma hdrstop | |
24 | #endif | |
25 | ||
26 | #if wxUSE_WCHAR_T | |
27 | ||
28 | #ifndef WX_PRECOMP | |
29 | #endif //WX_PRECOMP | |
30 | ||
31 | #include "wx/convauto.h" | |
32 | ||
33 | // Fallback multibyte encoding; it can be changed with SetFallbackEncoding(). | |
34 | // We use Latin-1 by default as it seems the least bad choice: the files whose | |
35 | // encoding we need to detect often come from other machines, so using | |
36 | // wxFONTENCODING_SYSTEM isn't a good idea and there is no other reasonable alternative | |
37 | wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1; | |
38 | ||
39 | // ============================================================================ | |
40 | // implementation | |
41 | // ============================================================================ | |
42 | ||
43 | /* static */ | |
44 | void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) | |
45 | { | |
46 | wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT, | |
47 | _T("wxFONTENCODING_DEFAULT doesn't make sense here") ); | |
48 | ||
49 | ms_defaultMBEncoding = enc; | |
50 | } | |
51 | ||
52 | /* static */ | |
53 | wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) | |
54 | { | |
55 | if ( srcLen < 2 ) | |
56 | { | |
57 | // minimal BOM is 2 bytes so bail out immediately and simplify the code | |
58 | // below which wouldn't need to check for length for UTF-16 cases | |
59 | return BOM_None; | |
60 | } | |
61 | ||
62 | // examine the buffer for BOM presence | |
63 | // | |
64 | // see http://www.unicode.org/faq/utf_bom.html#BOM | |
65 | switch ( *src++ ) | |
66 | { | |
67 | case '\0': | |
68 | // could only be big endian UTF-32 (00 00 FE FF) | |
69 | if ( srcLen >= 4 && | |
70 | src[0] == '\0' && | |
71 | src[1] == '\xfe' && | |
72 | src[2] == '\xff' ) | |
73 | { | |
74 | return BOM_UTF32BE; | |
75 | } | |
76 | break; | |
77 | ||
78 | case '\xfe': | |
79 | // could only be big endian UTF-16 (FE FF) | |
80 | if ( *src++ == '\xff' ) | |
81 | { | |
82 | return BOM_UTF16BE; | |
83 | } | |
84 | break; | |
85 | ||
86 | case '\xff': | |
87 | // could be either little endian UTF-16 or UTF-32, both start | |
88 | // with FF FE | |
89 | if ( *src++ == '\xfe' ) | |
90 | { | |
91 | return srcLen >= 4 && src[0] == '\0' && src[1] == '\0' | |
92 | ? BOM_UTF32LE | |
93 | : BOM_UTF16LE; | |
94 | } | |
95 | break; | |
96 | ||
97 | case '\xef': | |
98 | // is this UTF-8 BOM (EF BB BF)? | |
99 | if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' ) | |
100 | { | |
101 | return BOM_UTF8; | |
102 | } | |
103 | break; | |
104 | } | |
105 | ||
106 | return BOM_None; | |
107 | } | |
108 | ||
109 | void wxConvAuto::InitFromBOM(BOMType bomType) | |
110 | { | |
111 | m_consumedBOM = false; | |
112 | ||
113 | switch ( bomType ) | |
114 | { | |
115 | case BOM_UTF32BE: | |
116 | m_conv = new wxMBConvUTF32BE; | |
117 | m_ownsConv = true; | |
118 | break; | |
119 | ||
120 | case BOM_UTF32LE: | |
121 | m_conv = new wxMBConvUTF32LE; | |
122 | m_ownsConv = true; | |
123 | break; | |
124 | ||
125 | case BOM_UTF16BE: | |
126 | m_conv = new wxMBConvUTF16BE; | |
127 | m_ownsConv = true; | |
128 | break; | |
129 | ||
130 | case BOM_UTF16LE: | |
131 | m_conv = new wxMBConvUTF16LE; | |
132 | m_ownsConv = true; | |
133 | break; | |
134 | ||
135 | case BOM_UTF8: | |
136 | InitWithUTF8(); | |
137 | break; | |
138 | ||
139 | default: | |
140 | wxFAIL_MSG( _T("unexpected BOM type") ); | |
141 | // fall through: still need to create something | |
142 | ||
143 | case BOM_None: | |
144 | InitWithUTF8(); | |
145 | m_consumedBOM = true; // as there is nothing to consume | |
146 | } | |
147 | } | |
148 | ||
149 | void wxConvAuto::SkipBOM(const char **src, size_t *len) const | |
150 | { | |
151 | int ofs; | |
152 | switch ( m_bomType ) | |
153 | { | |
154 | case BOM_UTF32BE: | |
155 | case BOM_UTF32LE: | |
156 | ofs = 4; | |
157 | break; | |
158 | ||
159 | case BOM_UTF16BE: | |
160 | case BOM_UTF16LE: | |
161 | ofs = 2; | |
162 | break; | |
163 | ||
164 | case BOM_UTF8: | |
165 | ofs = 3; | |
166 | break; | |
167 | ||
168 | default: | |
169 | wxFAIL_MSG( _T("unexpected BOM type") ); | |
170 | // fall through: still need to create something | |
171 | ||
172 | case BOM_None: | |
173 | ofs = 0; | |
174 | } | |
175 | ||
176 | *src += ofs; | |
177 | if ( *len != (size_t)-1 ) | |
178 | *len -= ofs; | |
179 | } | |
180 | ||
181 | void wxConvAuto::InitFromInput(const char **src, size_t *len) | |
182 | { | |
183 | m_bomType = DetectBOM(*src, *len); | |
184 | InitFromBOM(m_bomType); | |
185 | SkipBOM(src, len); | |
186 | } | |
187 | ||
188 | size_t | |
189 | wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, | |
190 | const char *src, size_t srcLen) const | |
191 | { | |
192 | // we check BOM and create the appropriate conversion the first time we're | |
193 | // called but we also need to ensure that the BOM is skipped not only | |
194 | // during this initial call but also during the first call with non-NULL | |
195 | // dst as typically we're first called with NULL dst to calculate the | |
196 | // needed buffer size | |
197 | wxConvAuto *self = const_cast<wxConvAuto *>(this); | |
198 | if ( !m_conv ) | |
199 | { | |
200 | self->InitFromInput(&src, &srcLen); | |
201 | if ( dst ) | |
202 | self->m_consumedBOM = true; | |
203 | } | |
204 | ||
205 | if ( !m_consumedBOM && dst ) | |
206 | { | |
207 | self->m_consumedBOM = true; | |
208 | SkipBOM(&src, &srcLen); | |
209 | } | |
210 | ||
211 | // try to convert using the auto-detected encoding | |
212 | size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); | |
213 | if ( rc == wxCONV_FAILED && m_bomType == BOM_None ) | |
214 | { | |
215 | // if the conversion failed but we didn't really detect anything and | |
216 | // simply tried UTF-8 by default, retry it using the fall-back | |
217 | if ( m_encDefault != wxFONTENCODING_MAX ) | |
218 | { | |
219 | if ( m_ownsConv ) | |
220 | delete m_conv; | |
221 | ||
222 | self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT | |
223 | ? GetFallbackEncoding() | |
224 | : m_encDefault); | |
225 | self->m_ownsConv = true; | |
226 | ||
227 | rc = m_conv->ToWChar(dst, dstLen, src, srcLen); | |
228 | } | |
229 | } | |
230 | ||
231 | return rc; | |
232 | } | |
233 | ||
234 | size_t | |
235 | wxConvAuto::FromWChar(char *dst, size_t dstLen, | |
236 | const wchar_t *src, size_t srcLen) const | |
237 | { | |
238 | if ( !m_conv ) | |
239 | { | |
240 | // default to UTF-8 for the multibyte output | |
241 | const_cast<wxConvAuto *>(this)->InitWithUTF8(); | |
242 | } | |
243 | ||
244 | return m_conv->FromWChar(dst, dstLen, src, srcLen); | |
245 | } | |
246 | ||
247 | #endif // wxUSE_WCHAR_T | |
248 |