| 1 | /////////////////////////////////////////////////////////////////////////////// |
| 2 | // Name: src/common/convauto.cpp |
| 3 | // Purpose: implementation of wxConvAuto |
| 4 | // Author: Vadim Zeitlin |
| 5 | // Created: 2006-04-04 |
| 6 | // RCS-ID: $Id$ |
| 7 | // Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org> |
| 8 | // Licence: wxWindows licence |
| 9 | /////////////////////////////////////////////////////////////////////////////// |
| 10 | |
| 11 | // ============================================================================ |
| 12 | // declarations |
| 13 | // ============================================================================ |
| 14 | |
| 15 | // ---------------------------------------------------------------------------- |
| 16 | // headers |
| 17 | // ---------------------------------------------------------------------------- |
| 18 | |
| 19 | // for compilers that support precompilation, includes "wx.h". |
| 20 | #include "wx/wxprec.h" |
| 21 | |
| 22 | #ifdef __BORLANDC__ |
| 23 | #pragma hdrstop |
| 24 | #endif |
| 25 | |
| 26 | #if wxUSE_WCHAR_T |
| 27 | |
| 28 | #ifndef WX_PRECOMP |
| 29 | #endif //WX_PRECOMP |
| 30 | |
#include "wx/convauto.h"

#include <string.h>
| 32 | |
| 33 | // we use latin1 by default as it seems the least bad choice: the files we need |
| 34 | // to detect input of don't always come from the user system (they are often |
| 35 | // received from other machines) and so using wxFONTENCODING_SYSTEM doesn't |
| 36 | // seem to be a good idea and there is no other reasonable alternative |
| 37 | wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1; |
| 38 | |
| 39 | // ============================================================================ |
| 40 | // implementation |
| 41 | // ============================================================================ |
| 42 | |
| 43 | /* static */ |
| 44 | void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) |
| 45 | { |
| 46 | wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT, |
| 47 | wxT("wxFONTENCODING_DEFAULT doesn't make sense here") ); |
| 48 | |
| 49 | ms_defaultMBEncoding = enc; |
| 50 | } |
| 51 | |
| 52 | /* static */ |
| 53 | wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) |
| 54 | { |
| 55 | if ( srcLen < 2 ) |
| 56 | { |
| 57 | // minimal BOM is 2 bytes so bail out immediately and simplify the code |
| 58 | // below which wouldn't need to check for length for UTF-16 cases |
| 59 | return BOM_None; |
| 60 | } |
| 61 | |
| 62 | // examine the buffer for BOM presence |
| 63 | // |
| 64 | // see http://www.unicode.org/faq/utf_bom.html#BOM |
| 65 | switch ( *src++ ) |
| 66 | { |
| 67 | case '\0': |
| 68 | // could only be big endian UTF-32 (00 00 FE FF) |
| 69 | if ( srcLen >= 4 && |
| 70 | src[0] == '\0' && |
| 71 | src[1] == '\xfe' && |
| 72 | src[2] == '\xff' ) |
| 73 | { |
| 74 | return BOM_UTF32BE; |
| 75 | } |
| 76 | break; |
| 77 | |
| 78 | case '\xfe': |
| 79 | // could only be big endian UTF-16 (FE FF) |
| 80 | if ( *src++ == '\xff' ) |
| 81 | { |
| 82 | return BOM_UTF16BE; |
| 83 | } |
| 84 | break; |
| 85 | |
| 86 | case '\xff': |
| 87 | // could be either little endian UTF-16 or UTF-32, both start |
| 88 | // with FF FE |
| 89 | if ( *src++ == '\xfe' ) |
| 90 | { |
| 91 | return srcLen >= 4 && src[0] == '\0' && src[1] == '\0' |
| 92 | ? BOM_UTF32LE |
| 93 | : BOM_UTF16LE; |
| 94 | } |
| 95 | break; |
| 96 | |
| 97 | case '\xef': |
| 98 | // is this UTF-8 BOM (EF BB BF)? |
| 99 | if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' ) |
| 100 | { |
| 101 | return BOM_UTF8; |
| 102 | } |
| 103 | break; |
| 104 | } |
| 105 | |
| 106 | return BOM_None; |
| 107 | } |
| 108 | |
| 109 | void wxConvAuto::InitFromBOM(BOMType bomType) |
| 110 | { |
| 111 | m_consumedBOM = false; |
| 112 | |
| 113 | switch ( bomType ) |
| 114 | { |
| 115 | case BOM_UTF32BE: |
| 116 | m_conv = new wxMBConvUTF32BE; |
| 117 | m_ownsConv = true; |
| 118 | break; |
| 119 | |
| 120 | case BOM_UTF32LE: |
| 121 | m_conv = new wxMBConvUTF32LE; |
| 122 | m_ownsConv = true; |
| 123 | break; |
| 124 | |
| 125 | case BOM_UTF16BE: |
| 126 | m_conv = new wxMBConvUTF16BE; |
| 127 | m_ownsConv = true; |
| 128 | break; |
| 129 | |
| 130 | case BOM_UTF16LE: |
| 131 | m_conv = new wxMBConvUTF16LE; |
| 132 | m_ownsConv = true; |
| 133 | break; |
| 134 | |
| 135 | case BOM_UTF8: |
| 136 | InitWithUTF8(); |
| 137 | break; |
| 138 | |
| 139 | default: |
| 140 | wxFAIL_MSG( wxT("unexpected BOM type") ); |
| 141 | // fall through: still need to create something |
| 142 | |
| 143 | case BOM_None: |
| 144 | InitWithUTF8(); |
| 145 | m_consumedBOM = true; // as there is nothing to consume |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | void wxConvAuto::SkipBOM(const char **src, size_t *len) const |
| 150 | { |
| 151 | int ofs; |
| 152 | switch ( m_bomType ) |
| 153 | { |
| 154 | case BOM_UTF32BE: |
| 155 | case BOM_UTF32LE: |
| 156 | ofs = 4; |
| 157 | break; |
| 158 | |
| 159 | case BOM_UTF16BE: |
| 160 | case BOM_UTF16LE: |
| 161 | ofs = 2; |
| 162 | break; |
| 163 | |
| 164 | case BOM_UTF8: |
| 165 | ofs = 3; |
| 166 | break; |
| 167 | |
| 168 | default: |
| 169 | wxFAIL_MSG( wxT("unexpected BOM type") ); |
| 170 | // fall through: still need to create something |
| 171 | |
| 172 | case BOM_None: |
| 173 | ofs = 0; |
| 174 | } |
| 175 | |
| 176 | *src += ofs; |
| 177 | if ( *len != (size_t)-1 ) |
| 178 | *len -= ofs; |
| 179 | } |
| 180 | |
| 181 | void wxConvAuto::InitFromInput(const char **src, size_t *len) |
| 182 | { |
| 183 | m_bomType = DetectBOM(*src, *len); |
| 184 | InitFromBOM(m_bomType); |
| 185 | SkipBOM(src, len); |
| 186 | } |
| 187 | |
| 188 | size_t |
| 189 | wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, |
| 190 | const char *src, size_t srcLen) const |
| 191 | { |
| 192 | // we check BOM and create the appropriate conversion the first time we're |
| 193 | // called but we also need to ensure that the BOM is skipped not only |
| 194 | // during this initial call but also during the first call with non-NULL |
| 195 | // dst as typically we're first called with NULL dst to calculate the |
| 196 | // needed buffer size |
| 197 | wxConvAuto *self = const_cast<wxConvAuto *>(this); |
| 198 | if ( !m_conv ) |
| 199 | { |
| 200 | self->InitFromInput(&src, &srcLen); |
| 201 | if ( dst ) |
| 202 | self->m_consumedBOM = true; |
| 203 | } |
| 204 | |
| 205 | if ( !m_consumedBOM && dst ) |
| 206 | { |
| 207 | self->m_consumedBOM = true; |
| 208 | SkipBOM(&src, &srcLen); |
| 209 | } |
| 210 | |
| 211 | // try to convert using the auto-detected encoding |
| 212 | size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); |
| 213 | if ( rc == wxCONV_FAILED && m_bomType == BOM_None ) |
| 214 | { |
| 215 | // if the conversion failed but we didn't really detect anything and |
| 216 | // simply tried UTF-8 by default, retry it using the fall-back |
| 217 | if ( m_encDefault != wxFONTENCODING_MAX ) |
| 218 | { |
| 219 | if ( m_ownsConv ) |
| 220 | delete m_conv; |
| 221 | |
| 222 | self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT |
| 223 | ? GetFallbackEncoding() |
| 224 | : m_encDefault); |
| 225 | self->m_ownsConv = true; |
| 226 | |
| 227 | rc = m_conv->ToWChar(dst, dstLen, src, srcLen); |
| 228 | } |
| 229 | } |
| 230 | |
| 231 | return rc; |
| 232 | } |
| 233 | |
| 234 | size_t |
| 235 | wxConvAuto::FromWChar(char *dst, size_t dstLen, |
| 236 | const wchar_t *src, size_t srcLen) const |
| 237 | { |
| 238 | if ( !m_conv ) |
| 239 | { |
| 240 | // default to UTF-8 for the multibyte output |
| 241 | const_cast<wxConvAuto *>(this)->InitWithUTF8(); |
| 242 | } |
| 243 | |
| 244 | return m_conv->FromWChar(dst, dstLen, src, srcLen); |
| 245 | } |
| 246 | |
| 247 | #endif // wxUSE_WCHAR_T |
| 248 | |