src/common/convauto.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/convauto.cpp
   3 // Purpose:     implementation of wxConvAuto
   4 // Author:      Vadim Zeitlin
   5 // Created:     2006-04-04
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
   8 // Licence:     wxWindows licence
   9 ///////////////////////////////////////////////////////////////////////////////
  10
  11 // ============================================================================
  12 // declarations
  13 // ============================================================================
  14
  15 // ----------------------------------------------------------------------------
  16 // headers
  17 // ----------------------------------------------------------------------------
  18
  19 // for compilers that support precompilation, includes "wx.h".
  20 #include "wx/wxprec.h"
  21
  22 #ifdef __BORLANDC__
  23     #pragma hdrstop
  24 #endif
  25
  26 #include "wx/convauto.h"
  27
  28 // we use latin1 by default as it seems the least bad choice: the files we need
  29 // to detect input of don't always come from the user system (they are often
  30 // received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
  31 // seem to be a good idea and there is no other reasonable alternative
  32 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
  33
  34 // ============================================================================
  35 // implementation
  36 // ============================================================================
  37
  38 /* static */
  39 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
  40 {
  41     wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
  42                   wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
  43
  44     ms_defaultMBEncoding = enc;
  45 }
  46
  47 /* static */
  48 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
  49 {
  50     // examine the buffer for BOM presence
  51     //
  52     // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
  53     //
  54     //  Bytes           Encoding Form
  55     //
  56     //  00 00 FE FF     UTF-32, big-endian
  57     //  FF FE 00 00     UTF-32, little-endian
  58     //  FE FF           UTF-16, big-endian
  59     //  FF FE           UTF-16, little-endian
  60     //  EF BB BF        UTF-8
  61     //
  62     // as some BOMs are prefixes of other ones we may need to read more bytes
  63     // to disambiguate them
  64
  65     switch ( srcLen )
  66     {
  67         case 0:
  68             return BOM_Unknown;
  69
  70         case 1:
  71             if ( src[0] == '\x00' || src[0] == '\xFF' ||
  72                  src[0] == '\xFE' || src[0] == '\xEF')
  73             {
  74                 // this could be a BOM but we don't know yet
  75                 return BOM_Unknown;
  76             }
  77             break;
  78
  79         case 2:
  80         case 3:
  81             if ( src[0] == '\xEF' && src[1] == '\xBB' )
  82             {
  83                 if ( srcLen == 3 )
  84                     return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
  85
  86                 return BOM_Unknown;
  87             }
  88
  89             if ( src[0] == '\xFE' && src[1] == '\xFF' )
  90                 return BOM_UTF16BE;
  91
  92             if ( src[0] == '\xFF' && src[1] == '\xFE' )
  93             {
  94                 // if the next byte is 0, it could be an UTF-32LE BOM but if it
  95                 // isn't we can be sure it's UTF-16LE
  96                 if ( srcLen == 3 && src[2] != '\x00' )
  97                     return BOM_UTF16LE;
  98
  99                 return BOM_Unknown;
 100             }
 101
 102             if ( src[0] == '\x00' && src[1] == '\x00' )
 103             {
 104                 // this could only be UTF-32BE, check that the data we have so
 105                 // far allows for it
 106                 if ( srcLen == 3 && src[2] != '\xFE' )
 107                     return BOM_None;
 108
 109                 return BOM_Unknown;
 110             }
 111             break;
 112
 113         default:
 114             // we have at least 4 characters so we may finally decide whether
 115             // we have a BOM or not
 116             if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
 117                 return BOM_UTF8;
 118
 119             if ( src[0] == '\x00' && src[1] == '\x00' &&
 120                  src[2] == '\xFE' && src[3] == '\xFF' )
 121                 return BOM_UTF32BE;
 122
 123             if ( src[0] == '\xFF' && src[1] == '\xFE' &&
 124                  src[2] == '\x00' && src[3] == '\x00' )
 125                 return BOM_UTF32LE;
 126
 127             if ( src[0] == '\xFE' && src[1] == '\xFF' )
 128                 return BOM_UTF16BE;
 129
 130             if ( src[0] == '\xFF' && src[1] == '\xFE' )
 131                 return BOM_UTF16LE;
 132     }
 133
 134     return BOM_None;
 135 }
 136
 137 void wxConvAuto::InitFromBOM(BOMType bomType)
 138 {
 139     m_consumedBOM = false;
 140
 141     switch ( bomType )
 142     {
 143         case BOM_Unknown:
 144             wxFAIL_MSG( "shouldn't be called for this BOM type" );
 145             break;
 146
 147         case BOM_None:
 148             // use the default
 149             break;
 150
 151         case BOM_UTF32BE:
 152             m_conv = new wxMBConvUTF32BE;
 153             m_ownsConv = true;
 154             break;
 155
 156         case BOM_UTF32LE:
 157             m_conv = new wxMBConvUTF32LE;
 158             m_ownsConv = true;
 159             break;
 160
 161         case BOM_UTF16BE:
 162             m_conv = new wxMBConvUTF16BE;
 163             m_ownsConv = true;
 164             break;
 165
 166         case BOM_UTF16LE:
 167             m_conv = new wxMBConvUTF16LE;
 168             m_ownsConv = true;
 169             break;
 170
 171         case BOM_UTF8:
 172             InitWithUTF8();
 173             break;
 174
 175         default:
 176             wxFAIL_MSG( "unknown BOM type" );
 177     }
 178
 179     if ( !m_conv )
 180     {
 181         // we end up here if there is no BOM or we didn't recognize it somehow
 182         // (this shouldn't happen but still don't crash if it does), so use the
 183         // default encoding
 184         InitWithUTF8();
 185         m_consumedBOM = true; // as there is nothing to consume
 186     }
 187 }
 188
 189 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
 190 {
 191     int ofs;
 192     switch ( m_bomType )
 193     {
 194         case BOM_Unknown:
 195             wxFAIL_MSG( "shouldn't be called for this BOM type" );
 196             return;
 197
 198         case BOM_None:
 199             ofs = 0;
 200             break;
 201
 202         case BOM_UTF32BE:
 203         case BOM_UTF32LE:
 204             ofs = 4;
 205             break;
 206
 207         case BOM_UTF16BE:
 208         case BOM_UTF16LE:
 209             ofs = 2;
 210             break;
 211
 212         case BOM_UTF8:
 213             ofs = 3;
 214             break;
 215
 216         default:
 217             wxFAIL_MSG( "unknown BOM type" );
 218             return;
 219     }
 220
 221     *src += ofs;
 222     if ( *len != (size_t)-1 )
 223         *len -= ofs;
 224 }
 225
 226 bool wxConvAuto::InitFromInput(const char *src, size_t len)
 227 {
 228     m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
 229     if ( m_bomType == BOM_Unknown )
 230         return false;
 231
 232     InitFromBOM(m_bomType);
 233
 234     return true;
 235 }
 236
 237 size_t
 238 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
 239                     const char *src, size_t srcLen) const
 240 {
 241     // we check BOM and create the appropriate conversion the first time we're
 242     // called but we also need to ensure that the BOM is skipped not only
 243     // during this initial call but also during the first call with non-NULL
 244     // dst as typically we're first called with NULL dst to calculate the
 245     // needed buffer size
 246     wxConvAuto *self = const_cast<wxConvAuto *>(this);
 247
 248
 249     if ( !m_conv )
 250     {
 251         if ( !self->InitFromInput(src, srcLen) )
 252         {
 253             // there is not enough data to determine whether we have a BOM or
 254             // not, so fail for now -- the caller is supposed to call us again
 255             // with more data
 256             return wxCONV_FAILED;
 257         }
 258     }
 259
 260     if ( !m_consumedBOM )
 261     {
 262         SkipBOM(&src, &srcLen);
 263         if ( srcLen == 0 )
 264         {
 265             // there is nothing left except the BOM so we'd return 0 below but
 266             // this is unexpected: decoding a non-empty string must either fail
 267             // or return something non-empty, in particular this would break
 268             // the code in wxTextInputStream::NextChar()
 269             //
 270             // so still return an error as we need some more data to be able to
 271             // decode it
 272             return wxCONV_FAILED;
 273         }
 274     }
 275
 276     // try to convert using the auto-detected encoding
 277     size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
 278     if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
 279     {
 280         // if the conversion failed but we didn't really detect anything and
 281         // simply tried UTF-8 by default, retry it using the fall-back
 282         if ( m_encDefault != wxFONTENCODING_MAX )
 283         {
 284             if ( m_ownsConv )
 285                 delete m_conv;
 286
 287             self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
 288                                             ? GetFallbackEncoding()
 289                                             : m_encDefault);
 290             self->m_ownsConv = true;
 291
 292             rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
 293         }
 294     }
 295
 296     // don't skip the BOM again the next time if we really consumed it
 297     if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
 298         self->m_consumedBOM = true;
 299
 300     return rc;
 301 }
 302
 303 size_t
 304 wxConvAuto::FromWChar(char *dst, size_t dstLen,
 305                       const wchar_t *src, size_t srcLen) const
 306 {
 307     if ( !m_conv )
 308     {
 309         // default to UTF-8 for the multibyte output
 310         const_cast<wxConvAuto *>(this)->InitWithUTF8();
 311     }
 312
 313     return m_conv->FromWChar(dst, dstLen, src, srcLen);
 314 }