src/common/convauto.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/convauto.cpp
   3 // Purpose:     implementation of wxConvAuto
   4 // Author:      Vadim Zeitlin
   5 // Created:     2006-04-04
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
   8 // Licence:     wxWindows licence
   9 ///////////////////////////////////////////////////////////////////////////////
  10
  11 // ============================================================================
  12 // declarations
  13 // ============================================================================
  14
  15 // ----------------------------------------------------------------------------
  16 // headers
  17 // ----------------------------------------------------------------------------
  18
  19 // for compilers that support precompilation, includes "wx.h".
  20 #include "wx/wxprec.h"
  21
  22 #ifdef __BORLANDC__
  23     #pragma hdrstop
  24 #endif
  25
  26 #ifndef WX_PRECOMP
  27     #include "wx/wx.h"
  28 #endif //WX_PRECOMP
  29
  30 #include "wx/convauto.h"
  31
// we use latin1 by default as it seems the least bad choice: the files whose
// encoding we need to detect don't always come from the user's system (they
// are often received from other machines), so using wxFONTENCODING_SYSTEM
// doesn't seem to be a good idea and there is no other reasonable alternative
  36 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
  37
  38 // ============================================================================
  39 // implementation
  40 // ============================================================================
  41
  42 /* static */
  43 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
  44 {
  45     wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
  46                   wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
  47
  48     ms_defaultMBEncoding = enc;
  49 }
  50
  51 /* static */
  52 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
  53 {
  54     // examine the buffer for BOM presence
  55     //
  56     // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
  57     //
  58     //  Bytes           Encoding Form
  59     //
  60     //  00 00 FE FF     UTF-32, big-endian
  61     //  FF FE 00 00     UTF-32, little-endian
  62     //  FE FF           UTF-16, big-endian
  63     //  FF FE           UTF-16, little-endian
  64     //  EF BB BF        UTF-8
  65     //
  66     // as some BOMs are prefixes of other ones we may need to read more bytes
  67     // to disambiguate them
  68
  69     switch ( srcLen )
  70     {
  71         case 0:
  72             return BOM_Unknown;
  73
  74         case 1:
  75             if ( src[0] == '\x00' || src[0] == '\xFF' ||
  76                  src[0] == '\xFE' || src[0] == '\xEF')
  77             {
  78                 // this could be a BOM but we don't know yet
  79                 return BOM_Unknown;
  80             }
  81             break;
  82
  83         case 2:
  84         case 3:
  85             if ( src[0] == '\xEF' && src[1] == '\xBB' )
  86             {
  87                 if ( srcLen == 3 )
  88                     return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
  89
  90                 return BOM_Unknown;
  91             }
  92
  93             if ( src[0] == '\xFE' && src[1] == '\xFF' )
  94                 return BOM_UTF16BE;
  95
  96             if ( src[0] == '\xFF' && src[1] == '\xFE' )
  97             {
  98                 // if the next byte is 0, it could be an UTF-32LE BOM but if it
  99                 // isn't we can be sure it's UTF-16LE
 100                 if ( srcLen == 3 && src[2] != '\x00' )
 101                     return BOM_UTF16LE;
 102
 103                 return BOM_Unknown;
 104             }
 105
 106             if ( src[0] == '\x00' && src[1] == '\x00' )
 107             {
 108                 // this could only be UTF-32BE, check that the data we have so
 109                 // far allows for it
 110                 if ( srcLen == 3 && src[2] != '\xFE' )
 111                     return BOM_None;
 112
 113                 return BOM_Unknown;
 114             }
 115             break;
 116
 117         default:
 118             // we have at least 4 characters so we may finally decide whether
 119             // we have a BOM or not
 120             if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
 121                 return BOM_UTF8;
 122
 123             if ( src[0] == '\x00' && src[1] == '\x00' &&
 124                  src[2] == '\xFE' && src[3] == '\xFF' )
 125                 return BOM_UTF32BE;
 126
 127             if ( src[0] == '\xFF' && src[1] == '\xFE' &&
 128                  src[2] == '\x00' && src[3] == '\x00' )
 129                 return BOM_UTF32LE;
 130
 131             if ( src[0] == '\xFE' && src[1] == '\xFF' )
 132                 return BOM_UTF16BE;
 133
 134             if ( src[0] == '\xFF' && src[1] == '\xFE' )
 135                 return BOM_UTF16LE;
 136     }
 137
 138     return BOM_None;
 139 }
 140
 141 void wxConvAuto::InitFromBOM(BOMType bomType)
 142 {
 143     m_consumedBOM = false;
 144
 145     switch ( bomType )
 146     {
 147         case BOM_Unknown:
 148             wxFAIL_MSG( "shouldn't be called for this BOM type" );
 149             break;
 150
 151         case BOM_None:
 152             // use the default
 153             break;
 154
 155         case BOM_UTF32BE:
 156             m_conv = new wxMBConvUTF32BE;
 157             m_ownsConv = true;
 158             break;
 159
 160         case BOM_UTF32LE:
 161             m_conv = new wxMBConvUTF32LE;
 162             m_ownsConv = true;
 163             break;
 164
 165         case BOM_UTF16BE:
 166             m_conv = new wxMBConvUTF16BE;
 167             m_ownsConv = true;
 168             break;
 169
 170         case BOM_UTF16LE:
 171             m_conv = new wxMBConvUTF16LE;
 172             m_ownsConv = true;
 173             break;
 174
 175         case BOM_UTF8:
 176             InitWithUTF8();
 177             break;
 178
 179         default:
 180             wxFAIL_MSG( "unknown BOM type" );
 181     }
 182
 183     if ( !m_conv )
 184     {
 185         // we end up here if there is no BOM or we didn't recognize it somehow
 186         // (this shouldn't happen but still don't crash if it does), so use the
 187         // default encoding
 188         InitWithUTF8();
 189         m_consumedBOM = true; // as there is nothing to consume
 190     }
 191 }
 192
 193 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
 194 {
 195     int ofs;
 196     switch ( m_bomType )
 197     {
 198         case BOM_Unknown:
 199             wxFAIL_MSG( "shouldn't be called for this BOM type" );
 200             return;
 201
 202         case BOM_None:
 203             ofs = 0;
 204             break;
 205
 206         case BOM_UTF32BE:
 207         case BOM_UTF32LE:
 208             ofs = 4;
 209             break;
 210
 211         case BOM_UTF16BE:
 212         case BOM_UTF16LE:
 213             ofs = 2;
 214             break;
 215
 216         case BOM_UTF8:
 217             ofs = 3;
 218             break;
 219
 220         default:
 221             wxFAIL_MSG( "unknown BOM type" );
 222             return;
 223     }
 224
 225     *src += ofs;
 226     if ( *len != (size_t)-1 )
 227         *len -= ofs;
 228 }
 229
 230 bool wxConvAuto::InitFromInput(const char *src, size_t len)
 231 {
 232     m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
 233     if ( m_bomType == BOM_Unknown )
 234         return false;
 235
 236     InitFromBOM(m_bomType);
 237
 238     return true;
 239 }
 240
 241 size_t
 242 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
 243                     const char *src, size_t srcLen) const
 244 {
 245     // we check BOM and create the appropriate conversion the first time we're
 246     // called but we also need to ensure that the BOM is skipped not only
 247     // during this initial call but also during the first call with non-NULL
 248     // dst as typically we're first called with NULL dst to calculate the
 249     // needed buffer size
 250     wxConvAuto *self = const_cast<wxConvAuto *>(this);
 251
 252
 253     if ( !m_conv )
 254     {
 255         if ( !self->InitFromInput(src, srcLen) )
 256         {
 257             // there is not enough data to determine whether we have a BOM or
 258             // not, so fail for now -- the caller is supposed to call us again
 259             // with more data
 260             return wxCONV_FAILED;
 261         }
 262     }
 263
 264     if ( !m_consumedBOM )
 265     {
 266         SkipBOM(&src, &srcLen);
 267         if ( srcLen == 0 )
 268         {
 269             // there is nothing left except the BOM so we'd return 0 below but
 270             // this is unexpected: decoding a non-empty string must either fail
 271             // or return something non-empty, in particular this would break
 272             // the code in wxTextInputStream::NextChar()
 273             //
 274             // so still return an error as we need some more data to be able to
 275             // decode it
 276             return wxCONV_FAILED;
 277         }
 278     }
 279
 280     // try to convert using the auto-detected encoding
 281     size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
 282     if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
 283     {
 284         // if the conversion failed but we didn't really detect anything and
 285         // simply tried UTF-8 by default, retry it using the fall-back
 286         if ( m_encDefault != wxFONTENCODING_MAX )
 287         {
 288             if ( m_ownsConv )
 289                 delete m_conv;
 290
 291             self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
 292                                             ? GetFallbackEncoding()
 293                                             : m_encDefault);
 294             self->m_ownsConv = true;
 295
 296             rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
 297         }
 298     }
 299
 300     // don't skip the BOM again the next time if we really consumed it
 301     if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
 302         self->m_consumedBOM = true;
 303
 304     return rc;
 305 }
 306
 307 size_t
 308 wxConvAuto::FromWChar(char *dst, size_t dstLen,
 309                       const wchar_t *src, size_t srcLen) const
 310 {
 311     if ( !m_conv )
 312     {
 313         // default to UTF-8 for the multibyte output
 314         const_cast<wxConvAuto *>(this)->InitWithUTF8();
 315     }
 316
 317     return m_conv->FromWChar(dst, dstLen, src, srcLen);
 318 }