src/common/convauto.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/convauto.cpp
   3 // Purpose:     implementation of wxConvAuto
   4 // Author:      Vadim Zeitlin
   5 // Created:     2006-04-04
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
   8 // Licence:     wxWindows licence
   9 ///////////////////////////////////////////////////////////////////////////////
  10
  11 // ============================================================================
  12 // declarations
  13 // ============================================================================
  14
  15 // ----------------------------------------------------------------------------
  16 // headers
  17 // ----------------------------------------------------------------------------
  18
  19 // for compilers that support precompilation, includes "wx.h".
  20 #include "wx/wxprec.h"
  21
  22 #ifdef __BORLANDC__
  23     #pragma hdrstop
  24 #endif
  25
  26 #if wxUSE_WCHAR_T
  27
  28 #ifndef WX_PRECOMP
  29     #include "wx/wx.h"
  30 #endif //WX_PRECOMP
  31
  32 #include "wx/convauto.h"
  33
  34 // we use latin1 by default as it seems the least bad choice: the files we need
  35 // to detect input of don't always come from the user system (they are often
  36 // received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
  37 // seem to be a good idea and there is no other reasonable alternative
  38 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
  39
  40 // ============================================================================
  41 // implementation
  42 // ============================================================================
  43
  44 /* static */
  45 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
  46 {
  47     wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
  48                   wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
  49
  50     ms_defaultMBEncoding = enc;
  51 }
  52
  53 /* static */
  54 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
  55 {
  56     // examine the buffer for BOM presence
  57     //
  58     // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
  59     //
  60     //  Bytes           Encoding Form
  61     //
  62     //  00 00 FE FF     UTF-32, big-endian
  63     //  FF FE 00 00     UTF-32, little-endian
  64     //  FE FF           UTF-16, big-endian
  65     //  FF FE           UTF-16, little-endian
  66     //  EF BB BF        UTF-8
  67     //
  68     // as some BOMs are prefixes of other ones we may need to read more bytes
  69     // to disambiguate them
  70
  71     switch ( srcLen )
  72     {
  73         case 0:
  74             return BOM_Unknown;
  75
  76         case 1:
  77             if ( src[0] == '\x00' || src[0] == '\xFF' ||
  78                  src[0] == '\xFE' || src[0] == '\xEF')
  79             {
  80                 // this could be a BOM but we don't know yet
  81                 return BOM_Unknown;
  82             }
  83             break;
  84
  85         case 2:
  86         case 3:
  87             if ( src[0] == '\xEF' && src[1] == '\xBB' )
  88             {
  89                 if ( srcLen == 3 )
  90                     return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
  91
  92                 return BOM_Unknown;
  93             }
  94
  95             if ( src[0] == '\xFE' && src[1] == '\xFF' )
  96                 return BOM_UTF16BE;
  97
  98             if ( src[0] == '\xFF' && src[1] == '\xFE' )
  99             {
 100                 // if the next byte is 0, it could be an UTF-32LE BOM but if it
 101                 // isn't we can be sure it's UTF-16LE
 102                 if ( srcLen == 3 && src[2] != '\x00' )
 103                     return BOM_UTF16LE;
 104
 105                 return BOM_Unknown;
 106             }
 107
 108             if ( src[0] == '\x00' && src[1] == '\x00' )
 109             {
 110                 // this could only be UTF-32BE
 111                 if ( srcLen == 3 && src[2] == '\xFE' )
 112                     return BOM_Unknown;
 113             }
 114
 115             break;
 116
 117         default:
 118             // we have at least 4 characters so we may finally decide whether
 119             // we have a BOM or not
 120             if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
 121                 return BOM_UTF8;
 122
 123             if ( src[0] == '\x00' && src[1] == '\x00' &&
 124                  src[2] == '\xFE' && src[3] == '\xFF' )
 125                 return BOM_UTF32BE;
 126
 127             if ( src[0] == '\xFF' && src[1] == '\xFE' &&
 128                  src[2] == '\x00' && src[3] == '\x00' )
 129                 return BOM_UTF32LE;
 130
 131             if ( src[0] == '\xFE' && src[1] == '\xFF' )
 132                 return BOM_UTF16BE;
 133
 134             if ( src[0] == '\xFF' && src[1] == '\xFE' )
 135                 return BOM_UTF16LE;
 136     }
 137
 138     return BOM_None;
 139 }
 140
 141 void wxConvAuto::InitFromBOM(BOMType bomType)
 142 {
 143     m_consumedBOM = false;
 144
 145     switch ( bomType )
 146     {
 147         case BOM_Unknown:
 148             wxFAIL_MSG( "shouldn't be called for this BOM type" );
 149             break;
 150
 151         case BOM_None:
 152             // use the default
 153             break;
 154
 155         case BOM_UTF32BE:
 156             m_conv = new wxMBConvUTF32BE;
 157             m_ownsConv = true;
 158             break;
 159
 160         case BOM_UTF32LE:
 161             m_conv = new wxMBConvUTF32LE;
 162             m_ownsConv = true;
 163             break;
 164
 165         case BOM_UTF16BE:
 166             m_conv = new wxMBConvUTF16BE;
 167             m_ownsConv = true;
 168             break;
 169
 170         case BOM_UTF16LE:
 171             m_conv = new wxMBConvUTF16LE;
 172             m_ownsConv = true;
 173             break;
 174
 175         case BOM_UTF8:
 176             InitWithUTF8();
 177             break;
 178
 179         default:
 180             wxFAIL_MSG( "unknown BOM type" );
 181     }
 182
 183     if ( !m_conv )
 184     {
 185         // we end up here if there is no BOM or we didn't recognize it somehow
 186         // (this shouldn't happen but still don't crash if it does), so use the
 187         // default encoding
 188         InitWithUTF8();
 189         m_consumedBOM = true; // as there is nothing to consume
 190     }
 191 }
 192
 193 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
 194 {
 195     int ofs;
 196     switch ( m_bomType )
 197     {
 198         case BOM_Unknown:
 199             wxFAIL_MSG( "shouldn't be called for this BOM type" );
 200             return;
 201
 202         case BOM_None:
 203             ofs = 0;
 204             break;
 205
 206         case BOM_UTF32BE:
 207         case BOM_UTF32LE:
 208             ofs = 4;
 209             break;
 210
 211         case BOM_UTF16BE:
 212         case BOM_UTF16LE:
 213             ofs = 2;
 214             break;
 215
 216         case BOM_UTF8:
 217             ofs = 3;
 218             break;
 219
 220         default:
 221             wxFAIL_MSG( "unknown BOM type" );
 222             return;
 223     }
 224
 225     *src += ofs;
 226     if ( *len != (size_t)-1 )
 227         *len -= ofs;
 228 }
 229
 230 bool wxConvAuto::InitFromInput(const char **src, size_t *len)
 231 {
 232     m_bomType = DetectBOM(*src, *len);
 233     if ( m_bomType == BOM_Unknown )
 234         return false;
 235
 236     InitFromBOM(m_bomType);
 237     SkipBOM(src, len);
 238
 239     return true;
 240 }
 241
 242 size_t
 243 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
 244                     const char *src, size_t srcLen) const
 245 {
 246     // we check BOM and create the appropriate conversion the first time we're
 247     // called but we also need to ensure that the BOM is skipped not only
 248     // during this initial call but also during the first call with non-NULL
 249     // dst as typically we're first called with NULL dst to calculate the
 250     // needed buffer size
 251     wxConvAuto *self = const_cast<wxConvAuto *>(this);
 252
 253
 254     if ( !m_conv )
 255     {
 256         if ( !self->InitFromInput(&src, &srcLen) )
 257         {
 258             // there is not enough data to determine whether we have a BOM or
 259             // not, so fail for now -- the caller is supposed to call us again
 260             // with more data
 261             return wxCONV_FAILED;
 262         }
 263     }
 264     else if ( !m_consumedBOM && dst )
 265     {
 266         SkipBOM(&src, &srcLen);
 267     }
 268
 269     // try to convert using the auto-detected encoding
 270     size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
 271     if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
 272     {
 273         // if the conversion failed but we didn't really detect anything and
 274         // simply tried UTF-8 by default, retry it using the fall-back
 275         if ( m_encDefault != wxFONTENCODING_MAX )
 276         {
 277             if ( m_ownsConv )
 278                 delete m_conv;
 279
 280             self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
 281                                             ? GetFallbackEncoding()
 282                                             : m_encDefault);
 283             self->m_ownsConv = true;
 284
 285             rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
 286         }
 287     }
 288
 289     if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
 290         self->m_consumedBOM = true;
 291     return rc;
 292 }
 293
 294 size_t
 295 wxConvAuto::FromWChar(char *dst, size_t dstLen,
 296                       const wchar_t *src, size_t srcLen) const
 297 {
 298     if ( !m_conv )
 299     {
 300         // default to UTF-8 for the multibyte output
 301         const_cast<wxConvAuto *>(this)->InitWithUTF8();
 302     }
 303
 304     return m_conv->FromWChar(dst, dstLen, src, srcLen);
 305 }
 306
 307 #endif // wxUSE_WCHAR_T