src/common/convauto.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/convauto.cpp
   3 // Purpose:     implementation of wxConvAuto
   4 // Author:      Vadim Zeitlin
   5 // Created:     2006-04-04
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
   8 // Licence:     wxWindows licence
   9 ///////////////////////////////////////////////////////////////////////////////
  10
  11 // ============================================================================
  12 // declarations
  13 // ============================================================================
  14
  15 // ----------------------------------------------------------------------------
  16 // headers
  17 // ----------------------------------------------------------------------------
  18
  19 // for compilers that support precompilation, includes "wx.h".
  20 #include "wx/wxprec.h"
  21
  22 #ifdef __BORLANDC__
  23     #pragma hdrstop
  24 #endif
  25
  26 #if wxUSE_WCHAR_T
  27
  28 #ifndef WX_PRECOMP
  29     #include "wx/wx.h"
  30 #endif //WX_PRECOMP
  31
#include "wx/convauto.h"

#include <string.h>
  33
// we use latin1 by default as it seems to be the least bad choice: the files
// whose encoding we need to detect don't always come from the user's system
// (they are often received from other machines), so using
// wxFONTENCODING_SYSTEM doesn't seem to be a good idea and there is no other
// reasonable alternative
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
  39
  40 // ============================================================================
  41 // implementation
  42 // ============================================================================
  43
// Change the global default multibyte encoding stored in ms_defaultMBEncoding.
//
// Passing wxFONTENCODING_DEFAULT triggers an assertion failure: ToWChar() uses
// wxFONTENCODING_DEFAULT as the marker meaning "use the global fall-back
// encoding", so it presumably can't be used as the fall-back itself. Notice
// that the value is still stored even if the assertion fails.
/* static */
void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
{
    wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
                  wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );

    ms_defaultMBEncoding = enc;
}
  52
  53 /* static */
  54 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
  55 {
  56     // examine the buffer for BOM presence
  57     //
  58     // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
  59     //
  60     //  Bytes           Encoding Form
  61     //
  62     //  00 00 FE FF     UTF-32, big-endian
  63     //  FF FE 00 00     UTF-32, little-endian
  64     //  FE FF           UTF-16, big-endian
  65     //  FF FE           UTF-16, little-endian
  66     //  EF BB BF        UTF-8
  67     //
  68     // as some BOMs are prefixes of other ones we may need to read more bytes
  69     // to disambiguate them
  70
  71     switch ( srcLen )
  72     {
  73         case 0:
  74             return BOM_Unknown;
  75
  76         case 1:
  77             if ( src[0] == '\x00' || src[0] == '\xFF' ||
  78                  src[0] == '\xFE' || src[0] == '\xEF')
  79             {
  80                 // this could be a BOM but we don't know yet
  81                 return BOM_Unknown;
  82             }
  83             break;
  84
  85         case 2:
  86         case 3:
  87             if ( src[0] == '\xEF' && src[1] == '\xBB' )
  88             {
  89                 if ( srcLen == 3 )
  90                     return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
  91
  92                 return BOM_Unknown;
  93             }
  94
  95             if ( src[0] == '\xFE' && src[1] == '\xFF' )
  96                 return BOM_UTF16BE;
  97
  98             if ( src[0] == '\xFF' && src[1] == '\xFE' )
  99             {
 100                 // if the next byte is 0, it could be an UTF-32LE BOM but if it
 101                 // isn't we can be sure it's UTF-16LE
 102                 if ( srcLen == 3 && src[2] != '\x00' )
 103                     return BOM_UTF16LE;
 104
 105                 return BOM_Unknown;
 106             }
 107
 108             if ( src[0] == '\x00' && src[1] == '\x00' )
 109             {
 110                 // this could only be UTF-32BE, check that the data we have so
 111                 // far allows for it
 112                 if ( srcLen == 3 && src[2] != '\xFE' )
 113                     return BOM_None;
 114
 115                 return BOM_Unknown;
 116             }
 117             break;
 118
 119         default:
 120             // we have at least 4 characters so we may finally decide whether
 121             // we have a BOM or not
 122             if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
 123                 return BOM_UTF8;
 124
 125             if ( src[0] == '\x00' && src[1] == '\x00' &&
 126                  src[2] == '\xFE' && src[3] == '\xFF' )
 127                 return BOM_UTF32BE;
 128
 129             if ( src[0] == '\xFF' && src[1] == '\xFE' &&
 130                  src[2] == '\x00' && src[3] == '\x00' )
 131                 return BOM_UTF32LE;
 132
 133             if ( src[0] == '\xFE' && src[1] == '\xFF' )
 134                 return BOM_UTF16BE;
 135
 136             if ( src[0] == '\xFF' && src[1] == '\xFE' )
 137                 return BOM_UTF16LE;
 138     }
 139
 140     return BOM_None;
 141 }
 142
 143 void wxConvAuto::InitFromBOM(BOMType bomType)
 144 {
 145     m_consumedBOM = false;
 146
 147     switch ( bomType )
 148     {
 149         case BOM_Unknown:
 150             wxFAIL_MSG( "shouldn't be called for this BOM type" );
 151             break;
 152
 153         case BOM_None:
 154             // use the default
 155             break;
 156
 157         case BOM_UTF32BE:
 158             m_conv = new wxMBConvUTF32BE;
 159             m_ownsConv = true;
 160             break;
 161
 162         case BOM_UTF32LE:
 163             m_conv = new wxMBConvUTF32LE;
 164             m_ownsConv = true;
 165             break;
 166
 167         case BOM_UTF16BE:
 168             m_conv = new wxMBConvUTF16BE;
 169             m_ownsConv = true;
 170             break;
 171
 172         case BOM_UTF16LE:
 173             m_conv = new wxMBConvUTF16LE;
 174             m_ownsConv = true;
 175             break;
 176
 177         case BOM_UTF8:
 178             InitWithUTF8();
 179             break;
 180
 181         default:
 182             wxFAIL_MSG( "unknown BOM type" );
 183     }
 184
 185     if ( !m_conv )
 186     {
 187         // we end up here if there is no BOM or we didn't recognize it somehow
 188         // (this shouldn't happen but still don't crash if it does), so use the
 189         // default encoding
 190         InitWithUTF8();
 191         m_consumedBOM = true; // as there is nothing to consume
 192     }
 193 }
 194
 195 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
 196 {
 197     int ofs;
 198     switch ( m_bomType )
 199     {
 200         case BOM_Unknown:
 201             wxFAIL_MSG( "shouldn't be called for this BOM type" );
 202             return;
 203
 204         case BOM_None:
 205             ofs = 0;
 206             break;
 207
 208         case BOM_UTF32BE:
 209         case BOM_UTF32LE:
 210             ofs = 4;
 211             break;
 212
 213         case BOM_UTF16BE:
 214         case BOM_UTF16LE:
 215             ofs = 2;
 216             break;
 217
 218         case BOM_UTF8:
 219             ofs = 3;
 220             break;
 221
 222         default:
 223             wxFAIL_MSG( "unknown BOM type" );
 224             return;
 225     }
 226
 227     *src += ofs;
 228     if ( *len != (size_t)-1 )
 229         *len -= ofs;
 230 }
 231
 232 bool wxConvAuto::InitFromInput(const char *src, size_t len)
 233 {
 234     m_bomType = DetectBOM(src, len);
 235     if ( m_bomType == BOM_Unknown )
 236         return false;
 237
 238     InitFromBOM(m_bomType);
 239
 240     return true;
 241 }
 242
 243 size_t
 244 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
 245                     const char *src, size_t srcLen) const
 246 {
 247     // we check BOM and create the appropriate conversion the first time we're
 248     // called but we also need to ensure that the BOM is skipped not only
 249     // during this initial call but also during the first call with non-NULL
 250     // dst as typically we're first called with NULL dst to calculate the
 251     // needed buffer size
 252     wxConvAuto *self = const_cast<wxConvAuto *>(this);
 253
 254
 255     if ( !m_conv )
 256     {
 257         if ( !self->InitFromInput(src, srcLen) )
 258         {
 259             // there is not enough data to determine whether we have a BOM or
 260             // not, so fail for now -- the caller is supposed to call us again
 261             // with more data
 262             return wxCONV_FAILED;
 263         }
 264     }
 265
 266     if ( !m_consumedBOM )
 267     {
 268         SkipBOM(&src, &srcLen);
 269         if ( srcLen == 0 )
 270         {
 271             // there is nothing left except the BOM so we'd return 0 below but
 272             // this is unexpected: decoding a non-empty string must either fail
 273             // or return something non-empty, in particular this would break
 274             // the code in wxTextInputStream::NextChar()
 275             //
 276             // so still return an error as we need some more data to be able to
 277             // decode it
 278             return wxCONV_FAILED;
 279         }
 280     }
 281
 282     // try to convert using the auto-detected encoding
 283     size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
 284     if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
 285     {
 286         // if the conversion failed but we didn't really detect anything and
 287         // simply tried UTF-8 by default, retry it using the fall-back
 288         if ( m_encDefault != wxFONTENCODING_MAX )
 289         {
 290             if ( m_ownsConv )
 291                 delete m_conv;
 292
 293             self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
 294                                             ? GetFallbackEncoding()
 295                                             : m_encDefault);
 296             self->m_ownsConv = true;
 297
 298             rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
 299         }
 300     }
 301
 302     // don't skip the BOM again the next time if we really consumed it
 303     if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
 304         self->m_consumedBOM = true;
 305
 306     return rc;
 307 }
 308
 309 size_t
 310 wxConvAuto::FromWChar(char *dst, size_t dstLen,
 311                       const wchar_t *src, size_t srcLen) const
 312 {
 313     if ( !m_conv )
 314     {
 315         // default to UTF-8 for the multibyte output
 316         const_cast<wxConvAuto *>(this)->InitWithUTF8();
 317     }
 318
 319     return m_conv->FromWChar(dst, dstLen, src, srcLen);
 320 }
 321
 322 #endif // wxUSE_WCHAR_T