src/common/convauto.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/convauto.cpp
   3 // Purpose:     implementation of wxConvAuto
   4 // Author:      Vadim Zeitlin
   5 // Created:     2006-04-04
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
   8 // Licence:     wxWindows licence
   9 ///////////////////////////////////////////////////////////////////////////////
  10
  11 // ============================================================================
  12 // declarations
  13 // ============================================================================
  14
  15 // ----------------------------------------------------------------------------
  16 // headers
  17 // ----------------------------------------------------------------------------
  18
  19 // for compilers that support precompilation, includes "wx.h".
  20 #include "wx/wxprec.h"
  21
  22 #ifdef __BORLANDC__
  23     #pragma hdrstop
  24 #endif
  25
  26 #if wxUSE_WCHAR_T
  27
  28 #ifndef WX_PRECOMP
  29 #endif //WX_PRECOMP
  30
  31 #include "wx/convauto.h"
  32
  33 // ============================================================================
  34 // implementation
  35 // ============================================================================
  36
  37 /* static */
  38 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
  39 {
  40     if ( srcLen < 2 )
  41     {
  42         // minimal BOM is 2 bytes so bail out immediately and simplify the code
  43         // below which wouldn't need to check for length for UTF-16 cases
  44         return BOM_None;
  45     }
  46
  47     // examine the buffer for BOM presence
  48     //
  49     // see http://www.unicode.org/faq/utf_bom.html#BOM
  50     switch ( *src++ )
  51     {
  52         case '\0':
  53             // could only be big endian UTF-32 (00 00 FE FF)
  54             if ( srcLen >= 4 &&
  55                     src[0] == '\0' &&
  56                         src[1] == '\xfe' &&
  57                             src[2] == '\xff' )
  58             {
  59                 return BOM_UTF32BE;
  60             }
  61             break;
  62
  63         case '\xfe':
  64             // could only be big endian UTF-16 (FE FF)
  65             if ( *src++ == '\xff' )
  66             {
  67                 return BOM_UTF16BE;
  68             }
  69             break;
  70
  71         case '\xff':
  72             // could be either little endian UTF-16 or UTF-32, both start
  73             // with FF FE
  74             if ( *src++ == '\xfe' )
  75             {
  76                 return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
  77                             ? BOM_UTF32LE
  78                             : BOM_UTF16LE;
  79             }
  80             break;
  81
  82         case '\xef':
  83             // is this UTF-8 BOM (EF BB BF)?
  84             if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
  85             {
  86                 return BOM_UTF8;
  87             }
  88             break;
  89     }
  90
  91     return BOM_None;
  92 }
  93
  94 void wxConvAuto::InitFromBOM(BOMType bomType)
  95 {
  96     m_consumedBOM = false;
  97
  98     switch ( bomType )
  99     {
 100         case BOM_UTF32BE:
 101             m_conv = new wxMBConvUTF32BE;
 102             m_ownsConv = true;
 103             break;
 104
 105         case BOM_UTF32LE:
 106             m_conv = new wxMBConvUTF32LE;
 107             m_ownsConv = true;
 108             break;
 109
 110         case BOM_UTF16BE:
 111             m_conv = new wxMBConvUTF16BE;
 112             m_ownsConv = true;
 113             break;
 114
 115         case BOM_UTF16LE:
 116             m_conv = new wxMBConvUTF16LE;
 117             m_ownsConv = true;
 118             break;
 119
 120         case BOM_UTF8:
 121             m_conv = &wxConvUTF8;
 122             m_ownsConv = false;
 123             break;
 124
 125         default:
 126             wxFAIL_MSG( _T("unexpected BOM type") );
 127             // fall through: still need to create something
 128
 129         case BOM_None:
 130             InitWithDefault();
 131             m_consumedBOM = true; // as there is nothing to consume
 132     }
 133 }
 134
 135 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
 136 {
 137     int ofs;
 138     switch ( m_bomType )
 139     {
 140         case BOM_UTF32BE:
 141         case BOM_UTF32LE:
 142             ofs = 4;
 143             break;
 144
 145         case BOM_UTF16BE:
 146         case BOM_UTF16LE:
 147             ofs = 2;
 148             break;
 149
 150         case BOM_UTF8:
 151             ofs = 3;
 152             break;
 153
 154         default:
 155             wxFAIL_MSG( _T("unexpected BOM type") );
 156             // fall through: still need to create something
 157
 158         case BOM_None:
 159             ofs = 0;
 160     }
 161
 162     *src += ofs;
 163     if ( *len != (size_t)-1 )
 164         *len -= ofs;
 165 }
 166
 167 void wxConvAuto::InitFromInput(const char **src, size_t *len)
 168 {
 169     m_bomType = DetectBOM(*src, *len);
 170     InitFromBOM(m_bomType);
 171     SkipBOM(src, len);
 172 }
 173
 174 size_t
 175 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
 176                     const char *src, size_t srcLen) const
 177 {
 178     // we check BOM and create the appropriate conversion the first time we're
 179     // called but we also need to ensure that the BOM is skipped not only
 180     // during this initial call but also during the first call with non-NULL
 181     // dst as typically we're first called with NULL dst to calculate the
 182     // needed buffer size
 183     wxConvAuto *self = wx_const_cast(wxConvAuto *, this);
 184     if ( !m_conv )
 185     {
 186         self->InitFromInput(&src, &srcLen);
 187         if ( dst )
 188             self->m_consumedBOM = true;
 189     }
 190
 191     if ( !m_consumedBOM && dst )
 192     {
 193         self->m_consumedBOM = true;
 194         SkipBOM(&src, &srcLen);
 195     }
 196
 197     return m_conv->ToWChar(dst, dstLen, src, srcLen);
 198 }
 199
 200 size_t
 201 wxConvAuto::FromWChar(char *dst, size_t dstLen,
 202                       const wchar_t *src, size_t srcLen) const
 203 {
 204     if ( !m_conv )
 205     {
 206         // default to UTF-8 for the multibyte output
 207         wx_const_cast(wxConvAuto *, this)->InitWithDefault();
 208     }
 209
 210     return m_conv->FromWChar(dst, dstLen, src, srcLen);
 211 }
 212
 213 #endif // wxUSE_WCHAR_T
 214