]>
git.saurik.com Git - wxWidgets.git/blob - src/common/convauto.cpp
   1 /////////////////////////////////////////////////////////////////////////////// 
   2 // Name:        src/common/convauto.cpp 
   3 // Purpose:     implementation of wxConvAuto 
   4 // Author:      Vadim Zeitlin 
   7 // Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org> 
   8 // Licence:     wxWindows licence 
   9 /////////////////////////////////////////////////////////////////////////////// 
  11 // ============================================================================ 
  13 // ============================================================================ 
  15 // ---------------------------------------------------------------------------- 
  17 // ---------------------------------------------------------------------------- 
  19 // for compilers that support precompilation, includes "wx.h". 
  20 #include "wx/wxprec.h" 
  26 #include "wx/convauto.h" 
  28 // we use latin1 by default as it seems the least bad choice: the input we 
  29 // need to examine doesn't always come from the user's system (it is often 
  30 // received from other machines) and so using wxFONTENCODING_SYSTEM doesn't 
  31 // seem to be a good idea and there is no other reasonable alternative 
  // This static is the value overwritten by wxConvAuto::SetFallbackEncoding().
  32 wxFontEncoding 
wxConvAuto::ms_defaultMBEncoding 
= wxFONTENCODING_ISO8859_1
; 
  // Byte sequences of the byte order marks, one array per encoding form.
  // These are the arrays handed out by wxConvAuto::GetBOMChars() for the
  // corresponding wxBOM values; their sizes are taken with WXSIZEOF() there.
  // NOTE(review): the opening of the anonymous namespace closed at the end of
  // this group is not visible in this listing.
  37 const char BOM_UTF32BE
[] = { '\x00', '\x00', '\xFE', '\xFF' }; 
  38 const char BOM_UTF32LE
[] = { '\xFF', '\xFE', '\x00', '\x00' }; 
  39 const char BOM_UTF16BE
[] = { '\xFE', '\xFF'                 }; 
  40 const char BOM_UTF16LE
[] = { '\xFF', '\xFE'                 }; 
  41 const char BOM_UTF8
[]    = { '\xEF', '\xBB', '\xBF'         }; 
  43 } // anonymous namespace 
  45 // ============================================================================ 
  47 // ============================================================================ 
  // Replaces the class-wide default multibyte encoding
  // (ms_defaultMBEncoding) with enc.
  //
  // enc must not be wxFONTENCODING_DEFAULT: the assertion below flags that
  // value as meaningless here.
  50 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc
) 
  52     wxASSERT_MSG( enc 
!= wxFONTENCODING_DEFAULT
, 
  53                   wxT("wxFONTENCODING_DEFAULT doesn't make sense here") ); 
  55     ms_defaultMBEncoding 
= enc
; 
  // Returns a pointer to the bytes of the BOM identified by bom and stores
  // the number of those bytes in *count.
  //
  // bom:   which BOM is wanted; the UTF-32BE/LE, UTF-16BE/LE and UTF-8 values
  //        each map to the matching file-local BOM_* array.
  // count: output parameter, checked for being non-null before use (the
  //        check is given NULL as its second argument, presumably the value
  //        returned when it fails -- confirm against wxCHECK_MSG).
  //
  // NOTE(review): the line numbers embedded in this listing skip values, so
  // the switch statement, the case labels leading to the "Invalid BOM type"
  // failure and the final return are not visible here.
  59 const char* wxConvAuto::GetBOMChars(wxBOM bom
, size_t* count
) 
  61     wxCHECK_MSG( count 
, NULL
, wxS("count pointer must be provided") ); 
  65         case wxBOM_UTF32BE
: *count 
= WXSIZEOF(BOM_UTF32BE
); return BOM_UTF32BE
; 
  66         case wxBOM_UTF32LE
: *count 
= WXSIZEOF(BOM_UTF32LE
); return BOM_UTF32LE
; 
  67         case wxBOM_UTF16BE
: *count 
= WXSIZEOF(BOM_UTF16BE
); return BOM_UTF16BE
; 
  68         case wxBOM_UTF16LE
: *count 
= WXSIZEOF(BOM_UTF16LE
); return BOM_UTF16LE
; 
  69         case wxBOM_UTF8   
: *count 
= WXSIZEOF(BOM_UTF8   
); return BOM_UTF8
; 
  72             wxFAIL_MSG( wxS("Invalid BOM type") ); 
  76     wxFAIL_MSG( wxS("Unknown BOM type") ); 
  // Inspects the beginning of src (srcLen bytes available) and reports which
  // BOM, if any, it starts with.
  //
  // The visible branches return a concrete wxBOM_UTFxx value once the bytes
  // seen identify a BOM unambiguously, wxBOM_None when the third byte rules
  // out UTF-8, and wxBOM_Unknown while the bytes seen so far could still be
  // the beginning of a BOM.
  //
  // NOTE(review): the line numbers embedded in this listing skip values, so
  // the braces, whatever selects between the short-input branches and the
  // "at least 4 characters" branch, and several statements (including the
  // final return) are not visible here; confirm against the complete file.
  81 wxBOM 
wxConvAuto::DetectBOM(const char *src
, size_t srcLen
) 
  83     // examine the buffer for BOM presence 
  85     // quoting from http://www.unicode.org/faq/utf_bom.html#BOM: 
  87     //  Bytes           Encoding Form 
  89     //  00 00 FE FF     UTF-32, big-endian 
  90     //  FF FE 00 00     UTF-32, little-endian 
  91     //  FE FF           UTF-16, big-endian 
  92     //  FF FE           UTF-16, little-endian 
         //  EF BB BF        UTF-8 (this row is absent from the listing but the
         //                  code below tests for exactly these three bytes)
  95     // as some BOMs are prefixes of other ones we may need to read more bytes 
  96     // to disambiguate them 
 101             return wxBOM_Unknown
; 
 104             if ( src
[0] == '\x00' || src
[0] == '\xFF' || 
 105                  src
[0] == '\xFE' || src
[0] == '\xEF') 
 107                 // this could be a BOM but we don't know yet 
 108                 return wxBOM_Unknown
; 
                 // EF BB is the start of the UTF-8 BOM: the third byte decides
 114             if ( src
[0] == '\xEF' && src
[1] == '\xBB' ) 
 117                     return src
[2] == '\xBF' ? wxBOM_UTF8 
: wxBOM_None
; 
 119                 return wxBOM_Unknown
; 
                 // FE FF is not a prefix of any other BOM in the table above
 122             if ( src
[0] == '\xFE' && src
[1] == '\xFF' ) 
 123                 return wxBOM_UTF16BE
; 
 125             if ( src
[0] == '\xFF' && src
[1] == '\xFE' ) 
 127                 // if the next byte is 0, it could be a UTF-32LE BOM but if it 
 128                 // isn't we can be sure it's UTF-16LE 
 129                 if ( srcLen 
== 3 && src
[2] != '\x00' ) 
 130                     return wxBOM_UTF16LE
; 
 132                 return wxBOM_Unknown
; 
 135             if ( src
[0] == '\x00' && src
[1] == '\x00' ) 
 137                 // this could only be UTF-32BE, check that the data we have so 
                     // far allows for it
                     // NOTE(review): the statement controlled by the following
                     // test is not visible in this listing.
 139                 if ( srcLen 
== 3 && src
[2] != '\xFE' ) 
 142                 return wxBOM_Unknown
; 
 147             // we have at least 4 characters so we may finally decide whether 
 148             // we have a BOM or not 
                 // NOTE(review): the statement controlled by the UTF-8 test
                 // below is not visible in this listing.
 149             if ( src
[0] == '\xEF' && src
[1] == '\xBB' && src
[2] == '\xBF' ) 
 152             if ( src
[0] == '\x00' && src
[1] == '\x00' && 
 153                  src
[2] == '\xFE' && src
[3] == '\xFF' ) 
 154                 return wxBOM_UTF32BE
; 
                 // the UTF-32LE test comes before the UTF-16LE one further down
                 // because both BOMs start with FF FE
 156             if ( src
[0] == '\xFF' && src
[1] == '\xFE' && 
 157                  src
[2] == '\x00' && src
[3] == '\x00' ) 
 158                 return wxBOM_UTF32LE
; 
 160             if ( src
[0] == '\xFE' && src
[1] == '\xFF' ) 
 161                 return wxBOM_UTF16BE
; 
 163             if ( src
[0] == '\xFF' && src
[1] == '\xFE' ) 
 164                 return wxBOM_UTF16LE
; 
 // Sets up the conversion object for the given BOM type.
 //
 // m_consumedBOM is first reset to false; the visible assignments then
 // create a wxMBConvUTF32BE, wxMBConvUTF32LE, wxMBConvUTF16BE or
 // wxMBConvUTF16LE converter in m_conv. At the end m_consumedBOM is set to
 // true on the path described as having no BOM to consume.
 //
 // NOTE(review): the line numbers embedded in this listing skip values, so
 // the switch on bomType, its case labels, the UTF-8 handling and the
 // condition guarding the final block are not visible here.
 170 void wxConvAuto::InitFromBOM(wxBOM bomType
) 
 172     m_consumedBOM 
= false; 
 177             wxFAIL_MSG( "shouldn't be called for this BOM type" ); 
 185             m_conv 
= new wxMBConvUTF32BE
; 
 190             m_conv 
= new wxMBConvUTF32LE
; 
 195             m_conv 
= new wxMBConvUTF16BE
; 
 200             m_conv 
= new wxMBConvUTF16LE
; 
 209             wxFAIL_MSG( "unknown BOM type" ); 
 214         // we end up here if there is no BOM or we didn't recognize it somehow 
 215         // (this shouldn't happen but still don't crash if it does), so use the 
             // NOTE(review): the end of the sentence above and the statement it
             // describes are missing from this listing; presumably the UTF-8
             // default also used by FromWChar() -- confirm.
 218         m_consumedBOM 
= true; // as there is nothing to consume 
 // Takes the source pointer and the remaining length by address so that
 // both can be updated; ToWChar() calls this while the BOM has not been
 // consumed yet.
 //
 // NOTE(review): almost all of the body is missing from this listing (the
 // embedded line numbers jump from 222 to 228, 250 and 255). Only the two
 // failure reports for unexpected BOM types and the test that *len is not
 // (size_t)-1 are visible; presumably the function advances *src past the
 // BOM and shortens *len unless the length is that sentinel -- confirm
 // against the complete file.
 222 void wxConvAuto::SkipBOM(const char **src
, size_t *len
) const 
 228             wxFAIL_MSG( "shouldn't be called for this BOM type" ); 
 250             wxFAIL_MSG( "unknown BOM type" ); 
 255     if ( *len 
!= (size_t)-1 ) 
 // Detects the BOM at the start of src, records it in m_bomType and
 // initializes the converter from it with InitFromBOM().
 //
 // src: start of the input; when len is wxNO_LEN its length is computed
 //      with strlen(), so it has to be NUL-terminated in that case.
 // len: number of bytes available in src, or wxNO_LEN.
 //
 // NOTE(review): the statement executed when the BOM type is still
 // wxBOM_Unknown and the final return are not visible in this listing;
 // ToWChar() treats a false result as "not enough data yet".
 259 bool wxConvAuto::InitFromInput(const char *src
, size_t len
) 
 261     m_bomType 
= DetectBOM(src
, len 
== wxNO_LEN 
? strlen(src
) : len
); 
 262     if ( m_bomType 
== wxBOM_Unknown 
) 
 265     InitFromBOM(m_bomType
); 
 // Converts multibyte input to wide characters using the converter chosen
 // from the BOM found in the input.
 //
 // Visible steps: initialize from the input (failing with wxCONV_FAILED if
 // that is not possible yet), skip the BOM while it has not been consumed,
 // convert with m_conv, and -- if that fails and no BOM was detected --
 // retry with a wxCSConv built from m_encDefault or from
 // GetFallbackEncoding(). m_consumedBOM is only set once a conversion with
 // a non-null dst has succeeded.
 //
 // The method is const but updates members through a const_cast'ed self.
 //
 // NOTE(review): the line numbers embedded in this listing skip values, so
 // the return type, the braces, some conditions (e.g. the one guarding the
 // InitFromInput() call) and the final return are not visible here.
 271 wxConvAuto::ToWChar(wchar_t *dst
, size_t dstLen
, 
 272                     const char *src
, size_t srcLen
) const 
 274     // we check BOM and create the appropriate conversion the first time we're 
 275     // called but we also need to ensure that the BOM is skipped not only 
 276     // during this initial call but also during the first call with non-NULL 
 277     // dst as typically we're first called with NULL dst to calculate the 
 278     // needed buffer size 
 279     wxConvAuto 
*self 
= const_cast<wxConvAuto 
*>(this); 
 284         if ( !self
->InitFromInput(src
, srcLen
) ) 
 286             // there is not enough data to determine whether we have a BOM or 
 287             // not, so fail for now -- the caller is supposed to call us again 
                 // (presumably with more data; the end of this comment is
                 // missing from this listing)
 289             return wxCONV_FAILED
; 
 293     if ( !m_consumedBOM 
) 
 295         SkipBOM(&src
, &srcLen
); 
 298             // there is nothing left except the BOM so we'd return 0 below but 
 299             // this is unexpected: decoding a non-empty string must either fail 
 300             // or return something non-empty, in particular this would break 
 301             // the code in wxTextInputStream::NextChar() 
 303             // so still return an error as we need some more data to be able to 
                 // decode anything
                 // NOTE(review): the condition guarding this return is not
                 // visible in this listing.
 305             return wxCONV_FAILED
; 
 309     // try to convert using the auto-detected encoding 
 310     size_t rc 
= m_conv
->ToWChar(dst
, dstLen
, src
, srcLen
); 
 311     if ( rc 
== wxCONV_FAILED 
&& m_bomType 
== wxBOM_None 
) 
 313         // if the conversion failed but we didn't really detect anything and 
 314         // simply tried UTF-8 by default, retry it using the fall-back 
             // (wxFONTENCODING_MAX in m_encDefault skips the retry below)
 315         if ( m_encDefault 
!= wxFONTENCODING_MAX 
) 
 320             self
->m_conv 
= new wxCSConv(m_encDefault 
== wxFONTENCODING_DEFAULT
 
 321                                             ? GetFallbackEncoding() 
                 // NOTE(review): the other branch of the conditional above and
                 // any cleanup of the previous m_conv are not visible here.
 323             self
->m_ownsConv 
= true; 
 325             rc 
= m_conv
->ToWChar(dst
, dstLen
, src
, srcLen
); 
 329     // don't skip the BOM again the next time if we really consumed it 
 330     if ( rc 
!= wxCONV_FAILED 
&& dst 
&& !m_consumedBOM 
) 
 331         self
->m_consumedBOM 
= true; 
 // Converts wide characters to multibyte output by forwarding all four
 // arguments to m_conv->FromWChar() and returning its result.
 //
 // The method is const, so the UTF-8 initialization below goes through a
 // const_cast of this.
 //
 // NOTE(review): the return type and the condition guarding the
 // InitWithUTF8() call are not visible in this listing; presumably the call
 // is made only when no converter has been set up yet -- confirm.
 337 wxConvAuto::FromWChar(char *dst
, size_t dstLen
, 
 338                       const wchar_t *src
, size_t srcLen
) const 
 342         // default to UTF-8 for the multibyte output 
 343         const_cast<wxConvAuto 
*>(this)->InitWithUTF8(); 
 346     return m_conv
->FromWChar(dst
, dstLen
, src
, srcLen
);