--- /dev/null
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Name: convauto.tex
+%% Purpose: wxConvAuto documentation
+%% Author: Vadim Zeitlin
+%% Created: 2007-08-26
+%% RCS-ID: $Id:$
+%% Copyright: (c) 2007 Vadim Zeitlin <vadim@wxwidgets.org>
+%% License: wxWindows license
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\section{\class{wxConvAuto}}\label{wxconvauto}
+
+This class implements a Unicode to/from multibyte converter capable of
+automatically recognizing the encoding of the multibyte text on input. The
+logic used is very simple: the class uses the BOM (byte order mark) if it's
+present and tries to interpret the input as UTF-8 otherwise. If this fails, the
+input is interpreted as being in the default multibyte encoding which can be
+specified in the constructor of a wxConvAuto instance and, in turn, defaults to
+the value of \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding} if
+not explicitly given.
+
+For the conversion from Unicode to multibyte, the same encoding as was
+previously used for multibyte to Unicode conversion is reused. If there had
+been no previous multibyte to Unicode conversion, UTF-8 is used by default.
+Notice that once the multibyte encoding is automatically detected, it doesn't
+change any more, i.e. it is entirely determined by the first use of a wxConvAuto
+object in the multibyte-to-Unicode direction. However, creating a copy of a
+wxConvAuto object, either via the usual copy constructor or assignment
+operator, or using the \helpref{Clone}{wxmbconvclone} method, resets the
+automatically detected encoding so that the new copy will try to detect the
+encoding of the input on first use.
+
+This class is used by default in wxWidgets classes and functions reading text
+from files such as \helpref{wxFile}{wxfile}, \helpref{wxFFile}{wxffile},
+\helpref{wxTextFile}{wxtextfile}, \helpref{wxFileConfig}{wxfileconfig} and
+various stream classes so the encoding set with its
+\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method will
+affect how these classes treat input files. In particular, use this method
+to change the fall-back multibyte encoding used to interpret the contents of
+the files whose contents aren't valid UTF-8 or to disable the fall-back completely.
+
+\wxheading{Derived from}
+
+\helpref{wxMBConv}{mbconv}
+
+\wxheading{Include files}
+
+<wx/convauto.h>
+
+\wxheading{Library}
+
+\helpref{wxBase}{librarieslist}
+
+\wxheading{See also}
+
+\helpref{wxMBConv classes overview}{mbconvclasses}
+
+
+\latexignore{\rtfignore{\wxheading{Members}}}
+
+\membersection{wxConvAuto::wxConvAuto}\label{wxconvautowxconvauto}
+
+\func{}{wxConvAuto}{\param{wxFontEncoding }{enc = wxFONTENCODING\_DEFAULT}}
+
+Constructs a new wxConvAuto instance. The object will try to detect the encoding
+of the multibyte text given to its \helpref{ToWChar}{wxmbconvtowchar} method
+automatically but if the automatic detection of Unicode encodings fails, the
+fall-back encoding \arg{enc} will be used to interpret it as multibyte text.
+The default value of this parameter, \texttt{wxFONTENCODING\_DEFAULT} means
+that the global default value which can be set using
+\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method should be
+used. As with that method, passing \texttt{wxFONTENCODING\_MAX} inhibits using
+this encoding completely so the input multibyte text will always be interpreted
+as UTF-8 in the absence of a BOM and the conversion will fail if the input
+doesn't form a valid UTF-8 sequence. Another special value is
+\texttt{wxFONTENCODING\_SYSTEM} which means to use the encoding currently used
+on the user system, i.e. the encoding returned by
+\helpref{wxLocale::GetSystemEncoding}{wxlocalegetsystemencoding}. Any other
+encoding will be used as is, e.g. passing \texttt{wxFONTENCODING\_ISO8859\_1}
+ensures that non-UTF-8 input will be treated as latin1.
+
+
+\membersection{wxConvAuto::DisableFallbackEncoding}\label{wxconvautodisablefallbackencoding}
+
+\func{static void}{DisableFallbackEncoding}{\void}
+
+Disable the use of the fall back encoding: if the input doesn't have a BOM and
+is not valid UTF-8, the conversion will fail.
+
+
+\membersection{wxConvAuto::GetFallbackEncoding}\label{wxconvautogetdefaultmbencoding}
+
+\func{static wxFontEncoding}{GetFallbackEncoding}{\void}
+
+Returns the encoding used by default by wxConvAuto if no other encoding is
+explicitly specified in constructor. By default, returns
+\texttt{wxFONTENCODING\_ISO8859\_1} but can be changed using
+\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method.
+
+
+\membersection{wxConvAuto::SetFallbackEncoding}\label{wxconvautosetdefaultmbencoding}
+
+\func{static void}{SetFallbackEncoding}{\param{wxFontEncoding }{enc}}
+
+Changes the encoding used by default by wxConvAuto if no other encoding is
+explicitly specified in constructor. The default value, which can be retrieved
+using \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding}, is
+\texttt{wxFONTENCODING\_ISO8859\_1}.
+
+Special values of \texttt{wxFONTENCODING\_SYSTEM} or
+\texttt{wxFONTENCODING\_MAX} can be used for \arg{enc} parameter to use the
+encoding of the current user locale as fall back or not use any encoding for
+fall back at all, respectively (just as with the similar constructor
+parameter). However \texttt{wxFONTENCODING\_DEFAULT} value cannot be used here.
+
#define _WX_CONVAUTO_H_
#include "wx/strconv.h"
+#include "wx/fontenc.h"
#if wxUSE_WCHAR_T
{
public:
// default ctor, the real conversion will be created on demand
- wxConvAuto() { m_conv = NULL; /* the rest will be initialized later */ }
+    // enc is the fall-back encoding remembered for input which has no BOM and
+    // is not valid UTF-8; wxFONTENCODING_DEFAULT selects the global fall-back
+    wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT)
+    {
+        m_conv = NULL; // the rest will be initialized later
+
+        // the dtor tests this flag, so it must have a defined value even if
+        // the object is destroyed without ever having been used
+        m_ownsConv = false;
+
+        m_encDefault = enc;
+    }
// copy ctor doesn't initialize anything neither as conversion can only be
// deduced on first use
- wxConvAuto(const wxConvAuto& WXUNUSED(other)) : wxMBConv() { m_conv = NULL; }
+    // only the fall-back encoding is copied: the detected conversion is not,
+    // so the copy starts without any conversion object of its own
+    wxConvAuto(const wxConvAuto& other) : wxMBConv()
+    {
+        m_conv = NULL;
+
+        // the dtor tests this flag, so it must have a defined value even if
+        // the copy is destroyed without ever having been used
+        m_ownsConv = false;
+
+        m_encDefault = other.m_encDefault;
+    }
+
+    // free the conversion object, but only if we created it ourselves
+    virtual ~wxConvAuto()
+    {
+        // m_ownsConv is only set together with m_conv, so test the pointer
+        // first: this avoids reading an unset flag if the object was never
+        // used for any conversion
+        if ( m_conv && m_ownsConv )
+            delete m_conv;
+    }
+
+ // get/set the fall-back encoding used when the input text doesn't have BOM
+ // and isn't UTF-8
+ //
+ // special values are wxFONTENCODING_MAX meaning not to use any fall back
+ // at all (but just fail to convert in this case) and wxFONTENCODING_SYSTEM
+ // meaning to use the encoding of the system locale
+    // return the global fall-back encoding currently stored in the class
+    static wxFontEncoding GetFallbackEncoding()
+    {
+        return ms_defaultMBEncoding;
+    }
+ static void SetFallbackEncoding(wxFontEncoding enc);
+    // shorthand for SetFallbackEncoding(wxFONTENCODING_MAX)
+    static void DisableFallbackEncoding() { SetFallbackEncoding(wxFONTENCODING_MAX); }
- virtual ~wxConvAuto() { if ( m_conv && m_ownsConv ) delete m_conv; }
// override the base class virtual function(s) to use our m_conv
virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
// return the BOM type of this buffer
static BOMType DetectBOM(const char *src, size_t srcLen);
- // initialize m_conv with the conversion to use by default (UTF-8)
- void InitWithDefault()
+ // initialize m_conv with the UTF-8 conversion
+ void InitWithUTF8()
{
m_conv = &wxConvUTF8;
m_ownsConv = false;
void SkipBOM(const char **src, size_t *len) const;
+ // fall-back multibyte encoding to use, may be wxFONTENCODING_SYSTEM or
+ // wxFONTENCODING_MAX but not wxFONTENCODING_DEFAULT
+ static wxFontEncoding ms_defaultMBEncoding;
+
// conversion object which we really use, NULL until the first call to
// either ToWChar() or FromWChar()
wxMBConv *m_conv;
+ // the multibyte encoding to use by default if input isn't Unicode
+ wxFontEncoding m_encDefault;
+
// our BOM type
BOMType m_bomType;
#include "wx/convauto.h"
+// we use latin1 by default as it seems the least bad choice: the files whose
+// encoding we need to detect don't always come from the user's system (they are
+// often received from other machines), so using wxFONTENCODING_SYSTEM doesn't
+// seem to be a good idea and there is no other reasonable alternative
+wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
+
// ============================================================================
// implementation
// ============================================================================
+/* static */
+void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
+{
+    // wxFONTENCODING_DEFAULT is what makes a wxConvAuto object use the global
+    // fall-back, so it can't be the global fall-back itself: reject it and
+    // keep the previous value instead of merely asserting, as an assert alone
+    // would let the invalid value through in builds without assertions
+    wxCHECK_RET( enc != wxFONTENCODING_DEFAULT,
+                 _T("wxFONTENCODING_DEFAULT doesn't make sense here") );
+
+    ms_defaultMBEncoding = enc;
+}
+
/* static */
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{
break;
case BOM_UTF8:
- m_conv = &wxConvUTF8;
- m_ownsConv = false;
+ InitWithUTF8();
break;
default:
// fall through: still need to create something
case BOM_None:
- InitWithDefault();
+ InitWithUTF8();
m_consumedBOM = true; // as there is nothing to consume
}
}
SkipBOM(&src, &srcLen);
}
- return m_conv->ToWChar(dst, dstLen, src, srcLen);
+ // try to convert using the auto-detected encoding
+ size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
+ if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
+ {
+ // if the conversion failed but we didn't really detect anything and
+ // simply tried UTF-8 by default, retry it using the fall-back
+ if ( m_encDefault != wxFONTENCODING_MAX )
+ {
+ if ( m_ownsConv )
+ delete m_conv;
+
+ self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
+ ? GetFallbackEncoding()
+ : m_encDefault);
+ self->m_ownsConv = true;
+
+ rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
+ }
+ }
+
+ return rc;
}
size_t
if ( !m_conv )
{
// default to UTF-8 for the multibyte output
- wx_const_cast(wxConvAuto *, this)->InitWithDefault();
+ wx_const_cast(wxConvAuto *, this)->InitWithUTF8();
}
return m_conv->FromWChar(dst, dstLen, src, srcLen);