From 01a9232b5e06fc1e24cbd051661d37a46827379d Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 30 Aug 2007 17:54:28 +0000 Subject: [PATCH] use fallback encoding in wxConvAuto when input is not in UTF-8 git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@48463 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- docs/latex/wx/classes.tex | 1 + docs/latex/wx/convauto.tex | 115 +++++++++++++++++++++++++++++++++++++ include/wx/convauto.h | 44 ++++++++++++-- src/common/convauto.cpp | 44 ++++++++++++-- 4 files changed, 194 insertions(+), 10 deletions(-) create mode 100644 docs/latex/wx/convauto.tex diff --git a/docs/latex/wx/classes.tex b/docs/latex/wx/classes.tex index 527ddd95d1..47b1ec89c8 100644 --- a/docs/latex/wx/classes.tex +++ b/docs/latex/wx/classes.tex @@ -70,6 +70,7 @@ \input cshelp.tex \input control.tex \input ctrlsub.tex +\input convauto.tex \input countstr.tex \input critsect.tex \input crtslock.tex diff --git a/docs/latex/wx/convauto.tex b/docs/latex/wx/convauto.tex new file mode 100644 index 0000000000..a34b75a4f0 --- /dev/null +++ b/docs/latex/wx/convauto.tex @@ -0,0 +1,115 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Name: convauto.tex +%% Purpose: wxConvAuto documentation +%% Author: Vadim Zeitlin +%% Created: 2007-08-26 +%% RCS-ID: $Id:$ +%% Copyright: (c) 2007 Vadim Zeitlin +%% License: wxWindows license +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\section{\class{wxConvAuto}}\label{wxconvauto} + +This class implements a Unicode to/from multibyte converter capable of +automatically recognizing the encoding of the multibyte text on input. The +logic used is very simple: the class uses the BOM (byte order mark) if it's +present and tries to interpret the input as UTF-8 otherwise. 
If this fails, the +input is interpreted as being in the default multibyte encoding which can be +specified in the constructor of a wxConvAuto instance and, in turn, defaults to +the value of \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding} if +not explicitly given. + +For the conversion from Unicode to multibyte, the same encoding as was +previously used for multibyte to Unicode conversion is reused. If there has +been no previous multibyte to Unicode conversion, UTF-8 is used by default. +Notice that once the multibyte encoding is automatically detected, it doesn't +change any more, i.e. it is entirely determined by the first use of the wxConvAuto +object in the multibyte-to-Unicode direction. However, creating a copy of a +wxConvAuto object, either via the usual copy constructor or assignment +operator, or using the \helpref{Clone}{wxmbconvclone} method, resets the +automatically detected encoding so that the new copy will try to detect the +encoding of the input on first use. + +This class is used by default in wxWidgets classes and functions reading text +from files such as \helpref{wxFile}{wxfile}, \helpref{wxFFile}{wxffile}, +\helpref{wxTextFile}{wxtextfile}, \helpref{wxFileConfig}{wxfileconfig} and +various stream classes so the encoding set with its +\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method will +affect how these classes treat input files. In particular, use this method +to change the fall-back multibyte encoding used to interpret the contents of +files which aren't valid UTF-8 or to disallow using a fall-back encoding completely. 
+ +\wxheading{Derived from} + +\helpref{wxMBConv}{mbconv} + +\wxheading{Include files} + +<wx/convauto.h> + +\wxheading{Library} + +\helpref{wxBase}{librarieslist} + +\wxheading{See also} + +\helpref{wxMBConv classes overview}{mbconvclasses} + + +\latexignore{\rtfignore{\wxheading{Members}}} + +\membersection{wxConvAuto::wxConvAuto}\label{wxconvautowxconvauto} + +\func{}{wxConvAuto}{\param{wxFontEncoding }{enc = wxFONTENCODING\_DEFAULT}} + +Constructs a new wxConvAuto instance. The object will try to detect the encoding +of the multibyte text given to its \helpref{ToWChar}{wxmbconvtowchar} method +automatically but if the automatic detection of Unicode encodings fails, the +fall-back encoding \arg{enc} will be used to interpret it as multibyte text. +The default value of this parameter, \texttt{wxFONTENCODING\_DEFAULT}, means +that the global default value which can be set using the +\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method should be +used. As with that method, passing \texttt{wxFONTENCODING\_MAX} inhibits using +this encoding completely so the input multibyte text will always be interpreted +as UTF-8 in the absence of a BOM and the conversion will fail if the input +doesn't form a valid UTF-8 sequence. Another special value is +\texttt{wxFONTENCODING\_SYSTEM} which means to use the encoding currently used +on the user system, i.e. the encoding returned by +\helpref{wxLocale::GetSystemEncoding}{wxlocalegetsystemencoding}. Any other +encoding will be used as is, e.g. passing \texttt{wxFONTENCODING\_ISO8859\_1} +ensures that non-UTF-8 input will be treated as latin1. + + +\membersection{wxConvAuto::DisableFallbackEncoding}\label{wxconvautodisablefallbackencoding} + +\func{static void}{DisableFallbackEncoding}{\void} + +Disable the use of the fall-back encoding: if the input doesn't have a BOM and +is not valid UTF-8, the conversion will fail. 
+ + +\membersection{wxConvAuto::GetFallbackEncoding}\label{wxconvautogetdefaultmbencoding} + +\func{static wxFontEncoding}{GetFallbackEncoding}{\void} + +Returns the encoding used by default by wxConvAuto if no other encoding is +explicitly specified in the constructor. By default, this is +\texttt{wxFONTENCODING\_ISO8859\_1} but it can be changed using the +\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method. + + +\membersection{wxConvAuto::SetFallbackEncoding}\label{wxconvautosetdefaultmbencoding} + +\func{static void}{SetFallbackEncoding}{\param{wxFontEncoding }{enc}} + +Changes the encoding used by default by wxConvAuto if no other encoding is +explicitly specified in the constructor. The default value, which can be retrieved +using \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding}, is +\texttt{wxFONTENCODING\_ISO8859\_1}. + +Special values of \texttt{wxFONTENCODING\_SYSTEM} or +\texttt{wxFONTENCODING\_MAX} can be used for the \arg{enc} parameter to use the +encoding of the current user locale as the fall-back or to not use any encoding for +the fall-back at all, respectively (just as with the similar constructor +parameter). However, the \texttt{wxFONTENCODING\_DEFAULT} value cannot be used here. 
+ diff --git a/include/wx/convauto.h b/include/wx/convauto.h index fcd26532e9..398888a788 100644 --- a/include/wx/convauto.h +++ b/include/wx/convauto.h @@ -12,6 +12,7 @@ #define _WX_CONVAUTO_H_ #include "wx/strconv.h" +#include "wx/fontenc.h" #if wxUSE_WCHAR_T @@ -23,13 +24,39 @@ class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv { public: // default ctor, the real conversion will be created on demand - wxConvAuto() { m_conv = NULL; /* the rest will be initialized later */ } + wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT) + { + m_conv = NULL; // the rest will be initialized later + m_encDefault = enc; + } // copy ctor doesn't initialize anything neither as conversion can only be // deduced on first use - wxConvAuto(const wxConvAuto& WXUNUSED(other)) : wxMBConv() { m_conv = NULL; } + wxConvAuto(const wxConvAuto& other) : wxMBConv() + { + m_conv = NULL; + m_encDefault = other.m_encDefault; + } + + virtual ~wxConvAuto() + { + if ( m_conv && m_ownsConv ) // m_ownsConv is only set once m_conv is created + delete m_conv; + } + + // get/set the fall-back encoding used when the input text doesn't have BOM + // and isn't UTF-8 + // + // special values are wxFONTENCODING_MAX meaning not to use any fall back + // at all (but just fail to convert in this case) and wxFONTENCODING_SYSTEM + // meaning to use the encoding of the system locale + static wxFontEncoding GetFallbackEncoding() { return ms_defaultMBEncoding; } + static void SetFallbackEncoding(wxFontEncoding enc); + static void DisableFallbackEncoding() + { + SetFallbackEncoding(wxFONTENCODING_MAX); + } - virtual ~wxConvAuto() { if ( m_conv && m_ownsConv ) delete m_conv; } // override the base class virtual function(s) to use our m_conv virtual size_t ToWChar(wchar_t *dst, size_t dstLen, @@ -57,8 +84,8 @@ private: // return the BOM type of this buffer static BOMType DetectBOM(const char *src, size_t srcLen); - // initialize m_conv with the conversion to use by default (UTF-8) - void InitWithDefault() + // initialize m_conv with the UTF-8 conversion + void 
InitWithUTF8() { m_conv = &wxConvUTF8; m_ownsConv = false; @@ -76,10 +103,17 @@ private: void SkipBOM(const char **src, size_t *len) const; + // fall-back multibyte encoding to use, may be wxFONTENCODING_SYSTEM or + // wxFONTENCODING_MAX but not wxFONTENCODING_DEFAULT + static wxFontEncoding ms_defaultMBEncoding; + // conversion object which we really use, NULL until the first call to // either ToWChar() or FromWChar() wxMBConv *m_conv; + // the multibyte encoding to use by default if input isn't Unicode + wxFontEncoding m_encDefault; + // our BOM type BOMType m_bomType; diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index d43bb6d1ba..d23fad635e 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -30,10 +30,25 @@ #include "wx/convauto.h" +// we use latin1 by default as it seems the least bad choice: the files we need +// to detect the encoding of don't always come from the user system (they are often +// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't +// seem to be a good idea and there is no other reasonable alternative +wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1; + // ============================================================================ // implementation // ============================================================================ +/* static */ +void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) +{ + wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT, + _T("wxFONTENCODING_DEFAULT doesn't make sense here") ); + + ms_defaultMBEncoding = enc; +} + /* static */ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) { @@ -118,8 +133,7 @@ void wxConvAuto::InitFromBOM(BOMType bomType) break; case BOM_UTF8: - m_conv = &wxConvUTF8; - m_ownsConv = false; + InitWithUTF8(); break; default: @@ -127,7 +141,7 @@ void wxConvAuto::InitFromBOM(BOMType bomType) // fall through: still need to create something case BOM_None: - InitWithDefault(); + InitWithUTF8(); 
m_consumedBOM = true; // as there is nothing to consume } } @@ -194,7 +208,27 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, SkipBOM(&src, &srcLen); } - return m_conv->ToWChar(dst, dstLen, src, srcLen); + // try to convert using the auto-detected encoding + size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); + if ( rc == wxCONV_FAILED && m_bomType == BOM_None ) + { + // no BOM and not valid UTF-8: retry using the fall-back encoding unless + // it was disabled per-object or globally (wxFONTENCODING_MAX) + const wxFontEncoding enc = m_encDefault == wxFONTENCODING_DEFAULT + ? GetFallbackEncoding() + : m_encDefault; + if ( enc != wxFONTENCODING_MAX ) + { + if ( m_ownsConv ) + delete m_conv; + self->m_conv = new wxCSConv(enc); + self->m_ownsConv = true; + + rc = m_conv->ToWChar(dst, dstLen, src, srcLen); + } + } + + return rc; } size_t @@ -204,7 +238,7 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen, if ( !m_conv ) { // default to UTF-8 for the multibyte output - wx_const_cast(wxConvAuto *, this)->InitWithDefault(); + wx_const_cast(wxConvAuto *, this)->InitWithUTF8(); } return m_conv->FromWChar(dst, dstLen, src, srcLen); -- 2.45.2