// src/common/convauto.cpp

///////////////////////////////////////////////////////////////////////////////
// Name:        src/common/convauto.cpp
// Purpose:     implementation of wxConvAuto
// Author:      Vadim Zeitlin
// Created:     2006-04-04
// Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
// Licence:     wxWindows licence
///////////////////////////////////////////////////////////////////////////////

// ============================================================================
// declarations
// ============================================================================

// ----------------------------------------------------------------------------
// headers
// ----------------------------------------------------------------------------

// for compilers that support precompilation, includes "wx.h".
#include "wx/wxprec.h"

#ifdef __BORLANDC__
    #pragma hdrstop
#endif

#include "wx/convauto.h"

// Class-wide fallback multibyte encoding; SetFallbackEncoding() overwrites it.
//
// Latin-1 is used by default as it seems the least bad choice: the files whose
// encoding we need to detect don't always come from the user's system (they
// are often received from other machines), so using wxFONTENCODING_SYSTEM
// doesn't seem to be a good idea and there is no other reasonable alternative.
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;

namespace
{

// Raw byte sequences of the byte order marks recognized in this file. None of
// them is NUL-terminated: the array size is the exact length of the BOM.

// UTF-32, big-endian: 00 00 FE FF
const char BOM_UTF32BE[4] =
{
    '\x00', '\x00', '\xFE', '\xFF'
};

// UTF-32, little-endian: FF FE 00 00
const char BOM_UTF32LE[4] =
{
    '\xFF', '\xFE', '\x00', '\x00'
};

// UTF-16, big-endian: FE FF
const char BOM_UTF16BE[2] =
{
    '\xFE', '\xFF'
};

// UTF-16, little-endian: FF FE
const char BOM_UTF16LE[2] =
{
    '\xFF', '\xFE'
};

// UTF-8: EF BB BF
const char BOM_UTF8[3] =
{
    '\xEF', '\xBB', '\xBF'
};

} // anonymous namespace

// ============================================================================
// implementation
// ============================================================================

/* static */
// Replaces the class-wide fallback encoding kept in ms_defaultMBEncoding.
//
// enc is the new fallback encoding. wxFONTENCODING_DEFAULT is rejected by the
// assertion below, but the check does not return early, so the assignment is
// not skipped by it.
void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
{
    // "default" cannot itself be the value that "default" resolves to
    wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
                  wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );

    ms_defaultMBEncoding = enc;
}

/* static */
// Returns a pointer to the bytes making up the given BOM and stores their
// number in *count.
//
// NULL is returned, and *count is left untouched, if count itself is NULL or
// if bom is wxBOM_Unknown, wxBOM_None or not a known wxBOM value at all; the
// last three cases also trigger a debug failure message.
const char* wxConvAuto::GetBOMChars(wxBOM bom, size_t* count)
{
    wxCHECK_MSG( count , NULL, wxS("count pointer must be provided") );

    // stays NULL only if bom doesn't match any of the labels below
    const char* bytes = NULL;

    // no "default" label here on purpose, so that the compiler can still warn
    // about enum elements not handled in the switch
    switch ( bom )
    {
        case wxBOM_UTF8:
            bytes = BOM_UTF8;
            *count = WXSIZEOF(BOM_UTF8);
            break;

        case wxBOM_UTF16LE:
            bytes = BOM_UTF16LE;
            *count = WXSIZEOF(BOM_UTF16LE);
            break;

        case wxBOM_UTF16BE:
            bytes = BOM_UTF16BE;
            *count = WXSIZEOF(BOM_UTF16BE);
            break;

        case wxBOM_UTF32LE:
            bytes = BOM_UTF32LE;
            *count = WXSIZEOF(BOM_UTF32LE);
            break;

        case wxBOM_UTF32BE:
            bytes = BOM_UTF32BE;
            *count = WXSIZEOF(BOM_UTF32BE);
            break;

        case wxBOM_None:
        case wxBOM_Unknown:
            // these values don't correspond to any byte sequence
            wxFAIL_MSG( wxS("Invalid BOM type") );
            return NULL;
    }

    if ( !bytes )
    {
        wxFAIL_MSG( wxS("Unknown BOM type") );
    }

    return bytes;
}

/* static */
// Examines the first srcLen bytes at src and classifies them:
//
//  - one of the wxBOM_UTFxx values if the buffer starts with that BOM,
//  - wxBOM_None if the buffer definitely doesn't start with any BOM,
//  - wxBOM_Unknown if the buffer is empty or too short to decide.
wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{
    // The byte sequences looked for are, quoting from
    // http://www.unicode.org/faq/utf_bom.html#BOM:
    //
    //  Bytes           Encoding Form
    //
    //  00 00 FE FF     UTF-32, big-endian
    //  FF FE 00 00     UTF-32, little-endian
    //  FE FF           UTF-16, big-endian
    //  FF FE           UTF-16, little-endian
    //  EF BB BF        UTF-8
    //
    // The UTF-16LE BOM is a prefix of the UTF-32LE one and any BOM may be
    // truncated by a short buffer, hence the "unknown" answers below.

    if ( srcLen == 0 )
        return wxBOM_Unknown;

    if ( srcLen == 1 )
    {
        // a single byte is inconclusive if some BOM starts with it and rules
        // out all BOMs otherwise
        const char first = src[0];
        const bool mayStartBOM = first == '\x00' || first == '\xFF' ||
                                 first == '\xFE' || first == '\xEF';

        return mayStartBOM ? wxBOM_Unknown : wxBOM_None;
    }

    // from here on at least two bytes are available
    const bool leadEFBB = src[0] == '\xEF' && src[1] == '\xBB';
    const bool leadFEFF = src[0] == '\xFE' && src[1] == '\xFF';
    const bool leadFFFE = src[0] == '\xFF' && src[1] == '\xFE';
    const bool lead0000 = src[0] == '\x00' && src[1] == '\x00';

    if ( srcLen < 4 )
    {
        const bool haveThird = srcLen == 3;

        if ( leadEFBB )
        {
            // the third byte decides whether this is really the UTF-8 BOM
            if ( !haveThird )
                return wxBOM_Unknown;

            return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
        }

        if ( leadFEFF )
            return wxBOM_UTF16BE;

        if ( leadFFFE )
        {
            // a zero third byte (or no third byte yet) leaves UTF-32LE
            // possible, anything else can only be UTF-16LE
            return haveThird && src[2] != '\x00' ? wxBOM_UTF16LE
                                                 : wxBOM_Unknown;
        }

        if ( lead0000 )
        {
            // only UTF-32BE can start like this, so a third byte other than
            // FE excludes all BOMs
            return haveThird && src[2] != '\xFE' ? wxBOM_None
                                                 : wxBOM_Unknown;
        }

        return wxBOM_None;
    }

    // with four or more bytes the answer is always definite; the 4 byte BOMs
    // must be tested before the UTF-16 ones sharing their first two bytes
    if ( leadEFBB && src[2] == '\xBF' )
        return wxBOM_UTF8;

    if ( lead0000 && src[2] == '\xFE' && src[3] == '\xFF' )
        return wxBOM_UTF32BE;

    if ( leadFFFE && src[2] == '\x00' && src[3] == '\x00' )
        return wxBOM_UTF32LE;

    if ( leadFEFF )
        return wxBOM_UTF16BE;

    if ( leadFFFE )
        return wxBOM_UTF16LE;

    return wxBOM_None;
}

// Selects the conversion object matching the given BOM type.
//
// A new converter is allocated, and marked as owned, for the UTF-16 and UTF-32
// BOMs; InitWithUTF8() is used for the UTF-8 BOM and also whenever m_conv is
// still unset after the switch. In that last case m_consumedBOM is set to true,
// in all the other ones it is left as false.
void wxConvAuto::InitFromBOM(wxBOM bomType)
{
    m_consumedBOM = false;

    switch ( bomType )
    {
        case wxBOM_UTF8:
            InitWithUTF8();
            break;

        case wxBOM_UTF16LE:
            m_conv = new wxMBConvUTF16LE;
            m_ownsConv = true;
            break;

        case wxBOM_UTF16BE:
            m_conv = new wxMBConvUTF16BE;
            m_ownsConv = true;
            break;

        case wxBOM_UTF32LE:
            m_conv = new wxMBConvUTF32LE;
            m_ownsConv = true;
            break;

        case wxBOM_UTF32BE:
            m_conv = new wxMBConvUTF32BE;
            m_ownsConv = true;
            break;

        case wxBOM_None:
            // nothing to create here, handled after the switch
            break;

        case wxBOM_Unknown:
            wxFAIL_MSG( "shouldn't be called for this BOM type" );
            break;

        default:
            wxFAIL_MSG( "unknown BOM type" );
    }

    if ( m_conv )
        return;

    // there was no BOM or it wasn't recognized (which shouldn't happen, but
    // don't crash if it does): fall back to InitWithUTF8()
    InitWithUTF8();

    // there is no BOM to skip in this case
    m_consumedBOM = true;
}

// Advances *src past the BOM of the type stored in m_bomType and decreases
// *len accordingly.
//
// src: in/out pointer to the start of the input, moved forward by the BOM
//      length (4 for UTF-32, 2 for UTF-16, 3 for UTF-8, 0 if there is no BOM).
// len: in/out number of input bytes; left unchanged if it is (size_t)-1, i.e.
//      if the length is not given explicitly.
//
// If an explicit length is smaller than the BOM length, only the available
// bytes are skipped and *len becomes 0, so that neither the pointer moves past
// the end of the buffer nor the unsigned length wraps around.
//
// Nothing is modified, and a debug failure message is generated, if m_bomType
// is wxBOM_Unknown or not a known value.
void wxConvAuto::SkipBOM(const char **src, size_t *len) const
{
    size_t bomLen;
    switch ( m_bomType )
    {
        case wxBOM_Unknown:
            wxFAIL_MSG( "shouldn't be called for this BOM type" );
            return;

        case wxBOM_None:
            bomLen = 0;
            break;

        case wxBOM_UTF32BE:
        case wxBOM_UTF32LE:
            bomLen = 4;
            break;

        case wxBOM_UTF16BE:
        case wxBOM_UTF16LE:
            bomLen = 2;
            break;

        case wxBOM_UTF8:
            bomLen = 3;
            break;

        default:
            wxFAIL_MSG( "unknown BOM type" );
            return;
    }

    const bool lenIsExplicit = *len != static_cast<size_t>(-1);

    // a buffer shorter than the BOM can't contain all of it: consume only
    // what is really there instead of underflowing the length
    if ( lenIsExplicit && bomLen > *len )
        bomLen = *len;

    *src += bomLen;
    if ( lenIsExplicit )
        *len -= bomLen;
}

// Detects the BOM at the start of the given input, stores the result in
// m_bomType and, unless the result is wxBOM_Unknown, initializes the
// conversion from it.
//
// If len is wxNO_LEN, the input length is taken to be strlen(src).
//
// Returns false if DetectBOM() answered wxBOM_Unknown, true otherwise.
bool wxConvAuto::InitFromInput(const char *src, size_t len)
{
    const size_t available = len == wxNO_LEN ? strlen(src) : len;

    m_bomType = DetectBOM(src, available);
    if ( m_bomType != wxBOM_Unknown )
    {
        InitFromBOM(m_bomType);
        return true;
    }

    return false;
}

size_t
wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
                    const char *src, size_t srcLen) const
{
    // we check BOM and create the appropriate conversion the first time we're
    // called but we also need to ensure that the BOM is skipped not only
    // during this initial call but also during the first call with non-NULL
    // dst as typically we're first called with NULL dst to calculate the
    // needed buffer size
    wxConvAuto *self = const_cast<wxConvAuto *>(this);


    if ( !m_conv )
    {
        if ( !self->InitFromInput(src, srcLen) )
        {
            // there is not enough data to determine whether we have a BOM or
            // not, so fail for now -- the caller is supposed to call us again
            // with more data
            return wxCONV_FAILED;
        }
    }

    if ( !m_consumedBOM )
    {
        SkipBOM(&src, &srcLen);
        if ( srcLen == 0 )
        {
            // there is nothing left except the BOM so we'd return 0 below but
            // this is unexpected: decoding a non-empty string must either fail
            // or return something non-empty, in particular this would break
            // the code in wxTextInputStream::NextChar()
            //
            // so still return an error as we need some more data to be able to
            // decode it
            return wxCONV_FAILED;
        }
    }

    // try to convert using the auto-detected encoding
    size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
    if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
    {
        // if the conversion failed but we didn't really detect anything and
        // simply tried UTF-8 by default, retry it using the fall-back
        if ( m_encDefault != wxFONTENCODING_MAX )
        {
            if ( m_ownsConv )
                delete m_conv;

            self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
                                            ? GetFallbackEncoding()
                                            : m_encDefault);
            self->m_ownsConv = true;

            rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
        }
    }

    // don't skip the BOM again the next time if we really consumed it
    if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
        self->m_consumedBOM = true;

    return rc;
}

size_t
wxConvAuto::FromWChar(char *dst, size_t dstLen,
                      const wchar_t *src, size_t srcLen) const
{
    if ( !m_conv )
    {
        // default to UTF-8 for the multibyte output
        const_cast<wxConvAuto *>(this)->InitWithUTF8();
    }

    return m_conv->FromWChar(dst, dstLen, src, srcLen);
}
Commit	Line	Data
	1	///////////////////////////////////////////////////////////////////////////////
	2	// Name: src/common/convauto.cpp
	3	// Purpose: implementation of wxConvAuto
	4	// Author: Vadim Zeitlin
	5	// Created: 2006-04-04
	6	// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
	7	// Licence: wxWindows licence
	8	///////////////////////////////////////////////////////////////////////////////
	9
	10	// ============================================================================
	11	// declarations
	12	// ============================================================================
	13
	14	// ----------------------------------------------------------------------------
	15	// headers
	16	// ----------------------------------------------------------------------------
	17
	18	// for compilers that support precompilation, includes "wx.h".
	19	#include "wx/wxprec.h"
	20
	21	#ifdef __BORLANDC__
	22	#pragma hdrstop
	23	#endif
	24
	25	#include "wx/convauto.h"
	26
	27	// we use latin1 by default as it seems the least bad choice: the files we need
	28	// to detect input of don't always come from the user system (they are often
	29	// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
	30	// seem to be a good idea and there is no other reasonable alternative
	31	wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
	32
	33	namespace
	34	{
	35
	36	const char BOM_UTF32BE[] = { '\x00', '\x00', '\xFE', '\xFF' };
	37	const char BOM_UTF32LE[] = { '\xFF', '\xFE', '\x00', '\x00' };
	38	const char BOM_UTF16BE[] = { '\xFE', '\xFF' };
	39	const char BOM_UTF16LE[] = { '\xFF', '\xFE' };
	40	const char BOM_UTF8[] = { '\xEF', '\xBB', '\xBF' };
	41
	42	} // anonymous namespace
	43
	44	// ============================================================================
	45	// implementation
	46	// ============================================================================
	47
	48	/* static */
	49	void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
	50	{
	51	wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
	52	wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
	53
	54	ms_defaultMBEncoding = enc;
	55	}
	56
	57	/* static */
	58	const char* wxConvAuto::GetBOMChars(wxBOM bom, size_t* count)
	59	{
	60	wxCHECK_MSG( count , NULL, wxS("count pointer must be provided") );
	61
	62	switch ( bom )
	63	{
	64	case wxBOM_UTF32BE: *count = WXSIZEOF(BOM_UTF32BE); return BOM_UTF32BE;
	65	case wxBOM_UTF32LE: *count = WXSIZEOF(BOM_UTF32LE); return BOM_UTF32LE;
	66	case wxBOM_UTF16BE: *count = WXSIZEOF(BOM_UTF16BE); return BOM_UTF16BE;
	67	case wxBOM_UTF16LE: *count = WXSIZEOF(BOM_UTF16LE); return BOM_UTF16LE;
	68	case wxBOM_UTF8 : *count = WXSIZEOF(BOM_UTF8 ); return BOM_UTF8;
	69	case wxBOM_Unknown:
	70	case wxBOM_None:
	71	wxFAIL_MSG( wxS("Invalid BOM type") );
	72	return NULL;
	73	}
	74
	75	wxFAIL_MSG( wxS("Unknown BOM type") );
	76	return NULL;
	77	}
	78
	79	/* static */
	80	wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
	81	{
	82	// examine the buffer for BOM presence
	83	//
	84	// quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
	85	//
	86	// Bytes Encoding Form
	87	//
	88	// 00 00 FE FF UTF-32, big-endian
	89	// FF FE 00 00 UTF-32, little-endian
	90	// FE FF UTF-16, big-endian
	91	// FF FE UTF-16, little-endian
	92	// EF BB BF UTF-8
	93	//
	94	// as some BOMs are prefixes of other ones we may need to read more bytes
	95	// to disambiguate them
	96
	97	switch ( srcLen )
	98	{
	99	case 0:
	100	return wxBOM_Unknown;
	101
	102	case 1:
	103	if ( src[0] == '\x00' || src[0] == '\xFF' ||
	104	src[0] == '\xFE' || src[0] == '\xEF')
	105	{
	106	// this could be a BOM but we don't know yet
	107	return wxBOM_Unknown;
	108	}
	109	break;
	110
	111	case 2:
	112	case 3:
	113	if ( src[0] == '\xEF' && src[1] == '\xBB' )
	114	{
	115	if ( srcLen == 3 )
	116	return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
	117
	118	return wxBOM_Unknown;
	119	}
	120
	121	if ( src[0] == '\xFE' && src[1] == '\xFF' )
	122	return wxBOM_UTF16BE;
	123
	124	if ( src[0] == '\xFF' && src[1] == '\xFE' )
	125	{
	126	// if the next byte is 0, it could be an UTF-32LE BOM but if it
	127	// isn't we can be sure it's UTF-16LE
	128	if ( srcLen == 3 && src[2] != '\x00' )
	129	return wxBOM_UTF16LE;
	130
	131	return wxBOM_Unknown;
	132	}
	133
	134	if ( src[0] == '\x00' && src[1] == '\x00' )
	135	{
	136	// this could only be UTF-32BE, check that the data we have so
	137	// far allows for it
	138	if ( srcLen == 3 && src[2] != '\xFE' )
	139	return wxBOM_None;
	140
	141	return wxBOM_Unknown;
	142	}
	143	break;
	144
	145	default:
	146	// we have at least 4 characters so we may finally decide whether
	147	// we have a BOM or not
	148	if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
	149	return wxBOM_UTF8;
	150
	151	if ( src[0] == '\x00' && src[1] == '\x00' &&
	152	src[2] == '\xFE' && src[3] == '\xFF' )
	153	return wxBOM_UTF32BE;
	154
	155	if ( src[0] == '\xFF' && src[1] == '\xFE' &&
	156	src[2] == '\x00' && src[3] == '\x00' )
	157	return wxBOM_UTF32LE;
	158
	159	if ( src[0] == '\xFE' && src[1] == '\xFF' )
	160	return wxBOM_UTF16BE;
	161
	162	if ( src[0] == '\xFF' && src[1] == '\xFE' )
	163	return wxBOM_UTF16LE;
	164	}
	165
	166	return wxBOM_None;
	167	}
	168
	169	void wxConvAuto::InitFromBOM(wxBOM bomType)
	170	{
	171	m_consumedBOM = false;
	172
	173	switch ( bomType )
	174	{
	175	case wxBOM_Unknown:
	176	wxFAIL_MSG( "shouldn't be called for this BOM type" );
	177	break;
	178
	179	case wxBOM_None:
	180	// use the default
	181	break;
	182
	183	case wxBOM_UTF32BE:
	184	m_conv = new wxMBConvUTF32BE;
	185	m_ownsConv = true;
	186	break;
	187
	188	case wxBOM_UTF32LE:
	189	m_conv = new wxMBConvUTF32LE;
	190	m_ownsConv = true;
	191	break;
	192
	193	case wxBOM_UTF16BE:
	194	m_conv = new wxMBConvUTF16BE;
	195	m_ownsConv = true;
	196	break;
	197
	198	case wxBOM_UTF16LE:
	199	m_conv = new wxMBConvUTF16LE;
	200	m_ownsConv = true;
	201	break;
	202
	203	case wxBOM_UTF8:
	204	InitWithUTF8();
	205	break;
	206
	207	default:
	208	wxFAIL_MSG( "unknown BOM type" );
	209	}
	210
	211	if ( !m_conv )
	212	{
	213	// we end up here if there is no BOM or we didn't recognize it somehow
	214	// (this shouldn't happen but still don't crash if it does), so use the
	215	// default encoding
	216	InitWithUTF8();
	217	m_consumedBOM = true; // as there is nothing to consume
	218	}
	219	}
	220
	221	void wxConvAuto::SkipBOM(const char **src, size_t *len) const
	222	{
	223	int ofs;
	224	switch ( m_bomType )
	225	{
	226	case wxBOM_Unknown:
	227	wxFAIL_MSG( "shouldn't be called for this BOM type" );
	228	return;
	229
	230	case wxBOM_None:
	231	ofs = 0;
	232	break;
	233
	234	case wxBOM_UTF32BE:
	235	case wxBOM_UTF32LE:
	236	ofs = 4;
	237	break;
	238
	239	case wxBOM_UTF16BE:
	240	case wxBOM_UTF16LE:
	241	ofs = 2;
	242	break;
	243
	244	case wxBOM_UTF8:
	245	ofs = 3;
	246	break;
	247
	248	default:
	249	wxFAIL_MSG( "unknown BOM type" );
	250	return;
	251	}
	252
	253	*src += ofs;
	254	if ( *len != (size_t)-1 )
	255	*len -= ofs;
	256	}
	257
	258	bool wxConvAuto::InitFromInput(const char *src, size_t len)
	259	{
	260	m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
	261	if ( m_bomType == wxBOM_Unknown )
	262	return false;
	263
	264	InitFromBOM(m_bomType);
	265
	266	return true;
	267	}
	268
	269	size_t
	270	wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
	271	const char *src, size_t srcLen) const
	272	{
	273	// we check BOM and create the appropriate conversion the first time we're
	274	// called but we also need to ensure that the BOM is skipped not only
	275	// during this initial call but also during the first call with non-NULL
	276	// dst as typically we're first called with NULL dst to calculate the
	277	// needed buffer size
	278	wxConvAuto *self = const_cast<wxConvAuto *>(this);
	279
	280
	281	if ( !m_conv )
	282	{
	283	if ( !self->InitFromInput(src, srcLen) )
	284	{
	285	// there is not enough data to determine whether we have a BOM or
	286	// not, so fail for now -- the caller is supposed to call us again
	287	// with more data
	288	return wxCONV_FAILED;
	289	}
	290	}
	291
	292	if ( !m_consumedBOM )
	293	{
	294	SkipBOM(&src, &srcLen);
	295	if ( srcLen == 0 )
	296	{
	297	// there is nothing left except the BOM so we'd return 0 below but
	298	// this is unexpected: decoding a non-empty string must either fail
	299	// or return something non-empty, in particular this would break
	300	// the code in wxTextInputStream::NextChar()
	301	//
	302	// so still return an error as we need some more data to be able to
	303	// decode it
	304	return wxCONV_FAILED;
	305	}
	306	}
	307
	308	// try to convert using the auto-detected encoding
	309	size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
	310	if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
	311	{
	312	// if the conversion failed but we didn't really detect anything and
	313	// simply tried UTF-8 by default, retry it using the fall-back
	314	if ( m_encDefault != wxFONTENCODING_MAX )
	315	{
	316	if ( m_ownsConv )
	317	delete m_conv;
	318
	319	self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
	320	? GetFallbackEncoding()
	321	: m_encDefault);
	322	self->m_ownsConv = true;
	323
	324	rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
	325	}
	326	}
	327
	328	// don't skip the BOM again the next time if we really consumed it
	329	if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
	330	self->m_consumedBOM = true;
	331
	332	return rc;
	333	}
	334
	335	size_t
	336	wxConvAuto::FromWChar(char *dst, size_t dstLen,
	337	const wchar_t *src, size_t srcLen) const
	338	{
	339	if ( !m_conv )
	340	{
	341	// default to UTF-8 for the multibyte output
	342	const_cast<wxConvAuto *>(this)->InitWithUTF8();
	343	}
	344
	345	return m_conv->FromWChar(dst, dstLen, src, srcLen);
	346	}