[wxWidgets.git] / src / common / convauto.cpp

///////////////////////////////////////////////////////////////////////////////
// Name:        src/common/convauto.cpp
// Purpose:     implementation of wxConvAuto
// Author:      Vadim Zeitlin
// Created:     2006-04-04
// RCS-ID:      $Id$
// Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
// Licence:     wxWindows licence
///////////////////////////////////////////////////////////////////////////////

// ============================================================================
// declarations
// ============================================================================

// ----------------------------------------------------------------------------
// headers
// ----------------------------------------------------------------------------

// for compilers that support precompilation, includes "wx.h".
#include "wx/wxprec.h"

#ifdef __BORLANDC__
    #pragma hdrstop
#endif

#if wxUSE_WCHAR_T

#ifndef WX_PRECOMP
    #include "wx/wx.h"
#endif //WX_PRECOMP

#include "wx/convauto.h"

// we use latin1 by default as it seems the least bad choice: the files we need
// to detect input of don't always come from the user system (they are often
// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
// seem to be a good idea and there is no other reasonable alternative
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;

// ============================================================================
// implementation
// ============================================================================

/* static */
// Replace the fallback multibyte encoding kept in the static member
// ms_defaultMBEncoding, which is therefore shared by all wxConvAuto objects.
//
// enc: the new fallback encoding. wxFONTENCODING_DEFAULT is rejected by the
//      assertion below, but note that the value is stored regardless of the
//      outcome of that check.
void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
{
    wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
                  wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );

    ms_defaultMBEncoding = enc;
}

/* static */
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{
    // Look for one of the byte order marks listed at
    // http://www.unicode.org/faq/utf_bom.html#BOM:
    //
    //  Bytes           Encoding Form
    //
    //  00 00 FE FF     UTF-32, big-endian
    //  FF FE 00 00     UTF-32, little-endian
    //  FE FF           UTF-16, big-endian
    //  FF FE           UTF-16, little-endian
    //  EF BB BF        UTF-8
    //
    // The result is BOM_Unknown whenever the bytes seen so far are a proper
    // prefix of at least one mark (FF FE, for example, starts both the
    // UTF-16LE and the UTF-32LE ones), BOM_None when no mark can match any
    // more and the specific BOM_XXX value once a mark is fully recognized.

    // an empty buffer tells us nothing at all
    if ( srcLen == 0 )
        return BOM_Unknown;

    if ( srcLen == 1 )
    {
        // a single byte can't be a complete mark, only the start of one
        const char first = src[0];
        const bool mayStartBOM = first == '\x00' || first == '\xFF' ||
                                 first == '\xFE' || first == '\xEF';

        return mayStartBOM ? BOM_Unknown : BOM_None;
    }

    if ( srcLen == 2 || srcLen == 3 )
    {
        const char first = src[0];
        const char second = src[1];
        const bool haveThird = srcLen == 3;

        if ( first == '\xEF' && second == '\xBB' )
        {
            // start of the UTF-8 mark: the third byte, if present, decides
            if ( !haveThird )
                return BOM_Unknown;

            return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
        }

        // FE FF is not a prefix of any other mark, so it is unambiguous
        if ( first == '\xFE' && second == '\xFF' )
            return BOM_UTF16BE;

        if ( first == '\xFF' && second == '\xFE' )
        {
            // a non-zero third byte rules out UTF-32LE and leaves only
            // UTF-16LE, otherwise both of them are still possible
            return haveThird && src[2] != '\x00' ? BOM_UTF16LE : BOM_Unknown;
        }

        if ( first == '\x00' && second == '\x00' )
        {
            // only UTF-32BE can start like this, and only if the third
            // byte (when we have it) is FE
            return haveThird && src[2] != '\xFE' ? BOM_None : BOM_Unknown;
        }

        return BOM_None;
    }

    // Any other length, i.e. 4 or more, is enough to reach a final decision.
    // The bytes are deliberately examined lazily and from left to right, so
    // the conditions below shouldn't be merged or reordered.
    if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
        return BOM_UTF8;

    if ( src[0] == '\x00' && src[1] == '\x00' &&
         src[2] == '\xFE' && src[3] == '\xFF' )
        return BOM_UTF32BE;

    // the UTF-32LE test must precede the UTF-16LE one sharing its prefix
    if ( src[0] == '\xFF' && src[1] == '\xFE' &&
         src[2] == '\x00' && src[3] == '\x00' )
        return BOM_UTF32LE;

    if ( src[0] == '\xFE' && src[1] == '\xFF' )
        return BOM_UTF16BE;

    if ( src[0] == '\xFF' && src[1] == '\xFE' )
        return BOM_UTF16LE;

    return BOM_None;
}

// Select the conversion object matching the given BOM type.
//
// bomType: a definite detection result; BOM_Unknown is not acceptable here
//          and triggers an assertion failure.
//
// On return m_conv is expected to be set and m_consumedBOM tells whether
// there still is a BOM to be skipped in the input.
void wxConvAuto::InitFromBOM(BOMType bomType)
{
    // until proven otherwise, assume there is a BOM left to skip
    m_consumedBOM = false;

    if ( bomType == BOM_UTF32BE )
    {
        m_conv = new wxMBConvUTF32BE;
        m_ownsConv = true;
    }
    else if ( bomType == BOM_UTF32LE )
    {
        m_conv = new wxMBConvUTF32LE;
        m_ownsConv = true;
    }
    else if ( bomType == BOM_UTF16BE )
    {
        m_conv = new wxMBConvUTF16BE;
        m_ownsConv = true;
    }
    else if ( bomType == BOM_UTF16LE )
    {
        m_conv = new wxMBConvUTF16LE;
        m_ownsConv = true;
    }
    else if ( bomType == BOM_UTF8 )
    {
        InitWithUTF8();
    }
    else if ( bomType == BOM_Unknown )
    {
        wxFAIL_MSG( "shouldn't be called for this BOM type" );
    }
    else if ( bomType != BOM_None )
    {
        wxFAIL_MSG( "unknown BOM type" );
    }
    //else: BOM_None needs nothing special, it is handled just below

    if ( !m_conv )
    {
        // Either the input has no BOM or its type was not one of those
        // handled above (which is not supposed to happen, but is not a
        // reason to crash neither): try UTF-8 in both cases.
        InitWithUTF8();

        // no BOM was selected, hence there is none to skip
        m_consumedBOM = true;
    }
}

// Advance the input past the BOM of the type stored in m_bomType.
//
// src: in/out pointer to the start of the input, moved forward by the BOM
//      size in bytes
// len: in/out length of the input, decreased by the same amount unless it
//      is (size_t)-1, in which case it is left alone
//
// Nothing is modified if m_bomType is BOM_Unknown or not a known value, an
// assertion failure is generated instead.
void wxConvAuto::SkipBOM(const char **src, size_t *len) const
{
    size_t bomSize = 0;

    switch ( m_bomType )
    {
        case BOM_Unknown:
            wxFAIL_MSG( "shouldn't be called for this BOM type" );
            return;

        default:
            wxFAIL_MSG( "unknown BOM type" );
            return;

        case BOM_None:
            // nothing to skip, keep the zero size
            break;

        case BOM_UTF8:
            bomSize = 3;
            break;

        case BOM_UTF16BE:
        case BOM_UTF16LE:
            bomSize = 2;
            break;

        case BOM_UTF32BE:
        case BOM_UTF32LE:
            bomSize = 4;
            break;
    }

    *src += bomSize;

    // the special "unknown length" value must be preserved as is
    if ( *len != (size_t)-1 )
        *len -= bomSize;
}

// Detect the BOM at the start of the given input, remember its type in
// m_bomType and, if a decision could be reached, set up the conversion for
// it.
//
// Returns false, without initializing anything else, if the input is too
// short to tell whether it starts with a BOM, and true otherwise.
bool wxConvAuto::InitFromInput(const char *src, size_t len)
{
    m_bomType = DetectBOM(src, len);

    const bool decided = m_bomType != BOM_Unknown;
    if ( decided )
        InitFromBOM(m_bomType);

    return decided;
}

size_t
wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
                    const char *src, size_t srcLen) const
{
    // we check BOM and create the appropriate conversion the first time we're
    // called but we also need to ensure that the BOM is skipped not only
    // during this initial call but also during the first call with non-NULL
    // dst as typically we're first called with NULL dst to calculate the
    // needed buffer size
    wxConvAuto *self = const_cast<wxConvAuto *>(this);


    if ( !m_conv )
    {
        if ( !self->InitFromInput(src, srcLen) )
        {
            // there is not enough data to determine whether we have a BOM or
            // not, so fail for now -- the caller is supposed to call us again
            // with more data
            return wxCONV_FAILED;
        }
    }

    if ( !m_consumedBOM )
    {
        SkipBOM(&src, &srcLen);
        if ( srcLen == 0 )
        {
            // there is nothing left except the BOM so we'd return 0 below but
            // this is unexpected: decoding a non-empty string must either fail
            // or return something non-empty, in particular this would break
            // the code in wxTextInputStream::NextChar()
            //
            // so still return an error as we need some more data to be able to
            // decode it
            return wxCONV_FAILED;
        }
    }

    // try to convert using the auto-detected encoding
    size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
    if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
    {
        // if the conversion failed but we didn't really detect anything and
        // simply tried UTF-8 by default, retry it using the fall-back
        if ( m_encDefault != wxFONTENCODING_MAX )
        {
            if ( m_ownsConv )
                delete m_conv;

            self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
                                            ? GetFallbackEncoding()
                                            : m_encDefault);
            self->m_ownsConv = true;

            rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
        }
    }

    // don't skip the BOM again the next time if we really consumed it
    if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
        self->m_consumedBOM = true;

    return rc;
}

size_t
wxConvAuto::FromWChar(char *dst, size_t dstLen,
                      const wchar_t *src, size_t srcLen) const
{
    if ( !m_conv )
    {
        // default to UTF-8 for the multibyte output
        const_cast<wxConvAuto *>(this)->InitWithUTF8();
    }

    return m_conv->FromWChar(dst, dstLen, src, srcLen);
}

#endif // wxUSE_WCHAR_T
Commit	Line	Data
830f8f11 VZ	1	///////////////////////////////////////////////////////////////////////////////
	2	// Name: src/common/convauto.cpp
	3	// Purpose: implementation of wxConvAuto
	4	// Author: Vadim Zeitlin
	5	// Created: 2006-04-04
	6	// RCS-ID: $Id$
	7	// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
	8	// Licence: wxWindows licence
	9	///////////////////////////////////////////////////////////////////////////////
	10
	11	// ============================================================================
	12	// declarations
	13	// ============================================================================
	14
	15	// ----------------------------------------------------------------------------
	16	// headers
	17	// ----------------------------------------------------------------------------
	18
	19	// for compilers that support precompilation, includes "wx.h".
	20	#include "wx/wxprec.h"
	21
	22	#ifdef __BORLANDC__
	23	#pragma hdrstop
	24	#endif
	25
	26	#if wxUSE_WCHAR_T
	27
	28	#ifndef WX_PRECOMP
4cb0e8d0	29	#include "wx/wx.h"
830f8f11 VZ	30	#endif //WX_PRECOMP
	31
	32	#include "wx/convauto.h"
	33
01a9232b VZ	34	// we use latin1 by default as it seems the least bad choice: the files we need
	35	// to detect input of don't always come from the user system (they are often
	36	// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
	37	// seem to be a good idea and there is no other reasonable alternative
	38	wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
	39
830f8f11 VZ	40	// ============================================================================
	41	// implementation
	42	// ============================================================================
	43
01a9232b VZ	44	/* static */
	45	void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
	46	{
	47	wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
9a83f860	48	wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
01a9232b VZ	49
	50	ms_defaultMBEncoding = enc;
	51	}
	52
830f8f11 VZ	53	/* static */
	54	wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
	55	{
830f8f11 VZ	56	// examine the buffer for BOM presence
830f8f11 VZ	57	//
4cb0e8d0 VZ	58	// quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
	59	//
	60	// Bytes Encoding Form
	61	//
	62	// 00 00 FE FF UTF-32, big-endian
	63	// FF FE 00 00 UTF-32, little-endian
	64	// FE FF UTF-16, big-endian
	65	// FF FE UTF-16, little-endian
	66	// EF BB BF UTF-8
	67	//
	68	// as some BOMs are prefixes of other ones we may need to read more bytes
	69	// to disambiguate them
	70
	71	switch ( srcLen )
830f8f11	72	{
4cb0e8d0 VZ	73	case 0:
	74	return BOM_Unknown;
	75
	76	case 1:
	77	if ( src[0] == '\x00' || src[0] == '\xFF' ||
	78	src[0] == '\xFE' || src[0] == '\xEF')
830f8f11	79	{
4cb0e8d0 VZ	80	// this could be a BOM but we don't know yet
4cb0e8d0 VZ	81	return BOM_Unknown;
830f8f11 VZ	82	}
	83	break;
	84
4cb0e8d0 VZ	85	case 2:
	86	case 3:
	87	if ( src[0] == '\xEF' && src[1] == '\xBB' )
830f8f11	88	{
4cb0e8d0 VZ	89	if ( srcLen == 3 )
	90	return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
	91
	92	return BOM_Unknown;
830f8f11	93	}
830f8f11	94
4cb0e8d0 VZ	95	if ( src[0] == '\xFE' && src[1] == '\xFF' )
	96	return BOM_UTF16BE;
	97
	98	if ( src[0] == '\xFF' && src[1] == '\xFE' )
830f8f11	99	{
4cb0e8d0 VZ	100	// if the next byte is 0, it could be an UTF-32LE BOM but if it
	101	// isn't we can be sure it's UTF-16LE
	102	if ( srcLen == 3 && src[2] != '\x00' )
	103	return BOM_UTF16LE;
	104
	105	return BOM_Unknown;
830f8f11	106	}
830f8f11	107
4cb0e8d0	108	if ( src[0] == '\x00' && src[1] == '\x00' )
830f8f11	109	{
823e82e2 VZ	110	// this could only be UTF-32BE, check that the data we have so
	111	// far allows for it
	112	if ( srcLen == 3 && src[2] != '\xFE' )
	113	return BOM_None;
4cb0e8d0	114
823e82e2 VZ	115	return BOM_Unknown;
823e82e2 VZ	116	}
830f8f11	117	break;
4cb0e8d0 VZ	118
	119	default:
	120	// we have at least 4 characters so we may finally decide whether
	121	// we have a BOM or not
	122	if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
	123	return BOM_UTF8;
	124
	125	if ( src[0] == '\x00' && src[1] == '\x00' &&
	126	src[2] == '\xFE' && src[3] == '\xFF' )
	127	return BOM_UTF32BE;
	128
	129	if ( src[0] == '\xFF' && src[1] == '\xFE' &&
	130	src[2] == '\x00' && src[3] == '\x00' )
	131	return BOM_UTF32LE;
	132
	133	if ( src[0] == '\xFE' && src[1] == '\xFF' )
	134	return BOM_UTF16BE;
	135
	136	if ( src[0] == '\xFF' && src[1] == '\xFE' )
	137	return BOM_UTF16LE;
830f8f11 VZ	138	}
	139
	140	return BOM_None;
	141	}
	142
	143	void wxConvAuto::InitFromBOM(BOMType bomType)
	144	{
	145	m_consumedBOM = false;
	146
	147	switch ( bomType )
	148	{
4cb0e8d0 VZ	149	case BOM_Unknown:
	150	wxFAIL_MSG( "shouldn't be called for this BOM type" );
	151	break;
	152
	153	case BOM_None:
	154	// use the default
	155	break;
	156
830f8f11 VZ	157	case BOM_UTF32BE:
	158	m_conv = new wxMBConvUTF32BE;
	159	m_ownsConv = true;
	160	break;
	161
	162	case BOM_UTF32LE:
	163	m_conv = new wxMBConvUTF32LE;
	164	m_ownsConv = true;
	165	break;
	166
	167	case BOM_UTF16BE:
	168	m_conv = new wxMBConvUTF16BE;
	169	m_ownsConv = true;
	170	break;
	171
	172	case BOM_UTF16LE:
	173	m_conv = new wxMBConvUTF16LE;
	174	m_ownsConv = true;
	175	break;
	176
	177	case BOM_UTF8:
01a9232b	178	InitWithUTF8();
830f8f11 VZ	179	break;
	180
	181	default:
4cb0e8d0 VZ	182	wxFAIL_MSG( "unknown BOM type" );
4cb0e8d0 VZ	183	}
830f8f11	184
4cb0e8d0 VZ	185	if ( !m_conv )
	186	{
	187	// we end up here if there is no BOM or we didn't recognize it somehow
	188	// (this shouldn't happen but still don't crash if it does), so use the
	189	// default encoding
	190	InitWithUTF8();
	191	m_consumedBOM = true; // as there is nothing to consume
830f8f11 VZ	192	}
	193	}
	194
	195	void wxConvAuto::SkipBOM(const char **src, size_t *len) const
	196	{
	197	int ofs;
	198	switch ( m_bomType )
	199	{
4cb0e8d0 VZ	200	case BOM_Unknown:
	201	wxFAIL_MSG( "shouldn't be called for this BOM type" );
	202	return;
	203
	204	case BOM_None:
	205	ofs = 0;
	206	break;
	207
830f8f11 VZ	208	case BOM_UTF32BE:
	209	case BOM_UTF32LE:
	210	ofs = 4;
	211	break;
	212
	213	case BOM_UTF16BE:
	214	case BOM_UTF16LE:
	215	ofs = 2;
	216	break;
	217
	218	case BOM_UTF8:
	219	ofs = 3;
	220	break;
	221
	222	default:
4cb0e8d0 VZ	223	wxFAIL_MSG( "unknown BOM type" );
4cb0e8d0 VZ	224	return;
830f8f11 VZ	225	}
	226
	227	*src += ofs;
	228	if ( *len != (size_t)-1 )
	229	*len -= ofs;
	230	}
	231
4ca97396	232	bool wxConvAuto::InitFromInput(const char *src, size_t len)
830f8f11	233	{
4ca97396	234	m_bomType = DetectBOM(src, len);
4cb0e8d0 VZ	235	if ( m_bomType == BOM_Unknown )
	236	return false;
	237
830f8f11	238	InitFromBOM(m_bomType);
4cb0e8d0 VZ	239
4cb0e8d0 VZ	240	return true;
830f8f11 VZ	241	}
	242
	243	size_t
	244	wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
	245	const char *src, size_t srcLen) const
	246	{
	247	// we check BOM and create the appropriate conversion the first time we're
	248	// called but we also need to ensure that the BOM is skipped not only
	249	// during this initial call but also during the first call with non-NULL
	250	// dst as typically we're first called with NULL dst to calculate the
	251	// needed buffer size
5c33522f	252	wxConvAuto *self = const_cast<wxConvAuto *>(this);
4cb0e8d0 VZ	253
4cb0e8d0 VZ	254
830f8f11 VZ	255	if ( !m_conv )
830f8f11 VZ	256	{
4ca97396	257	if ( !self->InitFromInput(src, srcLen) )
4cb0e8d0 VZ	258	{
	259	// there is not enough data to determine whether we have a BOM or
	260	// not, so fail for now -- the caller is supposed to call us again
	261	// with more data
	262	return wxCONV_FAILED;
	263	}
830f8f11	264	}
4ca97396 VZ	265
4ca97396 VZ	266	if ( !m_consumedBOM )
830f8f11	267	{
830f8f11	268	SkipBOM(&src, &srcLen);
4ca97396 VZ	269	if ( srcLen == 0 )
	270	{
	271	// there is nothing left except the BOM so we'd return 0 below but
	272	// this is unexpected: decoding a non-empty string must either fail
	273	// or return something non-empty, in particular this would break
	274	// the code in wxTextInputStream::NextChar()
	275	//
	276	// so still return an error as we need some more data to be able to
	277	// decode it
	278	return wxCONV_FAILED;
	279	}
830f8f11 VZ	280	}
830f8f11 VZ	281
01a9232b VZ	282	// try to convert using the auto-detected encoding
	283	size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
	284	if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
	285	{
	286	// if the conversion failed but we didn't really detect anything and
	287	// simply tried UTF-8 by default, retry it using the fall-back
	288	if ( m_encDefault != wxFONTENCODING_MAX )
	289	{
	290	if ( m_ownsConv )
	291	delete m_conv;
	292
	293	self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
	294	? GetFallbackEncoding()
	295	: m_encDefault);
	296	self->m_ownsConv = true;
	297
	298	rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
	299	}
	300	}
	301
4ca97396 VZ	302	// don't skip the BOM again the next time if we really consumed it
4ca97396 VZ	303	if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
4cb0e8d0	304	self->m_consumedBOM = true;
4ca97396	305
01a9232b	306	return rc;
830f8f11 VZ	307	}
	308
	309	size_t
	310	wxConvAuto::FromWChar(char *dst, size_t dstLen,
	311	const wchar_t *src, size_t srcLen) const
	312	{
	313	if ( !m_conv )
	314	{
	315	// default to UTF-8 for the multibyte output
5c33522f	316	const_cast<wxConvAuto *>(this)->InitWithUTF8();
830f8f11 VZ	317	}
	318
	319	return m_conv->FromWChar(dst, dstLen, src, srcLen);
	320	}
	321
	322	#endif // wxUSE_WCHAR_T