[wxWidgets.git] / src / common / convauto.cpp

///////////////////////////////////////////////////////////////////////////////
// Name:        src/common/convauto.cpp
// Purpose:     implementation of wxConvAuto
// Author:      Vadim Zeitlin
// Created:     2006-04-04
// RCS-ID:      $Id$
// Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
// Licence:     wxWindows licence
///////////////////////////////////////////////////////////////////////////////

// ============================================================================
// declarations
// ============================================================================

// ----------------------------------------------------------------------------
// headers
// ----------------------------------------------------------------------------

// for compilers that support precompilation, includes "wx.h".
#include "wx/wxprec.h"

#ifdef __BORLANDC__
    #pragma hdrstop
#endif

#if wxUSE_WCHAR_T

#ifndef WX_PRECOMP
#endif //WX_PRECOMP

#include "wx/convauto.h"

// Global fall-back multibyte encoding.
//
// Latin-1 is used by default as it seems to be the least bad choice: the
// files whose encoding we need to detect don't always come from the user's
// own system (they are often received from other machines), so using
// wxFONTENCODING_SYSTEM doesn't seem to be a good idea, and there is no other
// reasonable alternative.
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;

// ============================================================================
// implementation
// ============================================================================

/* static */
void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
{
    // Replace the global fall-back encoding with the given one.
    //
    // ToWChar() interprets wxFONTENCODING_DEFAULT as "use the global
    // fall-back", so storing that value as the fall-back itself is
    // meaningless: assert on it (the value is nevertheless stored below).
    wxASSERT_MSG
    (
        enc != wxFONTENCODING_DEFAULT,
        _T("wxFONTENCODING_DEFAULT doesn't make sense here")
    );

    wxConvAuto::ms_defaultMBEncoding = enc;
}

/* static */
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{
    // Examine the beginning of the buffer and return the type of the BOM
    // (byte order mark) found there, or BOM_None if there is none.
    //
    // src:    start of the input, must not be NULL
    // srcLen: number of bytes available in src, or (size_t)-1 (wxNO_LEN in
    //         the wxMBConv API) if the input is NUL-terminated

    if ( srcLen == (size_t)-1 )
    {
        // The real length is unknown, so we must not look beyond the
        // terminating NUL: treating the sentinel as a (huge) byte count would
        // make all the length checks below pass and read past the end of
        // short strings, e.g. past the terminator of an empty string.
        //
        // The longest BOM is 4 bytes, so there is no need to scan further.
        //
        // Note that UTF-32 BOMs contain NUL bytes themselves and so can't be
        // recognized in input of unknown length; pass the explicit length to
        // have them detected.
        srcLen = 0;
        while ( srcLen < 4 && src[srcLen] != '\0' )
            srcLen++;
    }

    if ( srcLen < 2 )
    {
        // minimal BOM is 2 bytes so bail out immediately and simplify the code
        // below which doesn't need to check the length in the UTF-16 cases
        return BOM_None;
    }

    // examine the buffer for BOM presence
    //
    // see http://www.unicode.org/faq/utf_bom.html#BOM
    switch ( src[0] )
    {
        case '\0':
            // could only be big endian UTF-32 (00 00 FE FF)
            if ( srcLen >= 4 &&
                    src[1] == '\0' &&
                        src[2] == '\xfe' &&
                            src[3] == '\xff' )
            {
                return BOM_UTF32BE;
            }
            break;

        case '\xfe':
            // could only be big endian UTF-16 (FE FF)
            if ( src[1] == '\xff' )
            {
                return BOM_UTF16BE;
            }
            break;

        case '\xff':
            // could be either little endian UTF-16 or UTF-32, both start
            // with FF FE: it is UTF-32 only if 00 00 follows
            if ( src[1] == '\xfe' )
            {
                return srcLen >= 4 && src[2] == '\0' && src[3] == '\0'
                            ? BOM_UTF32LE
                            : BOM_UTF16LE;
            }
            break;

        case '\xef':
            // is this UTF-8 BOM (EF BB BF)?
            if ( srcLen >= 3 && src[1] == '\xbb' && src[2] == '\xbf' )
            {
                return BOM_UTF8;
            }
            break;
    }

    return BOM_None;
}

void wxConvAuto::InitFromBOM(BOMType bomType)
{
    // Set up the conversion object corresponding to the given BOM type.
    //
    // Until proven otherwise, assume that there is a BOM which still has to
    // be skipped in the input.
    m_consumedBOM = false;

    switch ( bomType )
    {
        case BOM_UTF8:
            // UTF-8 with BOM: the BOM still needs to be skipped
            InitWithUTF8();
            return;

        case BOM_UTF16LE:
            m_conv = new wxMBConvUTF16LE;
            break;

        case BOM_UTF16BE:
            m_conv = new wxMBConvUTF16BE;
            break;

        case BOM_UTF32LE:
            m_conv = new wxMBConvUTF32LE;
            break;

        case BOM_UTF32BE:
            m_conv = new wxMBConvUTF32BE;
            break;

        default:
            wxFAIL_MSG( _T("unexpected BOM type") );
            // fall through: a conversion must be set up in any case

        case BOM_None:
            // no BOM at all: try UTF-8 and remember that there is nothing
            // to skip
            InitWithUTF8();
            m_consumedBOM = true;
            return;
    }

    // only the UTF-16/32 cases get here: the conversion was allocated above,
    // so record that we own it
    m_ownsConv = true;
}

void wxConvAuto::SkipBOM(const char **src, size_t *len) const
{
    // Advance *src past the BOM of the type stored in m_bomType and, unless
    // *len is the (size_t)-1 "unknown length" marker, shrink *len to match.

    // size of the BOM in bytes, 0 if there is nothing to skip
    size_t bomSize = 0;

    switch ( m_bomType )
    {
        case BOM_None:
            break;

        case BOM_UTF8:
            bomSize = 3;
            break;

        case BOM_UTF16LE:
        case BOM_UTF16BE:
            bomSize = 2;
            break;

        case BOM_UTF32LE:
        case BOM_UTF32BE:
            bomSize = 4;
            break;

        default:
            // unknown BOM type: complain but skip nothing, exactly as if
            // there were no BOM
            wxFAIL_MSG( _T("unexpected BOM type") );
            break;
    }

    *src += bomSize;

    // the special length value must be left alone
    if ( *len != (size_t)-1 )
        *len -= bomSize;
}

void wxConvAuto::InitFromInput(const char **src, size_t *len)
{
    m_bomType = DetectBOM(*src, *len);
    InitFromBOM(m_bomType);
    SkipBOM(src, len);
}

size_t
wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
                    const char *src, size_t srcLen) const
{
    // we check BOM and create the appropriate conversion the first time we're
    // called but we also need to ensure that the BOM is skipped not only
    // during this initial call but also during the first call with non-NULL
    // dst as typically we're first called with NULL dst to calculate the
    // needed buffer size
    wxConvAuto *self = const_cast<wxConvAuto *>(this);
    if ( !m_conv )
    {
        self->InitFromInput(&src, &srcLen);
        if ( dst )
            self->m_consumedBOM = true;
    }

    if ( !m_consumedBOM && dst )
    {
        self->m_consumedBOM = true;
        SkipBOM(&src, &srcLen);
    }

    // try to convert using the auto-detected encoding
    size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
    if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
    {
        // if the conversion failed but we didn't really detect anything and
        // simply tried UTF-8 by default, retry it using the fall-back
        if ( m_encDefault != wxFONTENCODING_MAX )
        {
            if ( m_ownsConv )
                delete m_conv;

            self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
                                            ? GetFallbackEncoding()
                                            : m_encDefault);
            self->m_ownsConv = true;

            rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
        }
    }

    return rc;
}

size_t
wxConvAuto::FromWChar(char *dst, size_t dstLen,
                      const wchar_t *src, size_t srcLen) const
{
    if ( !m_conv )
    {
        // default to UTF-8 for the multibyte output
        const_cast<wxConvAuto *>(this)->InitWithUTF8();
    }

    return m_conv->FromWChar(dst, dstLen, src, srcLen);
}

#endif // wxUSE_WCHAR_T
Commit	Line	Data
830f8f11 VZ	1	///////////////////////////////////////////////////////////////////////////////
	2	// Name: src/common/convauto.cpp
	3	// Purpose: implementation of wxConvAuto
	4	// Author: Vadim Zeitlin
	5	// Created: 2006-04-04
	6	// RCS-ID: $Id$
	7	// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
	8	// Licence: wxWindows licence
	9	///////////////////////////////////////////////////////////////////////////////
	10
	11	// ============================================================================
	12	// declarations
	13	// ============================================================================
	14
	15	// ----------------------------------------------------------------------------
	16	// headers
	17	// ----------------------------------------------------------------------------
	18
	19	// for compilers that support precompilation, includes "wx.h".
	20	#include "wx/wxprec.h"
	21
	22	#ifdef __BORLANDC__
	23	#pragma hdrstop
	24	#endif
	25
	26	#if wxUSE_WCHAR_T
	27
	28	#ifndef WX_PRECOMP
	29	#endif //WX_PRECOMP
	30
	31	#include "wx/convauto.h"
	32
01a9232b VZ	33	// we use latin1 by default as it seems the least bad choice: the files we need
	34	// to detect input of don't always come from the user system (they are often
	35	// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
	36	// seem to be a good idea and there is no other reasonable alternative
	37	wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
	38
830f8f11 VZ	39	// ============================================================================
	40	// implementation
	41	// ============================================================================
	42
01a9232b VZ	43	/* static */
	44	void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
	45	{
	46	wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
	47	_T("wxFONTENCODING_DEFAULT doesn't make sense here") );
	48
	49	ms_defaultMBEncoding = enc;
	50	}
	51
830f8f11 VZ	52	/* static */
	53	wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
	54	{
	55	if ( srcLen < 2 )
	56	{
	57	// minimal BOM is 2 bytes so bail out immediately and simplify the code
	58	// below which wouldn't need to check for length for UTF-16 cases
	59	return BOM_None;
	60	}
	61
	62	// examine the buffer for BOM presence
	63	//
	64	// see http://www.unicode.org/faq/utf_bom.html#BOM
	65	switch ( *src++ )
	66	{
	67	case '\0':
	68	// could only be big endian UTF-32 (00 00 FE FF)
	69	if ( srcLen >= 4 &&
	70	src[0] == '\0' &&
	71	src[1] == '\xfe' &&
	72	src[2] == '\xff' )
	73	{
	74	return BOM_UTF32BE;
	75	}
	76	break;
	77
	78	case '\xfe':
	79	// could only be big endian UTF-16 (FE FF)
	80	if ( *src++ == '\xff' )
	81	{
	82	return BOM_UTF16BE;
	83	}
	84	break;
	85
	86	case '\xff':
	87	// could be either little endian UTF-16 or UTF-32, both start
	88	// with FF FE
	89	if ( *src++ == '\xfe' )
	90	{
	91	return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
	92	? BOM_UTF32LE
	93	: BOM_UTF16LE;
	94	}
	95	break;
	96
	97	case '\xef':
	98	// is this UTF-8 BOM (EF BB BF)?
	99	if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
	100	{
	101	return BOM_UTF8;
	102	}
	103	break;
	104	}
	105
	106	return BOM_None;
	107	}
	108
	109	void wxConvAuto::InitFromBOM(BOMType bomType)
	110	{
	111	m_consumedBOM = false;
	112
	113	switch ( bomType )
	114	{
	115	case BOM_UTF32BE:
116	m_conv = new wxMBConvUTF32BE;
117	m_ownsConv = true;
118	break;
119
120	case BOM_UTF32LE:
121	m_conv = new wxMBConvUTF32LE;
122	m_ownsConv = true;
123	break;
124
125	case BOM_UTF16BE:
126	m_conv = new wxMBConvUTF16BE;
127	m_ownsConv = true;
128	break;
129
130	case BOM_UTF16LE:
131	m_conv = new wxMBConvUTF16LE;
132	m_ownsConv = true;
133	break;
134
135	case BOM_UTF8:
01a9232b	136	InitWithUTF8();
830f8f11 VZ	137	break;
	138
	139	default:
	140	wxFAIL_MSG( _T("unexpected BOM type") );
	141	// fall through: still need to create something
	142
	143	case BOM_None:
01a9232b	144	InitWithUTF8();
830f8f11 VZ	145	m_consumedBOM = true; // as there is nothing to consume
	146	}
	147	}
	148
	149	void wxConvAuto::SkipBOM(const char *src, size_t len) const
	150	{
	151	int ofs;
	152	switch ( m_bomType )
	153	{
	154	case BOM_UTF32BE:
	155	case BOM_UTF32LE:
	156	ofs = 4;
	157	break;
	158
	159	case BOM_UTF16BE:
	160	case BOM_UTF16LE:
	161	ofs = 2;
	162	break;
	163
	164	case BOM_UTF8:
	165	ofs = 3;
	166	break;
	167
	168	default:
	169	wxFAIL_MSG( _T("unexpected BOM type") );
	170	// fall through: still need to create something
	171
	172	case BOM_None:
	173	ofs = 0;
	174	}
	175
	176	*src += ofs;
	177	if ( *len != (size_t)-1 )
	178	*len -= ofs;
	179	}
	180
	181	void wxConvAuto::InitFromInput(const char *src, size_t len)
	182	{
	183	m_bomType = DetectBOM(src, len);
	184	InitFromBOM(m_bomType);
	185	SkipBOM(src, len);
	186	}
	187
	188	size_t
	189	wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
	190	const char *src, size_t srcLen) const
	191	{
	192	// we check BOM and create the appropriate conversion the first time we're
	193	// called but we also need to ensure that the BOM is skipped not only
	194	// during this initial call but also during the first call with non-NULL
	195	// dst as typically we're first called with NULL dst to calculate the
	196	// needed buffer size
5c33522f	197	wxConvAuto self = const_cast<wxConvAuto >(this);
830f8f11 VZ	198	if ( !m_conv )
	199	{
	200	self->InitFromInput(&src, &srcLen);
	201	if ( dst )
	202	self->m_consumedBOM = true;
	203	}
	204
	205	if ( !m_consumedBOM && dst )
	206	{
	207	self->m_consumedBOM = true;
	208	SkipBOM(&src, &srcLen);
	209	}
	210
01a9232b VZ	211	// try to convert using the auto-detected encoding
	212	size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
	213	if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
	214	{
	215	// if the conversion failed but we didn't really detect anything and
	216	// simply tried UTF-8 by default, retry it using the fall-back
	217	if ( m_encDefault != wxFONTENCODING_MAX )
	218	{
	219	if ( m_ownsConv )
	220	delete m_conv;
	221
	222	self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
	223	? GetFallbackEncoding()
	224	: m_encDefault);
	225	self->m_ownsConv = true;
	226
	227	rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
	228	}
	229	}
	230
	231	return rc;
830f8f11 VZ	232	}
	233
	234	size_t
	235	wxConvAuto::FromWChar(char *dst, size_t dstLen,
	236	const wchar_t *src, size_t srcLen) const
	237	{
	238	if ( !m_conv )
	239	{
	240	// default to UTF-8 for the multibyte output
5c33522f	241	const_cast<wxConvAuto *>(this)->InitWithUTF8();
830f8f11 VZ	242	}
	243
	244	return m_conv->FromWChar(dst, dstLen, src, srcLen);
	245	}
	246
	247	#endif // wxUSE_WCHAR_T
	248