[wxWidgets.git] / include / wx / stringops.h

///////////////////////////////////////////////////////////////////////////////
// Name:        wx/stringops.h
// Purpose:     implementation of wxString primitive operations
// Author:      Vaclav Slavik
// Modified by:
// Created:     2007-04-16
// RCS-ID:      $Id$
// Copyright:   (c) 2007 REA Elektronik GmbH
// Licence:     wxWindows licence
///////////////////////////////////////////////////////////////////////////////

#ifndef _WX_WXSTRINGOPS_H__
#define _WX_WXSTRINGOPS_H__

#include "wx/chartype.h"
#include "wx/stringimpl.h"
#include "wx/unichar.h"
#include "wx/buffer.h"

// This header contains the wxStringOperations "namespace" class that implements
// elementary operations on string data as static methods; wxString methods and
// iterators are implemented in terms of it. Two implementations are available:
// one for UTF-8 encoded char* strings and one for "raw" wchar_t* strings (or
// char* strings in the ANSI build).

// FIXME-UTF8: only wchar after we remove ANSI build
#if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
struct WXDLLIMPEXP_BASE wxStringOperationsWchar
{
    // In this implementation one step of the underlying wxStringImpl iterator
    // is treated as one character, so every operation below simply forwards
    // to the corresponding iterator operation.

    // advance the iterator to the next character
    static void IncIter(wxStringImpl::iterator& i)
    {
        ++i;
    }
    static void IncIter(wxStringImpl::const_iterator& i)
    {
        ++i;
    }

    // step the iterator back to the previous character
    static void DecIter(wxStringImpl::iterator& i)
    {
        --i;
    }
    static void DecIter(wxStringImpl::const_iterator& i)
    {
        --i;
    }

    // return a copy of the iterator moved by n characters (n may be negative)
    static wxStringImpl::iterator AddToIter(const wxStringImpl::iterator& i, ptrdiff_t n)
    {
        const wxStringImpl::iterator moved = i + n;
        return moved;
    }
    static wxStringImpl::const_iterator AddToIter(const wxStringImpl::const_iterator& i, ptrdiff_t n)
    {
        const wxStringImpl::const_iterator moved = i + n;
        return moved;
    }

    // return the distance i1 - i2 between the two iterators, in characters
    static ptrdiff_t DiffIters(const wxStringImpl::iterator& i1,
                               const wxStringImpl::iterator& i2)
    {
        const ptrdiff_t distance = i1 - i2;
        return distance;
    }
    static ptrdiff_t DiffIters(const wxStringImpl::const_iterator& i1,
                               const wxStringImpl::const_iterator& i2)
    {
        const ptrdiff_t distance = i1 - i2;
        return distance;
    }

    // convert the character to the form in which it is stored internally;
    // here that is a single wxChar (the UTF-8 version of this function
    // returns a string instead)
    static wxChar EncodeChar(const wxUniChar& ch)
    {
        return static_cast<wxChar>(ch);
    }

    // return the character the iterator points to
    static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
    {
        return wxUniChar(*i);
    }
};
#endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE


#if wxUSE_UNICODE_UTF8
struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
{
    // All iterator operations below work in units of whole UTF-8 encoded
    // characters and assume that the string data is valid UTF-8 and that the
    // iterators passed in point to the first (lead) byte of a character.

    // checks correctness of UTF-8 sequence
    static bool IsValidUtf8String(const char *c,
                                  size_t len = wxStringImpl::npos);

    // returns true if 'c' may start a UTF-8 encoded character, i.e. if it is
    // in the 0x00..0x7F or 0xC2..0xF4 range
    static bool IsValidUtf8LeadByte(unsigned char c)
    {
        return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
    }

    // table of offsets to skip forward when iterating over UTF-8 sequence
    // (indexed by the value of the lead byte)
    static const unsigned char ms_utf8IterTable[256];


    // moves the iterator forward to the lead byte of the next character;
    // 'i' must point to a lead byte
    template<typename Iterator>
    static void IncIter(Iterator& i)
    {
        wxASSERT( IsValidUtf8LeadByte(*i) );
        i += ms_utf8IterTable[(unsigned char)*i];
    }

    // moves the iterator back to the lead byte of the previous character
    template<typename Iterator>
    static void DecIter(Iterator& i)
    {
        // NB: 'i' must not be dereferenced before it is moved: this function
        //     is commonly called with the end iterator (e.g. when iterating
        //     backwards from end()), which can't be dereferenced.

        // Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
        // binary), so we just have to go back until we hit a byte that is
        // either < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in
        // binary; this includes some invalid values, but we can ignore it
        // here, because we assume valid UTF-8 input for the purpose of
        // efficient implementation).
        --i;
        while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
            --i;

        // in valid UTF-8 data we must have arrived at the start of a character
        wxASSERT( IsValidUtf8LeadByte(*i) );
    }

    // returns a copy of the iterator moved by n characters, forwards if n is
    // positive and backwards if it is negative; this steps over the string
    // one character at a time
    template<typename Iterator>
    static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
    {
        Iterator out(i);

        if ( n > 0 )
        {
            for ( ptrdiff_t j = 0; j < n; ++j )
                IncIter(out);
        }
        else if ( n < 0 )
        {
            for ( ptrdiff_t j = 0; j > n; --j )
                DecIter(out);
        }

        return out;
    }

    // returns the distance i1 - i2 in characters: negative if i1 precedes i2,
    // positive if it follows it and 0 if the iterators are equal; the
    // characters between the two iterators are counted one by one
    template<typename Iterator>
    static ptrdiff_t DiffIters(Iterator i1, Iterator i2)
    {
        ptrdiff_t dist = 0;

        if ( i1 < i2 )
        {
            while ( i1 != i2 )
            {
                IncIter(i1);
                dist--;
            }
        }
        else if ( i2 < i1 )
        {
            while ( i2 != i1 )
            {
                IncIter(i2);
                dist++;
            }
        }

        return dist;
    }

    // encodes the character as UTF-8:
    typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
    static Utf8CharBuffer EncodeChar(const wxUniChar& ch)
        { return ch.AsUTF8(); }

    // returns n copies of ch encoded in UTF-8 string
    static wxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);

    // returns the length of UTF-8 encoding of the character with lead byte 'c'
    static size_t GetUtf8CharLength(char c)
    {
        wxASSERT( IsValidUtf8LeadByte(c) );
        return ms_utf8IterTable[(unsigned char)c];
    }

    // decodes single UTF-8 character from UTF-8 string
    static wxUniChar DecodeChar(wxStringImpl::const_iterator i)
    {
        // bytes below 0x80 encode themselves, so they are returned directly
        // and only the other ones need the out of line decoding function
        if ( (unsigned char)*i < 0x80 )
            return (int)*i;
        return DecodeNonAsciiChar(i);
    }

private:
    // helper of DecodeChar() called for the characters whose first byte is
    // not less than 0x80
    static wxUniChar DecodeNonAsciiChar(wxStringImpl::const_iterator i);
};
#endif // wxUSE_UNICODE_UTF8


// wxStringOperations is the implementation used by this build: the UTF-8 one
// if wxUSE_UNICODE_UTF8 is set and the wchar_t/ANSI one otherwise
#if !wxUSE_UNICODE_UTF8
typedef wxStringOperationsWchar wxStringOperations;
#else
typedef wxStringOperationsUtf8 wxStringOperations;
#endif

#endif  // _WX_WXSTRINGOPS_H__
Commit	Line	Data
467175ab VS	1	///////////////////////////////////////////////////////////////////////////////
	2	// Name: wx/stringops.h
	3	// Purpose: implementation of wxString primitive operations
	4	// Author: Vaclav Slavik
	5	// Modified by:
	6	// Created: 2007-04-16
	7	// RCS-ID: $Id$
	8	// Copyright: (c) 2007 REA Elektronik GmbH
	9	// Licence: wxWindows licence
	10	///////////////////////////////////////////////////////////////////////////////
	11
	12	#ifndef _WX_WXSTRINGOPS_H__
	13	#define _WX_WXSTRINGOPS_H__
	14
	15	#include "wx/chartype.h"
	16	#include "wx/stringimpl.h"
	17	#include "wx/unichar.h"
04d29fda	18	#include "wx/buffer.h"
467175ab VS	19
	20	// This header contains wxStringOperations "namespace" class that implements
	21	// elementary operations on string data as static methods; wxString methods and
	22	// iterators are implemented in terms of it. Two implementations are available,
	23	// one for UTF-8 encoded char* string and one for "raw" wchar_t* strings (or
	24	// char* in ANSI build).
	25
	26	// FIXME-UTF8: only wchar after we remove ANSI build
	27	#if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
	28	struct WXDLLIMPEXP_BASE wxStringOperationsWchar
	29	{
	30	// moves the iterator to the next Unicode character
	31	static void IncIter(wxStringImpl::iterator& i) { ++i; }
	32	static void IncIter(wxStringImpl::const_iterator& i) { ++i; }
	33
	34	// moves the iterator to the previous Unicode character
	35	static void DecIter(wxStringImpl::iterator& i) { --i; }
	36	static void DecIter(wxStringImpl::const_iterator& i) { --i; }
	37
	38	// moves the iterator by n Unicode characters
b5343e06	39	static wxStringImpl::iterator AddToIter(const wxStringImpl::iterator& i, ptrdiff_t n)
467175ab	40	{ return i + n; }
b5343e06	41	static wxStringImpl::const_iterator AddToIter(const wxStringImpl::const_iterator& i, ptrdiff_t n)
467175ab VS	42	{ return i + n; }
	43
	44	// returns distance of the two iterators in Unicode characters
b5343e06 VZ	45	static ptrdiff_t DiffIters(const wxStringImpl::iterator& i1,
b5343e06 VZ	46	const wxStringImpl::iterator& i2)
467175ab	47	{ return i1 - i2; }
b5343e06 VZ	48	static ptrdiff_t DiffIters(const wxStringImpl::const_iterator& i1,
b5343e06 VZ	49	const wxStringImpl::const_iterator& i2)
467175ab VS	50	{ return i1 - i2; }
	51
	52	// encodes the character to a form used to represent it in internal
	53	// representation (returns a string in UTF8 version)
	54	static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; }
	55
	56	static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
	57	{ return *i; }
	58	};
	59	#endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
	60
	61
	62	#if wxUSE_UNICODE_UTF8
	63	struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
	64	{
	65	// checks correctness of UTF-8 sequence
111d9948 VS	66	static bool IsValidUtf8String(const char *c,
111d9948 VS	67	size_t len = wxStringImpl::npos);
657a8a35 VZ	68	static bool IsValidUtf8LeadByte(unsigned char c)
	69	{
	70	return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
	71	}
467175ab VS	72
467175ab VS	73	// table of offsets to skip forward when iterating over UTF-8 sequence
1774c3c5	74	static const unsigned char ms_utf8IterTable[256];
467175ab VS	75
	76
	77	template<typename Iterator>
	78	static void IncIter(Iterator& i)
	79	{
	80	wxASSERT( IsValidUtf8LeadByte(*i) );
	81	i += ms_utf8IterTable[(unsigned char)*i];
	82	}
	83
	84	template<typename Iterator>
	85	static void DecIter(Iterator& i)
	86	{
	87	wxASSERT( IsValidUtf8LeadByte(*i) );
	88
	89	// Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
	90	// binary), so we just have to go back until we hit a byte that is
	91	// either < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in
	92	// binary; this includes some invalid values, but we can ignore it
	93	// here, because we assume valid UTF-8 input for the purpose of
	94	// efficient implementation).
	95	--i;
	96	while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
	97	--i;
	98	}
	99
	100	template<typename Iterator>
b5343e06	101	static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
467175ab VS	102	{
	103	Iterator out(i);
	104
	105	if ( n > 0 )
	106	{
b5343e06	107	for ( ptrdiff_t j = 0; j < n; ++j )
467175ab VS	108	IncIter(out);
	109	}
	110	else if ( n < 0 )
	111	{
b5343e06	112	for ( ptrdiff_t j = 0; j > n; --j )
467175ab VS	113	DecIter(out);
	114	}
	115
	116	return out;
	117	}
	118
	119	template<typename Iterator>
b5343e06	120	static ptrdiff_t DiffIters(Iterator i1, Iterator i2)
467175ab	121	{
b5343e06	122	ptrdiff_t dist = 0;
467175ab VS	123
	124	if ( i1 < i2 )
	125	{
	126	while ( i1 != i2 )
	127	{
	128	IncIter(i1);
	129	dist--;
	130	}
	131	}
	132	else if ( i2 < i1 )
	133	{
	134	while ( i2 != i1 )
	135	{
	136	IncIter(i2);
	137	dist++;
	138	}
	139	}
	140
	141	return dist;
	142	}
	143
467175ab	144	// encodes the character as UTF-8:
1fc10687 VS	145	typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
	146	static Utf8CharBuffer EncodeChar(const wxUniChar& ch)
	147	{ return ch.AsUTF8(); }
467175ab VS	148
	149	// returns n copies of ch encoded in UTF-8 string
	150	static wxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
	151
	152	// returns the length of UTF-8 encoding of the character with lead byte 'c'
	153	static size_t GetUtf8CharLength(char c)
	154	{
	155	wxASSERT( IsValidUtf8LeadByte(c) );
	156	return ms_utf8IterTable[(unsigned char)c];
	157	}
	158
	159	// decodes single UTF-8 character from UTF-8 string
ac2d749e VS	160	static wxUniChar DecodeChar(wxStringImpl::const_iterator i)
	161	{
	162	if ( (unsigned char)*i < 0x80 )
	163	return (int)*i;
	164	return DecodeNonAsciiChar(i);
	165	}
	166
	167	private:
	168	static wxUniChar DecodeNonAsciiChar(wxStringImpl::const_iterator i);
467175ab VS	169	};
	170	#endif // wxUSE_UNICODE_UTF8
	171
	172
	173	#if wxUSE_UNICODE_UTF8
	174	typedef wxStringOperationsUtf8 wxStringOperations;
	175	#else
	176	typedef wxStringOperationsWchar wxStringOperations;
	177	#endif
	178
	179	#endif // _WX_WXSTRINGOPS_H_