src/common/ustring.cpp

/////////////////////////////////////////////////////////////////////////////
// Name:        src/common/ustring.cpp
// Purpose:     wxUString class
// Author:      Robert Roebling
// Created:     2008-07-25
// RCS-ID:      $Id:$
// Copyright:   (c) 2008 Robert Roebling
// Licence:     wxWindows licence
///////////////////////////////////////////////////////////////////////////////

// For compilers that support precompilation, includes "wx.h".
#include "wx/wxprec.h"

#ifdef __BORLANDC__
    #pragma hdrstop
#endif

#include "wx/ustring.h"

#ifndef WX_PRECOMP
    #include "wx/crt.h"
    #include "wx/log.h"
#endif

// Replaces the contents of this string with the characters of the
// NUL-terminated string str. Every char is simply widened to a wxChar32, no
// character set conversion is performed.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromAscii( const char *str )
{
   const size_type count = wxStrlen( str );

   wxU32CharBuffer widened( count );
   wxChar32 *dst = widened.data();

   // element-wise widening copy of exactly count characters
   for ( size_type pos = 0; pos != count; ++pos )
       dst[pos] = str[pos];

   return assign( widened );
}

// Replaces the contents of this string with at most n characters of str,
// stopping earlier if a NUL character is found. Every char is simply widened
// to a wxChar32, no character set conversion is performed.
//
// A NULL str results in an empty string.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromAscii( const char *str, size_type n )
{
   if (!str)
       return assign( wxUString() );

   // number of characters to copy: limited by both n and the terminator
   size_type len = 0;
   while (len < n && str[len])
       len++;

   wxU32CharBuffer buffer( len );
   wxChar32 *ptr = buffer.data();

   for (size_type i = 0; i < len; i++)
       ptr[i] = str[i];

   // the converted text must actually be stored in this string: returning
   // *this without assigning would silently discard the conversion
   return assign( buffer );
}

// ----------------------------------------------------------------------------
// UTF-8
// ----------------------------------------------------------------------------

// NOTE(review): judging by its name and values, entry i appears to be the
// largest value that fits in an (i + 1)-byte sequence of the original 6-byte
// UTF-8 scheme, followed by a 0xffffffff sentinel -- confirm. Nothing in the
// visible part of this file references this table.
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// This table gives the length, in bytes, of a UTF-8 sequence from its first
// (lead) byte: it is indexed by the value of that byte. An entry of 0 marks a
// byte which can't start a sequence; the decoders in this file treat such a
// byte as invalid input.
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid as a first byte (80..BF have the 10xxxxxx bit
    // pattern of the trailing bytes shown in the RFC 3629 diagram below in
    // this file):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};

// Replaces the contents of this string with the code points decoded from the
// NUL-terminated UTF-8 string str.
//
// If str is NULL or contains an invalid lead byte or a sequence whose
// trailing bytes are missing or malformed, the string becomes empty.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromUTF8( const char *str )
{
    if (!str)
        return assign( wxUString() );

    // First pass: validate the input and count the code points.
    //
    // The trailing bytes have to be checked here and not only while
    // decoding: blindly skipping "len" bytes after a lead byte would step
    // over the terminating NUL of an input ending in a truncated sequence
    // and read past the end of the string.
    size_type ucs4_len = 0;
    const char *p = str;
    while (*p)
    {
        const unsigned char lead = *p;
        const size_type len = tableUtf8Lengths[lead];
        if (!len)
            return assign( wxUString() );  // don't try to convert invalid UTF-8

        // a NUL byte fails this test too, so p[i + 1] is only ever read if
        // p[i] was not the terminator
        for ( size_type i = 1; i < len; i++ )
        {
            const unsigned char trail = p[i];
            if ( (trail & 0xC0) != 0x80 )
                return assign( wxUString() );  // don't try to convert invalid UTF-8
        }

        ucs4_len++;
        p += len;
    }

    wxU32CharBuffer buffer( ucs4_len );
    wxChar32 *out = buffer.data();

    //   Char. number range   |        UTF-8 octet sequence
    //      (hexadecimal)     |              (binary)
    //  ----------------------+----------------------------------------
    //  0000 0000 - 0000 007F | 0xxxxxxx
    //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
    //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
    //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    //
    //  Code point value is stored in bits marked with 'x',
    //  lowest-order bit of the value on the right side in the diagram
    //  above.                                         (from RFC 3629)

    // mask to extract lead byte's value ('x' bits above), indexed by the
    // number of trailing bytes:
    static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

    // Second pass: decode. Everything was validated above, so no further
    // checks are needed here.
    p = str;
    while (*p)
    {
        unsigned char c = *p;

        // number of trailing bytes, never negative as len == 0 is caught above
        size_type len = tableUtf8Lengths[c] - 1;

        wxChar32 code = c & leadValueMask[len];

        // all remaining bytes, if any, are handled in the same way
        // regardless of sequence's length:
        for ( ; len; --len )
        {
            c = *++p;

            code <<= 6;
            code |= c & 0x3F;
        }

        *out = code;
        out++;
        p++;
    }

    return assign( buffer.data() );
}

// Replaces the contents of this string with the code points decoded from at
// most the first n bytes of the UTF-8 string str, stopping earlier if a NUL
// byte is found. A sequence which would extend beyond the first n bytes is
// not converted and ends the conversion.
//
// No byte at index n or above is ever read. If str is NULL or the converted
// range contains an invalid lead byte or malformed trailing bytes, the
// string becomes empty.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
{
    if (!str)
        return assign( wxUString() );

    // First pass: validate the input and count the code points.
    //
    // The limit is tested before dereferencing p so that str doesn't need to
    // be NUL-terminated when it holds exactly n bytes, and so that bytes
    // outside of the requested range can't make the conversion fail.
    size_type ucs4_len = 0;
    size_type utf8_pos = 0;
    const char *p = str;
    while (utf8_pos < n && *p)
    {
        const unsigned char lead = *p;
        const size_type len = tableUtf8Lengths[lead];
        if (!len)
            return assign( wxUString() );  // don't try to convert invalid UTF-8
        if (len > n - utf8_pos)
            break;  // sequence cut off by the limit

        // the trailing bytes must be validated before skipping them,
        // otherwise a truncated sequence would make us step over a
        // terminating NUL
        for ( size_type i = 1; i < len; i++ )
        {
            const unsigned char trail = p[i];
            if ( (trail & 0xC0) != 0x80 )
                return assign( wxUString() );  // don't try to convert invalid UTF-8
        }

        utf8_pos += len;
        ucs4_len++;
        p += len;
    }

    wxU32CharBuffer buffer( ucs4_len );
    wxChar32 *out = buffer.data();

    //   Char. number range   |        UTF-8 octet sequence
    //      (hexadecimal)     |              (binary)
    //  ----------------------+----------------------------------------
    //  0000 0000 - 0000 007F | 0xxxxxxx
    //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
    //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
    //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    //
    //  Code point value is stored in bits marked with 'x',
    //  lowest-order bit of the value on the right side in the diagram
    //  above.                                         (from RFC 3629)

    // mask to extract lead byte's value ('x' bits above), indexed by the
    // number of trailing bytes:
    static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

    // Second pass: decode exactly the sequences counted (and validated)
    // above.
    p = str;
    for ( size_type count = 0; count < ucs4_len; count++ )
    {
        unsigned char c = *p;

        // number of trailing bytes, never negative as len == 0 is caught above
        size_type len = tableUtf8Lengths[c] - 1;

        wxChar32 code = c & leadValueMask[len];

        // all remaining bytes, if any, are handled in the same way
        // regardless of sequence's length:
        for ( ; len; --len )
        {
            c = *++p;

            code <<= 6;
            code |= c & 0x3F;
        }

        *out = code;
        out++;
        p++;
    }

    // terminate the converted text: assign() below is given a bare pointer
    *out = 0;

    return assign( buffer.data() );
}

// Replaces the contents of this string with the code points decoded from at
// most the first n code units of the UTF-16 string str, stopping earlier if
// a NUL code unit is found. A surrogate pair which would extend beyond the
// first n code units is not converted and ends the conversion.
//
// No code unit at index n or above is ever read. If str is NULL or the
// converted range contains an unpaired surrogate, the string becomes empty.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
{
    if (!str)
        return assign( wxUString() );

    // First pass: validate the input and count the code points. The limit is
    // tested before dereferencing p so that str doesn't need to be
    // NUL-terminated when it holds exactly n code units.
    size_type ucs4_len = 0;
    size_type utf16_pos = 0;
    const wxChar16 *p = str;
    while (utf16_pos < n && *p)
    {
        size_type len;
        if ((*p < 0xd800) || (*p > 0xdfff))
        {
            len = 1;
        }
        else if (*p > 0xdbff)
        {
            // a low surrogate can't start a pair: decoding it as if it were
            // a high one would yield values above 0x10FFFF
            return assign( wxUString() );  // don't try to convert invalid UTF-16
        }
        else if (n - utf16_pos < 2)
        {
            // pair cut off by the limit: p[1] is out of range and must not
            // be examined
            break;
        }
        else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
        {
            return assign( wxUString() );  // don't try to convert invalid UTF-16
        }
        else
        {
            len = 2;
        }

        ucs4_len++;
        p += len;
        utf16_pos += len;
    }

    wxU32CharBuffer buffer( ucs4_len );
    wxChar32 *out = buffer.data();

    // Second pass: decode exactly the characters counted (and validated)
    // above.
    p = str;
    for ( size_type count = 0; count < ucs4_len; count++ )
    {
        if ((*p < 0xd800) || (*p > 0xdfff))
        {
            *out = *p;
            p++;
        }
        else
        {
            // 0xd7c0 is 0xd800 - (0x10000 >> 10): subtracting it both strips
            // the high surrogate marker and adds the 0x10000 offset
            *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
            p += 2;
        }
        out++;
    }

    return assign( buffer.data() );
}

// Replaces the contents of this string with the code points decoded from the
// NUL-terminated UTF-16 string str.
//
// If str is NULL or contains an unpaired surrogate, the string becomes
// empty.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromUTF16( const wxChar16* str )
{
    if (!str)
        return assign( wxUString() );

    // First pass: validate the input and count the code points.
    size_type ucs4_len = 0;
    const wxChar16 *p = str;
    while (*p)
    {
        size_type len;
        if ((*p < 0xd800) || (*p > 0xdfff))
        {
            len = 1;
        }
        else if (*p > 0xdbff)
        {
            // a low surrogate can't start a pair: decoding it as if it were
            // a high one would yield values above 0x10FFFF
            return assign( wxUString() );  // don't try to convert invalid UTF-16
        }
        else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
        {
            // this also catches a high surrogate directly followed by the
            // terminating NUL, so the terminator is never skipped
            return assign( wxUString() );  // don't try to convert invalid UTF-16
        }
        else
        {
            len = 2;
        }

        ucs4_len++;
        p += len;
    }

    wxU32CharBuffer buffer( ucs4_len );
    wxChar32 *out = buffer.data();

    // Second pass: decode, the input is known to be valid by now.
    p = str;
    while (*p)
    {
        if ((*p < 0xd800) || (*p > 0xdfff))
        {
            *out = *p;
            p++;
        }
        else
        {
            // 0xd7c0 is 0xd800 - (0x10000 >> 10): subtracting it both strips
            // the high surrogate marker and adds the 0x10000 offset
            *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
            p += 2;
        }
        out++;
    }

    return assign( buffer.data() );
}

// Replaces the contents of this string with the result of converting the
// multibyte string str to wide characters using wxConvLibc. A NULL str
// results in an empty string.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromCString( const char* str )
{
    if (str)
    {
        wxWCharBuffer wide = wxConvLibc.cMB2WC( str );
        return assign( wide );
    }

    // nothing to convert
    return assign( wxUString() );
}

// Replaces the contents of this string with the result of converting the
// multibyte string str to wide characters using the converter conv. A NULL
// str results in an empty string.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
{
    if (str)
    {
        wxWCharBuffer wide = conv.cMB2WC( str );
        return assign( wide );
    }

    // nothing to convert
    return assign( wxUString() );
}

// Returns the contents of this string encoded as UTF-8.
//
// Values above 0x10FFFF can't be represented and are silently skipped.
wxCharBuffer wxUString::utf8_str() const
{
    // First pass: compute the size of the result. The range tests here must
    // be kept in sync with the ones of the encoding loop below.
    size_type utf8_length = 0;
    const wxChar32 *ptr = data();

    while (*ptr)
    {
        wxChar32 code = *ptr;
        ptr++;

        if ( code <= 0x7F )
        {
            utf8_length++;
        }
        else if ( code <= 0x07FF )
        {
            utf8_length += 2;
        }
        else if ( code <= 0xFFFF )
        {
            // the upper bound is inclusive: U+FFFF still fits in 3 bytes and
            // using 4 bytes for it would be an overlong encoding
            utf8_length += 3;
        }
        else if ( code <= 0x10FFFF )
        {
            utf8_length += 4;
        }
        else
        {
            // invalid range, skip
        }
    }

    wxCharBuffer result( utf8_length );

    char *out = result.data();

    // Second pass: encode. Multi-byte sequences are written from their last
    // byte to the first one, shifting 6 bits out of the code at each step.
    ptr = data();
    while (*ptr)
    {
        wxChar32 code = *ptr;
        ptr++;

        if ( code <= 0x7F )
        {
            out[0] = (char)code;
            out++;
        }
        else if ( code <= 0x07FF )
        {
            out[1] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[0] = (char)(0xC0 | code);
            out += 2;
        }
        else if ( code <= 0xFFFF )
        {
            out[2] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[1] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[0] = (char)(0xE0 | code);
            out += 3;
        }
        else if ( code <= 0x10FFFF )
        {
            out[3] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[2] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[1] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[0] = (char)(0xF0 | code);
            out += 4;
        }
        else
        {
            // invalid range, skip
        }
    }

    return result;
}

// Returns the contents of this string encoded as UTF-16.
//
// Values below 0x10000 are copied as a single code unit, the other ones are
// encoded as a surrogate pair. Values above 0x10FFFF can't be represented
// and are silently skipped, as utf8_str() does.
wxU16CharBuffer wxUString::utf16_str() const
{
    // First pass: compute the size of the result. The range tests here must
    // be kept in sync with the ones of the encoding loop below.
    size_type utf16_length = 0;
    const wxChar32 *ptr = data();

    while (*ptr)
    {
        wxChar32 code = *ptr;
        ptr++;

        if (code < 0x10000)
        {
           utf16_length++;
        }
        else if (code <= 0x10FFFF)
        {
           utf16_length += 2;
        }
        else
        {
           // invalid range, skip: the surrogate arithmetic below would
           // produce code units outside of the surrogate ranges
        }
    }

    wxU16CharBuffer result( utf16_length );
    wxChar16 *out = result.data();

    // Second pass: encode.
    ptr = data();

    while (*ptr)
    {
        wxChar32 code = *ptr;
        ptr++;

        if (code < 0x10000)
        {
           out[0] = code;
           out++;
        }
        else if (code <= 0x10FFFF)
        {
           // the 20 bits of (code - 0x10000) are split in two halves of 10
           // bits: the upper one goes in the high surrogate, the lower one
           // in the low surrogate
           out[0] = (code - 0x10000) / 0x400 + 0xd800;
           out[1] = (code - 0x10000) % 0x400 + 0xdc00;
           out += 2;
        }
        else
        {
           // invalid range, skip
        }
    }

    return result;
}


// Definitions of the static NullData member for the wxChar16 and wxChar32
// specializations of wxCharTypeBuffer, each of them initialized with NULL.
// Each definition is compiled only if wchar_t doesn't have the corresponding
// size (2 or 4 bytes respectively).
// NOTE(review): presumably, when the sizes do match, the character type is
// wchar_t itself and NullData for it is defined elsewhere -- confirm.
#if SIZEOF_WCHAR_T != 2
template<>
wxCharTypeBuffer<wxChar16>::Data
wxCharTypeBuffer<wxChar16>::NullData(NULL);
#endif

#if SIZEOF_WCHAR_T != 4
template<>
wxCharTypeBuffer<wxChar32>::Data
wxCharTypeBuffer<wxChar32>::NullData(NULL);
#endif
Commit	Line	Data
	1	/////////////////////////////////////////////////////////////////////////////
	2	// Name: src/common/ustring.cpp
	3	// Purpose: wxUString class
	4	// Author: Robert Roebling
	5	// Created: 2008-07-25
	6	// RCS-ID: $Id:$
	7	// Copyright: (c) 2008 Robert Roebling
	8	// Licence: wxWindows licence
	9	///////////////////////////////////////////////////////////////////////////////
	10
	11	// For compilers that support precompilation, includes "wx.h".
	12	#include "wx/wxprec.h"
	13
	14	#ifdef __BORLANDC__
	15	#pragma hdrstop
	16	#endif
	17
	18	#include "wx/ustring.h"
	19
	20	#ifndef WX_PRECOMP
	21	#include "wx/crt.h"
	22	#include "wx/log.h"
	23	#endif
	24
	25	wxUString &wxUString::assignFromAscii( const char *str )
	26	{
	27	size_type len = wxStrlen( str );
	28
	29	wxU32CharBuffer buffer( len );
	30	wxChar32 *ptr = buffer.data();
	31
	32	size_type i;
	33	for (i = 0; i < len; i++)
	34	{
	35	ptr = str;
	36	ptr++;
	37	str++;
	38	}
	39
	40	return assign( buffer );
	41	}
	42
	43	wxUString &wxUString::assignFromAscii( const char *str, size_type n )
	44	{
	45	size_type len = 0;
	46	const char *s = str;
	47	while (len < n && *s)
	48	{
	49	len++;
	50	s++;
	51	}
	52
	53	wxU32CharBuffer buffer( len );
	54	wxChar32 *ptr = buffer.data();
	55
	56	size_type i;
	57	for (i = 0; i < len; i++)
	58	{
	59	ptr = str;
	60	ptr++;
	61	str++;
	62	}
	63
	64	return *this;
	65	}
	66
	67	// ----------------------------------------------------------------------------
	68	// UTF-8
	69	// ----------------------------------------------------------------------------
	70
	71	static const wxUint32 utf8_max[]=
	72	{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
	73
	74	// this table gives the length of the UTF-8 encoding from its first character:
	75	const unsigned char tableUtf8Lengths[256] = {
	76	// single-byte sequences (ASCII):
	77	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
	78	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
	79	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
	80	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
	81	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
	82	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
	83	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
	84	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
	85
	86	// these are invalid:
	87	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
	88	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
	89	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
	90	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
	91	0, 0, // C0,C1
	92
	93	// two-byte sequences:
	94	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
	95	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
	96
	97	// three-byte sequences:
	98	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
	99
	100	// four-byte sequences:
	101	4, 4, 4, 4, 4, // F0..F4
	102
	103	// these are invalid again (5- or 6-byte
	104	// sequences and sequences for code points
	105	// above U+10FFFF, as restricted by RFC 3629):
	106	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
	107	};
	108
	109	wxUString &wxUString::assignFromUTF8( const char *str )
	110	{
	111	if (!str)
	112	return assign( wxUString() );
	113
	114	size_type ucs4_len = 0;
	115	const char *p = str;
	116	while (*p)
	117	{
	118	unsigned char c = *p;
	119	size_type len = tableUtf8Lengths[c];
	120	if (!len)
	121	return assign( wxUString() ); // don't try to convert invalid UTF-8
	122	ucs4_len++;
	123	p += len;
	124	}
	125
	126	wxU32CharBuffer buffer( ucs4_len );
	127	wxChar32 *out = buffer.data();
	128
	129	p = str;
	130	while (*p)
	131	{
	132	unsigned char c = *p;
	133	if (c < 0x80)
	134	{
	135	*out = c;
	136	p++;
	137	}
	138	else
	139	{
	140	size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
	141
	142	// Char. number range \| UTF-8 octet sequence
	143	// (hexadecimal) \| (binary)
	144	// ----------------------+----------------------------------------
	145	// 0000 0000 - 0000 007F \| 0xxxxxxx
	146	// 0000 0080 - 0000 07FF \| 110xxxxx 10xxxxxx
	147	// 0000 0800 - 0000 FFFF \| 1110xxxx 10xxxxxx 10xxxxxx
	148	// 0001 0000 - 0010 FFFF \| 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	149	//
	150	// Code point value is stored in bits marked with 'x',
	151	// lowest-order bit of the value on the right side in the diagram
	152	// above. (from RFC 3629)
	153
	154	// mask to extract lead byte's value ('x' bits above), by sequence
	155	// length:
	156	static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
	157
	158	// mask and value of lead byte's most significant bits, by length:
	159	static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
	160	static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
	161
	162	len--; // it's more convenient to work with 0-based length here
	163
	164	// extract the lead byte's value bits:
	165	if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
	166	break;
	167
	168	wxChar32 code = c & leadValueMask[len];
	169
	170	// all remaining bytes, if any, are handled in the same way
	171	// regardless of sequence's length:
	172	for ( ; len; --len )
	173	{
	174	c = *++p;
	175	if ( (c & 0xC0) != 0x80 )
	176	return assign( wxUString() ); // don't try to convert invalid UTF-8
	177
	178	code <<= 6;
	179	code \|= c & 0x3F;
	180	}
	181
	182	*out = code;
	183	p++;
	184	}
	185	out++;
	186	}
	187
	188	return assign( buffer.data() );
	189	}
	190
	191	wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
	192	{
	193	if (!str)
	194	return assign( wxUString() );
	195
	196	size_type ucs4_len = 0;
	197	size_type utf8_pos = 0;
	198	const char *p = str;
	199	while (*p)
	200	{
	201	unsigned char c = *p;
	202	size_type len = tableUtf8Lengths[c];
	203	if (!len)
	204	return assign( wxUString() ); // don't try to convert invalid UTF-8
	205	if (utf8_pos + len > n)
	206	break;
	207	utf8_pos += len;
	208	ucs4_len ++;
	209	p += len;
	210	}
	211
	212	wxU32CharBuffer buffer( ucs4_len );
	213	wxChar32 *out = buffer.data();
	214
	215	utf8_pos = 0;
	216	p = str;
	217	while (*p)
	218	{
	219	unsigned char c = *p;
	220	if (c < 0x80)
	221	{
	222	if (utf8_pos + 1 > n)
	223	break;
	224	utf8_pos++;
	225
	226	*out = c;
	227	p++;
	228	}
	229	else
	230	{
	231	size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
	232	if (utf8_pos + len > n)
	233	break;
	234	utf8_pos += len;
	235
	236	// Char. number range \| UTF-8 octet sequence
	237	// (hexadecimal) \| (binary)
	238	// ----------------------+----------------------------------------
	239	// 0000 0000 - 0000 007F \| 0xxxxxxx
	240	// 0000 0080 - 0000 07FF \| 110xxxxx 10xxxxxx
	241	// 0000 0800 - 0000 FFFF \| 1110xxxx 10xxxxxx 10xxxxxx
	242	// 0001 0000 - 0010 FFFF \| 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	243	//
	244	// Code point value is stored in bits marked with 'x',
	245	// lowest-order bit of the value on the right side in the diagram
	246	// above. (from RFC 3629)
	247
	248	// mask to extract lead byte's value ('x' bits above), by sequence
	249	// length:
	250	static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
	251
	252	// mask and value of lead byte's most significant bits, by length:
	253	static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
	254	static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
	255
	256	len--; // it's more convenient to work with 0-based length here
	257
	258	// extract the lead byte's value bits:
	259	if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
	260	break;
	261
	262	wxChar32 code = c & leadValueMask[len];
	263
	264	// all remaining bytes, if any, are handled in the same way
	265	// regardless of sequence's length:
	266	for ( ; len; --len )
	267	{
	268	c = *++p;
	269	if ( (c & 0xC0) != 0x80 )
	270	return assign( wxUString() ); // don't try to convert invalid UTF-8
	271
	272	code <<= 6;
	273	code \|= c & 0x3F;
	274	}
	275
	276	*out = code;
	277	p++;
	278	}
	279	out++;
	280	}
	281
	282	*out = 0;
	283
	284	return assign( buffer.data() );
	285	}
	286
	287	wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
	288	{
	289	if (!str)
	290	return assign( wxUString() );
	291
	292	size_type ucs4_len = 0;
	293	size_type utf16_pos = 0;
	294	const wxChar16 *p = str;
	295	while (*p)
	296	{
	297	size_type len;
	298	if ((p < 0xd800) \|\| (p > 0xdfff))
	299	{
	300	len = 1;
	301	}
	302	else if ((p[1] < 0xdc00) \|\| (p[1] > 0xdfff))
	303	{
	304	return assign( wxUString() ); // don't try to convert invalid UTF-16
	305	}
	306	else
	307	{
	308	len = 2;
	309	}
	310
	311	if (utf16_pos + len > n)
	312	break;
	313
	314	ucs4_len++;
	315	p += len;
	316	utf16_pos += len;
	317	}
	318
	319	wxU32CharBuffer buffer( ucs4_len );
	320	wxChar32 *out = buffer.data();
	321
	322	utf16_pos = 0;
	323
	324	p = str;
	325	while (*p)
	326	{
	327	if ((p < 0xd800) \|\| (p > 0xdfff))
	328	{
	329	if (utf16_pos + 1 > n)
	330	break;
	331
	332	out = p;
	333	p++;
	334	utf16_pos++;
	335	}
	336	else
	337	{
	338	if (utf16_pos + 2 > n)
	339	break;
	340
	341	*out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
	342	p += 2;
	343	utf16_pos += 2;
	344	}
	345	out++;
	346	}
	347
	348	return assign( buffer.data() );
	349	}
	350
	351	wxUString &wxUString::assignFromUTF16( const wxChar16* str )
	352	{
	353	if (!str)
	354	return assign( wxUString() );
	355
	356	size_type ucs4_len = 0;
	357	const wxChar16 *p = str;
	358	while (*p)
	359	{
	360	size_type len;
	361	if ((p < 0xd800) \|\| (p > 0xdfff))
	362	{
	363	len = 1;
	364	}
	365	else if ((p[1] < 0xdc00) \|\| (p[1] > 0xdfff))
	366	{
	367	return assign( wxUString() ); // don't try to convert invalid UTF-16
	368	}
	369	else
	370	{
	371	len = 2;
	372	}
	373
	374	ucs4_len++;
	375	p += len;
	376	}
	377
	378	wxU32CharBuffer buffer( ucs4_len );
	379	wxChar32 *out = buffer.data();
	380
	381	p = str;
	382	while (*p)
	383	{
	384	if ((p < 0xd800) \|\| (p > 0xdfff))
	385	{
	386	out = p;
	387	p++;
	388	}
	389	else
	390	{
	391	*out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
	392	p += 2;
	393	}
	394	out++;
	395	}
	396
	397	return assign( buffer.data() );
	398	}
	399
	400	wxUString &wxUString::assignFromCString( const char* str )
	401	{
	402	if (!str)
	403	return assign( wxUString() );
	404
	405	wxWCharBuffer buffer = wxConvLibc.cMB2WC( str );
	406
	407	return assign( buffer );
	408	}
	409
	410	wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
	411	{
	412	if (!str)
	413	return assign( wxUString() );
	414
	415	wxWCharBuffer buffer = conv.cMB2WC( str );
	416
	417	return assign( buffer );
	418	}
	419
	420	wxCharBuffer wxUString::utf8_str() const
	421	{
	422	size_type utf8_length = 0;
	423	const wxChar32 *ptr = data();
	424
	425	while (*ptr)
	426	{
	427	wxChar32 code = *ptr;
	428	ptr++;
	429
	430	if ( code <= 0x7F )
	431	{
	432	utf8_length++;
	433	}
	434	else if ( code <= 0x07FF )
	435	{
	436	utf8_length += 2;
	437	}
	438	else if ( code < 0xFFFF )
	439	{
	440	utf8_length += 3;
	441	}
	442	else if ( code <= 0x10FFFF )
	443	{
	444	utf8_length += 4;
	445	}
	446	else
	447	{
	448	// invalid range, skip
	449	}
	450	}
	451
	452	wxCharBuffer result( utf8_length );
	453
	454	char *out = result.data();
	455
	456	ptr = data();
	457	while (*ptr)
	458	{
	459	wxChar32 code = *ptr;
	460	ptr++;
	461
	462	if ( code <= 0x7F )
	463	{
	464	out[0] = (char)code;
	465	out++;
	466	}
	467	else if ( code <= 0x07FF )
	468	{
	469	out[1] = 0x80 \| (code & 0x3F); code >>= 6;
	470	out[0] = 0xC0 \| code;
	471	out += 2;
	472	}
	473	else if ( code < 0xFFFF )
	474	{
	475	out[2] = 0x80 \| (code & 0x3F); code >>= 6;
	476	out[1] = 0x80 \| (code & 0x3F); code >>= 6;
	477	out[0] = 0xE0 \| code;
	478	out += 3;
	479	}
	480	else if ( code <= 0x10FFFF )
	481	{
	482	out[3] = 0x80 \| (code & 0x3F); code >>= 6;
	483	out[2] = 0x80 \| (code & 0x3F); code >>= 6;
	484	out[1] = 0x80 \| (code & 0x3F); code >>= 6;
	485	out[0] = 0xF0 \| code;
	486	out += 4;
	487	}
	488	else
	489	{
	490	// invalid range, skip
	491	}
	492	}
	493
	494	wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
	495	wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
	496
	497	return result;
	498	}
	499
	500	wxU16CharBuffer wxUString::utf16_str() const
	501	{
	502	size_type utf16_length = 0;
	503	const wxChar32 *ptr = data();
	504
	505	while (*ptr)
	506	{
	507	wxChar32 code = *ptr;
	508	ptr++;
	509
	510	// TODO: error range checks
	511
	512	if (code < 0x10000)
	513	utf16_length++;
	514	else
	515	utf16_length += 2;
	516	}
	517
	518	wxU16CharBuffer result( utf16_length );
	519	wxChar16 *out = result.data();
	520
	521	ptr = data();
	522
	523	while (*ptr)
	524	{
	525	wxChar32 code = *ptr;
	526	ptr++;
	527
	528	// TODO: error range checks
	529
	530	if (code < 0x10000)
	531	{
	532	out[0] = code;
	533	out++;
	534	}
	535	else
	536	{
	537	out[0] = (code - 0x10000) / 0x400 + 0xd800;
	538	out[1] = (code - 0x10000) % 0x400 + 0xdc00;
	539	out += 2;
	540	}
	541	}
	542
	543	return result;
	544	}
	545
	546
	547	#if SIZEOF_WCHAR_T != 2
	548	template<>
	549	wxCharTypeBuffer<wxChar16>::Data
	550	wxCharTypeBuffer<wxChar16>::NullData(NULL);
	551	#endif
	552
	553	#if SIZEOF_WCHAR_T != 4
	554	template<>
	555	wxCharTypeBuffer<wxChar32>::Data
	556	wxCharTypeBuffer<wxChar32>::NullData(NULL);
	557	#endif