[wxWidgets.git] / src / common / ustring.cpp

/////////////////////////////////////////////////////////////////////////////
// Name:        src/common/ustring.cpp
// Purpose:     wxUString class
// Author:      Robert Roebling
// Created:     2008-07-25
// RCS-ID:      $Id:$
// Copyright:   (c) 2008 Robert Roebling
// Licence:     wxWindows licence
///////////////////////////////////////////////////////////////////////////////

// For compilers that support precompilation, includes "wx.h".
#include "wx/wxprec.h"

#ifdef __BORLANDC__
    #pragma hdrstop
#endif

#include "wx/ustring.h"

#ifndef WX_PRECOMP
    #include "wx/crt.h"
    #include "wx/log.h"
#endif

// Replaces the contents of this string with the characters of the
// NUL-terminated string str, one input byte per UCS-4 character.
//
// A NULL str yields an empty string, like the other assignFromXXX()
// functions in this file do.
//
// Bytes are widened through unsigned char: input is expected to be 7-bit
// ASCII, but if a byte >= 0x80 does slip in, it becomes U+0080..U+00FF
// rather than a sign-extended value on platforms where plain char is signed.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromAscii( const char *str )
{
   if (!str)
       return assign( wxUString() );

   const size_type len = wxStrlen( str );

   wxU32CharBuffer buffer( len );
   wxChar32 *ptr = buffer.data();

   for (size_type i = 0; i < len; i++)
       ptr[i] = (unsigned char)str[i];

   return assign( buffer );
}

// Replaces the contents of this string with at most n characters of str,
// one input byte per UCS-4 character. Conversion stops early if a NUL byte
// is found before n bytes have been consumed.
//
// A NULL str yields an empty string, like the other assignFromXXX()
// functions in this file do.
//
// Bytes are widened through unsigned char: input is expected to be 7-bit
// ASCII, but if a byte >= 0x80 does slip in, it becomes U+0080..U+00FF
// rather than a sign-extended value on platforms where plain char is signed.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromAscii( const char *str, size_type n )
{
   if (!str)
       return assign( wxUString() );

   // number of characters to convert: up to n, but not past the NUL
   size_type len = 0;
   while (len < n && str[len])
       len++;

   wxU32CharBuffer buffer( len );
   wxChar32 *ptr = buffer.data();

   for (size_type i = 0; i < len; i++)
       ptr[i] = (unsigned char)str[i];

   // actually store the converted characters (the buffer used to be built
   // and then thrown away, leaving the string unchanged)
   return assign( buffer );
}

// ----------------------------------------------------------------------------
// UTF-8
// ----------------------------------------------------------------------------

// Lookup table indexed by the first byte of a UTF-8 sequence. Each entry is
// the total number of bytes in the sequence started by that byte, or 0 if
// the byte is not a valid first byte. One row per high nibble:
const unsigned char tableUtf8Lengths[256] = {
    // 0x00..0x7F: ASCII, always a single byte
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,    // 0x
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,    // 1x
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,    // 2x
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,    // 3x
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,    // 4x
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,    // 5x
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,    // 6x
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,    // 7x

    // 0x80..0xBF: not valid as the first byte of a sequence
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,    // 8x
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,    // 9x
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,    // Ax
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,    // Bx

    // 0xC0 and 0xC1 are invalid too; 0xC2..0xDF start two-byte sequences
    0,0,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,    // Cx
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,    // Dx

    // 0xE0..0xEF start three-byte sequences
    3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,    // Ex

    // 0xF0..0xF4 start four-byte sequences; 0xF5..0xFF are invalid (5- or
    // 6-byte sequences and code points above U+10FFFF, as restricted by
    // RFC 3629)
    4,4,4,4, 4,0,0,0, 0,0,0,0, 0,0,0,0     // Fx
};

// Replaces the contents of this string with the NUL-terminated UTF-8
// string str, decoded to UCS-4.
//
// A NULL str yields an empty string. If the input is not well-formed (a
// byte that cannot start a sequence, or a sequence whose continuation bytes
// are missing or malformed), no conversion is attempted and the string
// becomes empty.
//
// NOTE: overlong encodings and encoded surrogates (e.g. E0 80 80,
// ED A0 80) are not rejected here; only the structure of each sequence is
// checked.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromUTF8( const char *str )
{
    if (!str)
        return assign( wxUString() );

    // First pass: validate the input and count the resulting characters.
    //
    // The continuation bytes must be checked here, before skipping over
    // them: a sequence cut short by the end of the string would otherwise
    // make us step past the terminating NUL and read beyond the input.
    size_type ucs4_len = 0;
    const char *p = str;
    while (*p)
    {
        const unsigned char lead = *p;
        const size_type len = tableUtf8Lengths[lead];
        if (!len)
            return assign( wxUString() );  // don't try to convert invalid UTF-8

        // bytes are examined in order, so a NUL (which is not of the form
        // 10xxxxxx) stops the check before anything after it is read
        for (size_type i = 1; i < len; i++)
        {
            const unsigned char trail = p[i];
            if ( (trail & 0xC0) != 0x80 )
                return assign( wxUString() );  // don't try to convert invalid UTF-8
        }

        ucs4_len++;
        p += len;
    }

    wxU32CharBuffer buffer( ucs4_len );
    wxChar32 *out = buffer.data();

    //   Char. number range   |        UTF-8 octet sequence
    //      (hexadecimal)     |              (binary)
    //  ----------------------+----------------------------------------
    //  0000 0000 - 0000 007F | 0xxxxxxx
    //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
    //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
    //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    //
    //  Code point value is stored in bits marked with 'x',
    //  lowest-order bit of the value on the right side in the diagram
    //  above.                                         (from RFC 3629)

    // mask to extract lead byte's value ('x' bits above), indexed by
    // sequence length minus one:
    static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

    // Second pass: decode. Everything was validated above, and the table
    // only yields a non-zero length for bytes with the matching lead
    // pattern, so no further checks are needed here.
    p = str;
    while (*p)
    {
        unsigned char c = *p;

        // 0-based length: number of continuation bytes that follow
        size_type len = tableUtf8Lengths[c] - 1;

        wxChar32 code = c & leadValueMask[len];

        // all remaining bytes, if any, are handled in the same way
        // regardless of sequence's length:
        for ( ; len; --len )
        {
            c = *++p;
            code <<= 6;
            code |= c & 0x3F;
        }

        *out++ = code;
        p++;
    }

    // terminate explicitly; this is the slot after the ucs4_len characters
    // (assumes the buffer holds ucs4_len + 1 elements - TODO confirm
    // against wxCharTypeBuffer)
    *out = 0;

    return assign( buffer.data() );
}

// Replaces the contents of this string with at most n bytes of the UTF-8
// string str, decoded to UCS-4.
//
// Conversion stops at the first NUL byte, after n bytes, or before a
// multi-byte sequence that would extend past the n-byte limit, whichever
// comes first. No byte at or beyond str[n] is read, so str does not need to
// be NUL-terminated when it holds at least n bytes.
//
// A NULL str yields an empty string. If the part of the input that lies
// within the limit is not well-formed (a byte that cannot start a sequence,
// or a sequence whose continuation bytes are missing or malformed), no
// conversion is attempted and the string becomes empty.
//
// NOTE: overlong encodings and encoded surrogates (e.g. E0 80 80,
// ED A0 80) are not rejected here; only the structure of each sequence is
// checked.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
{
    if (!str)
        return assign( wxUString() );

    // First pass: validate the input and count the resulting characters.
    // The limit is tested before the byte is read so that we never look
    // at str[n].
    size_type ucs4_len = 0;
    size_type utf8_pos = 0;
    while (utf8_pos < n && str[utf8_pos])
    {
        const unsigned char lead = str[utf8_pos];
        const size_type len = tableUtf8Lengths[lead];
        if (!len)
            return assign( wxUString() );  // don't try to convert invalid UTF-8

        // stop in front of a sequence which doesn't fit entirely
        if (len > n - utf8_pos)
            break;

        // The continuation bytes must be checked before skipping over
        // them, otherwise a sequence cut short by the end of the string
        // would make us step past the terminating NUL.
        for (size_type i = 1; i < len; i++)
        {
            const unsigned char trail = str[utf8_pos + i];
            if ( (trail & 0xC0) != 0x80 )
                return assign( wxUString() );  // don't try to convert invalid UTF-8
        }

        utf8_pos += len;
        ucs4_len++;
    }

    wxU32CharBuffer buffer( ucs4_len );
    wxChar32 *out = buffer.data();

    //   Char. number range   |        UTF-8 octet sequence
    //      (hexadecimal)     |              (binary)
    //  ----------------------+----------------------------------------
    //  0000 0000 - 0000 007F | 0xxxxxxx
    //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
    //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
    //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    //
    //  Code point value is stored in bits marked with 'x',
    //  lowest-order bit of the value on the right side in the diagram
    //  above.                                         (from RFC 3629)

    // mask to extract lead byte's value ('x' bits above), indexed by
    // sequence length minus one:
    static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

    // Second pass: decode exactly the utf8_pos bytes accepted above. They
    // were validated already, and the table only yields a non-zero length
    // for bytes with the matching lead pattern, so no further checks are
    // needed here.
    const char *p = str;
    const char * const end = str + utf8_pos;
    while (p != end)
    {
        unsigned char c = *p;

        // 0-based length: number of continuation bytes that follow
        size_type len = tableUtf8Lengths[c] - 1;

        wxChar32 code = c & leadValueMask[len];

        // all remaining bytes, if any, are handled in the same way
        // regardless of sequence's length:
        for ( ; len; --len )
        {
            c = *++p;
            code <<= 6;
            code |= c & 0x3F;
        }

        *out++ = code;
        p++;
    }

    *out = 0;

    return assign( buffer.data() );
}

// Replaces the contents of this string with at most n 16-bit units of the
// UTF-16 string str, decoded to UCS-4.
//
// Conversion stops at the first NUL unit, after n units, or before a
// surrogate pair that would extend past the n-unit limit, whichever comes
// first. No unit at or beyond str[n] is read, so str does not need to be
// NUL-terminated when it holds at least n units.
//
// A NULL str yields an empty string. If the part of the input that lies
// within the limit contains an unpaired surrogate (a low surrogate without
// a preceding high one, or a high surrogate not followed by a low one), no
// conversion is attempted and the string becomes empty.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
{
    if (!str)
        return assign( wxUString() );

    // First pass: validate the input and count the resulting characters.
    // The limit is tested before the unit is read so that we never look
    // at str[n].
    size_type ucs4_len = 0;
    size_type utf16_pos = 0;
    while (utf16_pos < n && str[utf16_pos])
    {
        const wxChar16 unit = str[utf16_pos];

        size_type len;
        if ((unit < 0xd800) || (unit > 0xdfff))
        {
            // ordinary BMP character
            len = 1;
        }
        else if (unit > 0xdbff)
        {
            // low surrogate (DC00..DFFF) which doesn't follow a high one
            return assign( wxUString() );  // don't try to convert invalid UTF-16
        }
        else if (n - utf16_pos < 2)
        {
            // high surrogate in the last unit allowed: its other half lies
            // beyond the limit, so stop here without reading it
            break;
        }
        else
        {
            const wxChar16 low = str[utf16_pos + 1];
            if ((low < 0xdc00) || (low > 0xdfff))
                return assign( wxUString() );  // don't try to convert invalid UTF-16

            len = 2;
        }

        ucs4_len++;
        utf16_pos += len;
    }

    wxU32CharBuffer buffer( ucs4_len );
    wxChar32 *out = buffer.data();

    // Second pass: decode exactly the utf16_pos units accepted above.
    const wxChar16 *p = str;
    const wxChar16 * const end = str + utf16_pos;
    while (p != end)
    {
        if ((*p < 0xd800) || (*p > 0xdfff))
        {
            *out = *p;
            p++;
        }
        else
        {
            // 0xd7c0 is 0xd800 - (0x10000 >> 10): subtracting it removes
            // the high surrogate base and adds the 0x10000 offset in one go
            *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
            p += 2;
        }
        out++;
    }

    // terminate explicitly; this is the slot after the ucs4_len characters
    // (assumes the buffer holds ucs4_len + 1 elements - TODO confirm
    // against wxCharTypeBuffer)
    *out = 0;

    return assign( buffer.data() );
}

// Replaces the contents of this string with the NUL-terminated UTF-16
// string str, decoded to UCS-4.
//
// A NULL str yields an empty string. If the input contains an unpaired
// surrogate (a low surrogate without a preceding high one, or a high
// surrogate not followed by a low one), no conversion is attempted and the
// string becomes empty.
//
// Returns a reference to this string.
wxUString &wxUString::assignFromUTF16( const wxChar16* str )
{
    if (!str)
        return assign( wxUString() );

    // First pass: validate the input and count the resulting characters.
    size_type ucs4_len = 0;
    const wxChar16 *p = str;
    while (*p)
    {
        size_type len;
        if ((*p < 0xd800) || (*p > 0xdfff))
        {
            // ordinary BMP character
            len = 1;
        }
        else if (*p > 0xdbff)
        {
            // low surrogate (DC00..DFFF) which doesn't follow a high one
            return assign( wxUString() );  // don't try to convert invalid UTF-16
        }
        else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
        {
            // high surrogate not followed by a low one; this also catches
            // a high surrogate at the very end, as p[1] is then the NUL
            return assign( wxUString() );  // don't try to convert invalid UTF-16
        }
        else
        {
            len = 2;
        }

        ucs4_len++;
        p += len;
    }

    wxU32CharBuffer buffer( ucs4_len );
    wxChar32 *out = buffer.data();

    // Second pass: decode; the input is known to be valid at this point.
    p = str;
    while (*p)
    {
        if ((*p < 0xd800) || (*p > 0xdfff))
        {
            *out = *p;
            p++;
        }
        else
        {
            // 0xd7c0 is 0xd800 - (0x10000 >> 10): subtracting it removes
            // the high surrogate base and adds the 0x10000 offset in one go
            *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
            p += 2;
        }
        out++;
    }

    // terminate explicitly; this is the slot after the ucs4_len characters
    // (assumes the buffer holds ucs4_len + 1 elements - TODO confirm
    // against wxCharTypeBuffer)
    *out = 0;

    return assign( buffer.data() );
}

// Replaces the contents of this string with the multibyte string str,
// converted to wide characters with wxConvLibc. A NULL str yields an empty
// string. Returns a reference to this string.
wxUString &wxUString::assignFromCString( const char* str )
{
    if (str)
    {
        wxWCharBuffer wide( wxConvLibc.cMB2WC( str ) );
        return assign( wide );
    }

    // nothing to convert
    return assign( wxUString() );
}

// Replaces the contents of this string with the multibyte string str,
// converted to wide characters with the given conversion object. A NULL
// str yields an empty string. Returns a reference to this string.
wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
{
    if (str)
    {
        wxWCharBuffer wide( conv.cMB2WC( str ) );
        return assign( wide );
    }

    // nothing to convert
    return assign( wxUString() );
}

// Returns the contents of this string encoded as UTF-8.
//
// The string is walked up to its first NUL character. Values above
// U+10FFFF cannot be represented in UTF-8 and are silently skipped.
//
// NOTE: surrogate code points (U+D800..U+DFFF) are not filtered out and
// are encoded as three-byte sequences.
wxCharBuffer wxUString::utf8_str() const
{
    // First pass: compute the length of the result in bytes.
    size_type utf8_length = 0;
    const wxChar32 *ptr;

    for ( ptr = data(); *ptr; ptr++ )
    {
        const wxChar32 code = *ptr;

        if ( code <= 0x7F )
            utf8_length++;
        else if ( code <= 0x07FF )
            utf8_length += 2;
        else if ( code <= 0xFFFF )      // U+FFFF itself still fits in 3 bytes
            utf8_length += 3;
        else if ( code <= 0x10FFFF )
            utf8_length += 4;
        // else: invalid range, skip
    }

    wxCharBuffer result( utf8_length );

    char *out = result.data();

    // Second pass: encode. The branches must mirror the ones above exactly,
    // as the buffer was sized according to them. Multi-byte sequences are
    // filled in from the last byte to the first, shifting the code point
    // right by 6 bits each time.
    for ( ptr = data(); *ptr; ptr++ )
    {
        wxChar32 code = *ptr;

        if ( code <= 0x7F )
        {
            out[0] = (char)code;
            out++;
        }
        else if ( code <= 0x07FF )
        {
            out[1] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[0] = (char)(0xC0 | code);
            out += 2;
        }
        else if ( code <= 0xFFFF )
        {
            out[2] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[1] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[0] = (char)(0xE0 | code);
            out += 3;
        }
        else if ( code <= 0x10FFFF )
        {
            out[3] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[2] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[1] = (char)(0x80 | (code & 0x3F));  code >>= 6;
            out[0] = (char)(0xF0 | code);
            out += 4;
        }
        // else: invalid range, skip
    }

    // terminate explicitly; this is the slot after the utf8_length bytes
    // (assumes the buffer holds utf8_length + 1 elements - TODO confirm
    // against wxCharTypeBuffer)
    *out = 0;

    return result;
}

// Returns the contents of this string encoded as UTF-16.
//
// The string is walked up to its first NUL character. Characters below
// U+10000 are stored as a single 16-bit unit, all others as a surrogate
// pair. Values above U+10FFFF cannot be represented in UTF-16 and are
// silently skipped, as utf8_str() does.
//
// NOTE: surrogate code points (U+D800..U+DFFF) are not filtered out and
// are copied through unchanged.
wxU16CharBuffer wxUString::utf16_str() const
{
    // First pass: compute the length of the result in 16-bit units.
    size_type utf16_length = 0;
    const wxChar32 *ptr;

    for ( ptr = data(); *ptr; ptr++ )
    {
        const wxChar32 code = *ptr;

        if (code < 0x10000)
           utf16_length++;
        else if (code <= 0x10FFFF)
           utf16_length += 2;
        // else: invalid range, skip
    }

    wxU16CharBuffer result( utf16_length );
    wxChar16 *out = result.data();

    // Second pass: encode. The branches must mirror the ones above exactly,
    // as the buffer was sized according to them.
    for ( ptr = data(); *ptr; ptr++ )
    {
        const wxChar32 code = *ptr;

        if (code < 0x10000)
        {
           out[0] = (wxChar16)code;
           out++;
        }
        else if (code <= 0x10FFFF)
        {
           // split the 20 bits left after removing the 0x10000 offset into
           // the high (upper 10 bits) and low (lower 10 bits) surrogates
           out[0] = (wxChar16)((code - 0x10000) / 0x400 + 0xd800);
           out[1] = (wxChar16)((code - 0x10000) % 0x400 + 0xdc00);
           out += 2;
        }
        // else: invalid range, skip
    }

    // terminate explicitly; this is the slot after the utf16_length units
    // (assumes the buffer holds utf16_length + 1 elements - TODO confirm
    // against wxCharTypeBuffer)
    *out = 0;

    return result;
}
Commit	Line	Data
9a6d1438 RR	1	/////////////////////////////////////////////////////////////////////////////
	2	// Name: src/common/ustring.cpp
	3	// Purpose: wxUString class
	4	// Author: Robert Roebling
	5	// Created: 2008-07-25
	6	// RCS-ID: $Id:$
	7	// Copyright: (c) 2008 Robert Roebling
	8	// Licence: wxWindows licence
	9	///////////////////////////////////////////////////////////////////////////////
	10
	11	// For compilers that support precompilation, includes "wx.h".
	12	#include "wx/wxprec.h"
	13
	14	#ifdef __BORLANDC__
	15	#pragma hdrstop
	16	#endif
	17
a99bcb5e PC	18	#include "wx/ustring.h"
a99bcb5e PC	19
9a6d1438	20	#ifndef WX_PRECOMP
a99bcb5e	21	#include "wx/crt.h"
9a6d1438 RR	22	#include "wx/log.h"
	23	#endif
	24
9a6d1438 RR	25	wxUString &wxUString::assignFromAscii( const char *str )
	26	{
	27	size_type len = wxStrlen( str );
5c69ef61	28
9a6d1438 RR	29	wxU32CharBuffer buffer( len );
9a6d1438 RR	30	wxChar32 *ptr = buffer.data();
5c69ef61	31
9a6d1438 RR	32	size_type i;
	33	for (i = 0; i < len; i++)
	34	{
	35	ptr = str;
	36	ptr++;
	37	str++;
	38	}
5c69ef61	39
9a6d1438 RR	40	return assign( buffer );
	41	}
	42
	43	wxUString &wxUString::assignFromAscii( const char *str, size_type n )
	44	{
	45	size_type len = 0;
	46	const char *s = str;
	47	while (len < n && *s)
	48	{
	49	len++;
	50	s++;
	51	}
5c69ef61	52
9a6d1438 RR	53	wxU32CharBuffer buffer( len );
9a6d1438 RR	54	wxChar32 *ptr = buffer.data();
5c69ef61	55
9a6d1438 RR	56	size_type i;
	57	for (i = 0; i < len; i++)
	58	{
	59	ptr = str;
	60	ptr++;
	61	str++;
	62	}
5c69ef61	63
9a6d1438 RR	64	return *this;
	65	}
	66
	67	// ----------------------------------------------------------------------------
	68	// UTF-8
	69	// ----------------------------------------------------------------------------
	70
9a6d1438 RR	71	// this table gives the length of the UTF-8 encoding from its first character:
	72	const unsigned char tableUtf8Lengths[256] = {
	73	// single-byte sequences (ASCII):
	74	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
	75	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
	76	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
	77	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
	78	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
	79	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
	80	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
	81	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
	82
	83	// these are invalid:
	84	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
	85	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
	86	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
	87	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
	88	0, 0, // C0,C1
	89
	90	// two-byte sequences:
	91	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
	92	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
	93
	94	// three-byte sequences:
	95	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
	96
	97	// four-byte sequences:
	98	4, 4, 4, 4, 4, // F0..F4
	99
	100	// these are invalid again (5- or 6-byte
	101	// sequences and sequences for code points
	102	// above U+10FFFF, as restricted by RFC 3629):
	103	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
	104	};
	105
	106	wxUString &wxUString::assignFromUTF8( const char *str )
	107	{
	108	if (!str)
	109	return assign( wxUString() );
5c69ef61	110
9a6d1438 RR	111	size_type ucs4_len = 0;
	112	const char *p = str;
	113	while (*p)
	114	{
	115	unsigned char c = *p;
	116	size_type len = tableUtf8Lengths[c];
	117	if (!len)
	118	return assign( wxUString() ); // don't try to convert invalid UTF-8
	119	ucs4_len++;
	120	p += len;
	121	}
	122
	123	wxU32CharBuffer buffer( ucs4_len );
	124	wxChar32 *out = buffer.data();
5c69ef61	125
9a6d1438 RR	126	p = str;
	127	while (*p)
	128	{
	129	unsigned char c = *p;
	130	if (c < 0x80)
	131	{
	132	*out = c;
	133	p++;
	134	}
	135	else
	136	{
	137	size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
	138
	139	// Char. number range \| UTF-8 octet sequence
	140	// (hexadecimal) \| (binary)
	141	// ----------------------+----------------------------------------
	142	// 0000 0000 - 0000 007F \| 0xxxxxxx
	143	// 0000 0080 - 0000 07FF \| 110xxxxx 10xxxxxx
	144	// 0000 0800 - 0000 FFFF \| 1110xxxx 10xxxxxx 10xxxxxx
	145	// 0001 0000 - 0010 FFFF \| 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	146	//
	147	// Code point value is stored in bits marked with 'x',
	148	// lowest-order bit of the value on the right side in the diagram
	149	// above. (from RFC 3629)
	150
	151	// mask to extract lead byte's value ('x' bits above), by sequence
	152	// length:
	153	static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
	154
	155	// mask and value of lead byte's most significant bits, by length:
	156	static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
	157	static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
	158
	159	len--; // it's more convenient to work with 0-based length here
	160
	161	// extract the lead byte's value bits:
	162	if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
	163	break;
	164
	165	wxChar32 code = c & leadValueMask[len];
	166
	167	// all remaining bytes, if any, are handled in the same way
	168	// regardless of sequence's length:
	169	for ( ; len; --len )
	170	{
	171	c = *++p;
	172	if ( (c & 0xC0) != 0x80 )
	173	return assign( wxUString() ); // don't try to convert invalid UTF-8
	174
	175	code <<= 6;
	176	code \|= c & 0x3F;
	177	}
5c69ef61	178
9a6d1438 RR	179	*out = code;
	180	p++;
	181	}
	182	out++;
	183	}
	184
	185	return assign( buffer.data() );
	186	}
	187
	188	wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
	189	{
	190	if (!str)
	191	return assign( wxUString() );
5c69ef61	192
9a6d1438 RR	193	size_type ucs4_len = 0;
	194	size_type utf8_pos = 0;
	195	const char *p = str;
	196	while (*p)
	197	{
	198	unsigned char c = *p;
	199	size_type len = tableUtf8Lengths[c];
	200	if (!len)
	201	return assign( wxUString() ); // don't try to convert invalid UTF-8
	202	if (utf8_pos + len > n)
	203	break;
	204	utf8_pos += len;
	205	ucs4_len ++;
	206	p += len;
	207	}
5c69ef61	208
9a6d1438 RR	209	wxU32CharBuffer buffer( ucs4_len );
9a6d1438 RR	210	wxChar32 *out = buffer.data();
5c69ef61	211
9a6d1438 RR	212	utf8_pos = 0;
	213	p = str;
	214	while (*p)
	215	{
	216	unsigned char c = *p;
	217	if (c < 0x80)
	218	{
	219	if (utf8_pos + 1 > n)
	220	break;
	221	utf8_pos++;
5c69ef61	222
9a6d1438 RR	223	*out = c;
	224	p++;
	225	}
	226	else
	227	{
	228	size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
	229	if (utf8_pos + len > n)
	230	break;
	231	utf8_pos += len;
	232
	233	// Char. number range \| UTF-8 octet sequence
	234	// (hexadecimal) \| (binary)
	235	// ----------------------+----------------------------------------
	236	// 0000 0000 - 0000 007F \| 0xxxxxxx
	237	// 0000 0080 - 0000 07FF \| 110xxxxx 10xxxxxx
	238	// 0000 0800 - 0000 FFFF \| 1110xxxx 10xxxxxx 10xxxxxx
	239	// 0001 0000 - 0010 FFFF \| 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	240	//
	241	// Code point value is stored in bits marked with 'x',
	242	// lowest-order bit of the value on the right side in the diagram
	243	// above. (from RFC 3629)
	244
	245	// mask to extract lead byte's value ('x' bits above), by sequence
	246	// length:
	247	static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
	248
	249	// mask and value of lead byte's most significant bits, by length:
	250	static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
	251	static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
	252
	253	len--; // it's more convenient to work with 0-based length here
	254
	255	// extract the lead byte's value bits:
	256	if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
	257	break;
	258
	259	wxChar32 code = c & leadValueMask[len];
	260
	261	// all remaining bytes, if any, are handled in the same way
	262	// regardless of sequence's length:
	263	for ( ; len; --len )
	264	{
	265	c = *++p;
	266	if ( (c & 0xC0) != 0x80 )
	267	return assign( wxUString() ); // don't try to convert invalid UTF-8
	268
	269	code <<= 6;
	270	code \|= c & 0x3F;
	271	}
5c69ef61	272
9a6d1438 RR	273	*out = code;
	274	p++;
	275	}
	276	out++;
	277	}
5c69ef61	278
9a6d1438 RR	279	*out = 0;
	280
	281	return assign( buffer.data() );
	282	}
	283
	284	wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
	285	{
	286	if (!str)
	287	return assign( wxUString() );
5c69ef61	288
9a6d1438 RR	289	size_type ucs4_len = 0;
	290	size_type utf16_pos = 0;
	291	const wxChar16 *p = str;
	292	while (*p)
	293	{
	294	size_type len;
	295	if ((p < 0xd800) \|\| (p > 0xdfff))
	296	{
	297	len = 1;
	298	}
	299	else if ((p[1] < 0xdc00) \|\| (p[1] > 0xdfff))
	300	{
	301	return assign( wxUString() ); // don't try to convert invalid UTF-16
	302	}
	303	else
	304	{
	305	len = 2;
	306	}
5c69ef61	307
9a6d1438 RR	308	if (utf16_pos + len > n)
9a6d1438 RR	309	break;
5c69ef61	310
9a6d1438 RR	311	ucs4_len++;
	312	p += len;
	313	utf16_pos += len;
	314	}
	315
	316	wxU32CharBuffer buffer( ucs4_len );
	317	wxChar32 *out = buffer.data();
	318
	319	utf16_pos = 0;
5c69ef61	320
9a6d1438 RR	321	p = str;
	322	while (*p)
	323	{
	324	if ((p < 0xd800) \|\| (p > 0xdfff))
	325	{
	326	if (utf16_pos + 1 > n)
	327	break;
5c69ef61	328
9a6d1438 RR	329	out = p;
	330	p++;
	331	utf16_pos++;
	332	}
	333	else
	334	{
	335	if (utf16_pos + 2 > n)
	336	break;
5c69ef61	337
9a6d1438 RR	338	*out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
	339	p += 2;
	340	utf16_pos += 2;
	341	}
	342	out++;
	343	}
5c69ef61	344
9a6d1438 RR	345	return assign( buffer.data() );
	346	}
	347
	348	wxUString &wxUString::assignFromUTF16( const wxChar16* str )
	349	{
	350	if (!str)
	351	return assign( wxUString() );
5c69ef61	352
9a6d1438 RR	353	size_type ucs4_len = 0;
	354	const wxChar16 *p = str;
	355	while (*p)
	356	{
	357	size_type len;
	358	if ((p < 0xd800) \|\| (p > 0xdfff))
	359	{
	360	len = 1;
	361	}
	362	else if ((p[1] < 0xdc00) \|\| (p[1] > 0xdfff))
	363	{
	364	return assign( wxUString() ); // don't try to convert invalid UTF-16
	365	}
	366	else
	367	{
	368	len = 2;
	369	}
5c69ef61	370
9a6d1438 RR	371	ucs4_len++;
	372	p += len;
	373	}
	374
	375	wxU32CharBuffer buffer( ucs4_len );
	376	wxChar32 *out = buffer.data();
5c69ef61	377
9a6d1438 RR	378	p = str;
	379	while (*p)
	380	{
	381	if ((p < 0xd800) \|\| (p > 0xdfff))
	382	{
	383	out = p;
	384	p++;
	385	}
	386	else
	387	{
	388	*out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
	389	p += 2;
	390	}
	391	out++;
	392	}
5c69ef61	393
9a6d1438 RR	394	return assign( buffer.data() );
	395	}
	396
	397	wxUString &wxUString::assignFromCString( const char* str )
	398	{
	399	if (!str)
	400	return assign( wxUString() );
5c69ef61	401
9a6d1438	402	wxWCharBuffer buffer = wxConvLibc.cMB2WC( str );
5c69ef61	403
9a6d1438 RR	404	return assign( buffer );
	405	}
	406
	407	wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
	408	{
	409	if (!str)
	410	return assign( wxUString() );
5c69ef61	411
9a6d1438	412	wxWCharBuffer buffer = conv.cMB2WC( str );
5c69ef61	413
9a6d1438 RR	414	return assign( buffer );
	415	}
	416
	417	wxCharBuffer wxUString::utf8_str() const
	418	{
	419	size_type utf8_length = 0;
	420	const wxChar32 *ptr = data();
5c69ef61	421
9a6d1438 RR	422	while (*ptr)
	423	{
	424	wxChar32 code = *ptr;
	425	ptr++;
5c69ef61	426
9a6d1438 RR	427	if ( code <= 0x7F )
	428	{
	429	utf8_length++;
	430	}
	431	else if ( code <= 0x07FF )
	432	{
	433	utf8_length += 2;
	434	}
	435	else if ( code < 0xFFFF )
	436	{
	437	utf8_length += 3;
	438	}
	439	else if ( code <= 0x10FFFF )
	440	{
	441	utf8_length += 4;
	442	}
	443	else
	444	{
	445	// invalid range, skip
	446	}
	447	}
5c69ef61	448
9a6d1438	449	wxCharBuffer result( utf8_length );
5c69ef61	450
9a6d1438	451	char *out = result.data();
5c69ef61	452
9a6d1438 RR	453	ptr = data();
	454	while (*ptr)
	455	{
	456	wxChar32 code = *ptr;
	457	ptr++;
5c69ef61	458
9a6d1438 RR	459	if ( code <= 0x7F )
	460	{
	461	out[0] = (char)code;
	462	out++;
	463	}
	464	else if ( code <= 0x07FF )
	465	{
	466	out[1] = 0x80 \| (code & 0x3F); code >>= 6;
	467	out[0] = 0xC0 \| code;
	468	out += 2;
	469	}
	470	else if ( code < 0xFFFF )
	471	{
	472	out[2] = 0x80 \| (code & 0x3F); code >>= 6;
	473	out[1] = 0x80 \| (code & 0x3F); code >>= 6;
	474	out[0] = 0xE0 \| code;
	475	out += 3;
	476	}
	477	else if ( code <= 0x10FFFF )
	478	{
	479	out[3] = 0x80 \| (code & 0x3F); code >>= 6;
	480	out[2] = 0x80 \| (code & 0x3F); code >>= 6;
	481	out[1] = 0x80 \| (code & 0x3F); code >>= 6;
	482	out[0] = 0xF0 \| code;
	483	out += 4;
	484	}
	485	else
	486	{
	487	// invalid range, skip
	488	}
	489	}
	490
	491	wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
	492	wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
5c69ef61	493
9a6d1438 RR	494	return result;
9a6d1438 RR	495	}
5c69ef61	496
9a6d1438 RR	497	wxU16CharBuffer wxUString::utf16_str() const
	498	{
	499	size_type utf16_length = 0;
	500	const wxChar32 *ptr = data();
5c69ef61	501
9a6d1438 RR	502	while (*ptr)
	503	{
	504	wxChar32 code = *ptr;
	505	ptr++;
5c69ef61	506
9a6d1438	507	// TODO: error range checks
5c69ef61	508
9a6d1438 RR	509	if (code < 0x10000)
	510	utf16_length++;
	511	else
	512	utf16_length += 2;
	513	}
5c69ef61	514
9a6d1438 RR	515	wxU16CharBuffer result( utf16_length );
9a6d1438 RR	516	wxChar16 *out = result.data();
5c69ef61	517
9a6d1438	518	ptr = data();
5c69ef61	519
9a6d1438 RR	520	while (*ptr)
	521	{
	522	wxChar32 code = *ptr;
	523	ptr++;
5c69ef61	524
9a6d1438	525	// TODO: error range checks
5c69ef61	526
9a6d1438 RR	527	if (code < 0x10000)
	528	{
	529	out[0] = code;
	530	out++;
	531	}
	532	else
	533	{
	534	out[0] = (code - 0x10000) / 0x400 + 0xd800;
	535	out[1] = (code - 0x10000) % 0x400 + 0xdc00;
	536	out += 2;
	537	}
	538	}
5c69ef61 VZ	539
5c69ef61 VZ	540	return result;
9a6d1438	541	}