// UTF-8 sequences lengths
// ---------------------------------------------------------------------------
-unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
+const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
// single-byte sequences (ASCII):
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
// U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
// -------------------+----------+----------+----------+----------+
-bool wxStringOperationsUtf8::IsValidUtf8String(const char *str)
+bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
{
if ( !str )
return true; // empty string is UTF8 string
const unsigned char *c = (const unsigned char*)str;
+ const unsigned char * const end = (len == wxStringImpl::npos) ? NULL : c + len;
- for ( ; *c; ++c )
+ for ( ; c != end && *c; ++c )
{
unsigned char b = *c;
+ if ( end != NULL )
+ {
+ // if the string is not NUL-terminated, verify that enough bytes
+ // remain in it for the current character's encoding:
+ if ( c + ms_utf8IterTable[*c] > end )
+ return false;
+ }
+
if ( b <= 0x7F ) // 00..7F
continue;
if ( !(b >= 0x80 && b <= 0xBF ) )
return false;
}
- else if ( b <= 0xEF ) // E1..EF
+ else if ( b == 0xED )
+ {
+ b = *(++c);
+ if ( !(b >= 0x80 && b <= 0x9F ) )
+ return false;
+ b = *(++c);
+ if ( !(b >= 0x80 && b <= 0xBF ) )
+ return false;
+ }
+ else if ( b <= 0xEF ) // E1..EC EE..EF
{
for ( int i = 0; i < 2; ++i )
{
return true;
}
-#ifdef __WXDEBUG__
-bool wxStringOperationsUtf8::IsValidUtf8LeadByte(unsigned char c)
-{
- return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
-}
-#endif
-
-
-wxStringOperationsUtf8::Utf8CharBuffer
-wxStringOperationsUtf8::EncodeChar(const wxUniChar& ch)
+// NB: this is in this file and not unichar.cpp to keep all UTF-8 encoding
+// code in a single place
+wxUniChar::Utf8CharBuffer wxUniChar::AsUTF8() const
{
- Utf8CharBuffer buf;
+ Utf8CharBuffer buf = { "" }; // init to avoid g++ 4.1 warning with -O2
char *out = buf.data;
- wxUniChar::value_type code = ch.GetValue();
+ value_type code = GetValue();
// Char. number range | UTF-8 octet sequence
// (hexadecimal) | (binary)
}
else
{
- wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
+ wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
out[0] = 0;
}
}
wxUniChar
-wxStringOperationsUtf8::DecodeChar(wxStringImpl::const_iterator i)
+wxStringOperationsUtf8::DecodeNonAsciiChar(wxStringImpl::const_iterator i)
{
wxASSERT( IsValidUtf8LeadByte(*i) );
- wxUniChar::value_type code = 0;
size_t len = GetUtf8CharLength(*i);
- wxASSERT_MSG( len <= 4, _T("invalid UTF-8 sequence length") );
+ wxASSERT_MSG( len <= 4, wxT("invalid UTF-8 sequence length") );
// Char. number range | UTF-8 octet sequence
// (hexadecimal) | (binary)
// mask to extract lead byte's value ('x' bits above), by sequence's length:
static const unsigned char s_leadValueMask[4] = { 0x7F, 0x1F, 0x0F, 0x07 };
-#ifdef __WXDEBUG__
+#if wxDEBUG_LEVEL
// mask and value of lead byte's most significant bits, by length:
static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
static const unsigned char s_leadMarkerVal[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
// extract the lead byte's value bits:
wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
s_leadMarkerVal[len-1],
- _T("invalid UTF-8 lead byte") );
- code = (unsigned char)*i & s_leadValueMask[len-1];
+ wxT("invalid UTF-8 lead byte") );
+ wxUniChar::value_type code = (unsigned char)*i & s_leadValueMask[len-1];
// all remaining bytes, if any, are handled in the same way regardless of
// sequence's length:
for ( ++i ; len > 1; --len, ++i )
{
wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
- _T("invalid UTF-8 byte") );
+ wxT("invalid UTF-8 byte") );
code <<= 6;
code |= (unsigned char)*i & 0x3F;