X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/467175ab3f2177aa824ceb7b67934fd54ce4e8e0..c80d4c1e207b0011db61cb2ce1cc8babe8e54582:/src/common/stringops.cpp?ds=sidebyside diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp index ac0455da53..1592706872 100644 --- a/src/common/stringops.cpp +++ b/src/common/stringops.cpp @@ -4,7 +4,6 @@ // Author: Vaclav Slavik // Modified by: // Created: 2007-04-16 -// RCS-ID: $Id$ // Copyright: (c) 2007 REA Elektronik GmbH // Licence: wxWindows licence ///////////////////////////////////////////////////////////////////////////// @@ -34,7 +33,7 @@ // UTF-8 sequences lengths // --------------------------------------------------------------------------- -unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = { +const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = { // single-byte sequences (ASCII): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F @@ -87,17 +86,26 @@ unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = { // U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | // -------------------+----------+----------+----------+----------+ -bool wxStringOperationsUtf8::IsValidUtf8String(const char *str) +bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len) { if ( !str ) return true; // empty string is UTF8 string const unsigned char *c = (const unsigned char*)str; + const unsigned char * const end = (len == wxStringImpl::npos) ? NULL : c + len; - for ( ; *c; ++c ) + for ( ; c != end && *c; ++c ) { unsigned char b = *c; + if ( end != NULL ) + { + // if the string is not NULL-terminated, verify we have enough + // bytes in it left for current character's encoding: + if ( c + ms_utf8IterTable[*c] > end ) + return false; + } + if ( b <= 0x7F ) // 00..7F continue; @@ -122,7 +130,16 @@ bool wxStringOperationsUtf8::IsValidUtf8String(const char *str) if ( !(b >= 0x80 && b <= 0xBF ) ) return false; } - else if ( b <= 0xEF ) // E1..EF + else if ( b == 0xED ) + { + b = *(++c); + if ( !(b >= 0x80 && b <= 0x9F ) ) + return false; + b = *(++c); + if ( !(b >= 0x80 && b <= 0xBF ) ) + return false; + } + else if ( b <= 0xEF ) // E1..EC EE..EF { for ( int i = 0; i < 2; ++i ) { @@ -173,21 +190,14 @@ bool wxStringOperationsUtf8::IsValidUtf8String(const char *str) return true; } -#ifdef __WXDEBUG__ -bool wxStringOperationsUtf8::IsValidUtf8LeadByte(unsigned char c) -{ - return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4); -} -#endif - - -wxStringOperationsUtf8::Utf8CharBuffer -wxStringOperationsUtf8::EncodeChar(const wxUniChar& ch) +// NB: this is in this file and not unichar.cpp to keep all UTF-8 encoding +// code in single place +wxUniChar::Utf8CharBuffer wxUniChar::AsUTF8() const { - Utf8CharBuffer buf; + Utf8CharBuffer buf = { "" }; // init to avoid g++ 4.1 warning with -O2 char *out = buf.data; - wxUniChar::value_type code = ch.GetValue(); + value_type code = GetValue(); // Char. number range | UTF-8 octet sequence // (hexadecimal) | (binary) @@ -231,7 +241,7 @@ wxStringOperationsUtf8::EncodeChar(const wxUniChar& ch) } else { - wxFAIL_MSG( _T("trying to encode undefined Unicode character") ); + wxFAIL_MSG( wxT("trying to encode undefined Unicode character") ); out[0] = 0; } @@ -239,13 +249,12 @@ wxStringOperationsUtf8::EncodeChar(const wxUniChar& ch) } wxUniChar -wxStringOperationsUtf8::DecodeChar(wxStringImpl::const_iterator i) +wxStringOperationsUtf8::DecodeNonAsciiChar(wxStringImpl::const_iterator i) { wxASSERT( IsValidUtf8LeadByte(*i) ); - wxUniChar::value_type code = 0; size_t len = GetUtf8CharLength(*i); - wxASSERT_MSG( len <= 4, _T("invalid UTF-8 sequence length") ); + wxASSERT_MSG( len <= 4, wxT("invalid UTF-8 sequence length") ); // Char. number range | UTF-8 octet sequence // (hexadecimal) | (binary) @@ -261,7 +270,7 @@ wxStringOperationsUtf8::DecodeChar(wxStringImpl::const_iterator i) // mask to extract lead byte's value ('x' bits above), by sequence's length: static const unsigned char s_leadValueMask[4] = { 0x7F, 0x1F, 0x0F, 0x07 }; -#ifdef __WXDEBUG__ +#if wxDEBUG_LEVEL // mask and value of lead byte's most significant bits, by length: static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 }; static const unsigned char s_leadMarkerVal[4] = { 0x00, 0xC0, 0xE0, 0xF0 }; @@ -270,15 +279,15 @@ wxStringOperationsUtf8::DecodeChar(wxStringImpl::const_iterator i) // extract the lead byte's value bits: wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) == s_leadMarkerVal[len-1], - _T("invalid UTF-8 lead byte") ); - code = (unsigned char)*i & s_leadValueMask[len-1]; + wxT("invalid UTF-8 lead byte") ); + wxUniChar::value_type code = (unsigned char)*i & s_leadValueMask[len-1]; // all remaining bytes, if any, are handled in the same way regardless of // sequence's length: for ( ++i ; len > 1; --len, ++i ) { wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80, - _T("invalid UTF-8 byte") ); + wxT("invalid UTF-8 byte") ); code <<= 6; code |= (unsigned char)*i & 0x3F;