From 5367a38ad3bca064575f0c857a69550a99a0cc4f Mon Sep 17 00:00:00 2001 From: =?utf8?q?V=C3=A1clav=20Slav=C3=ADk?= Date: Tue, 28 Aug 2007 10:40:40 +0000 Subject: [PATCH] optimized wxMBConvStrictUTF8::ToWChar() for ASCII characters git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@48427 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- src/common/strconv.cpp | 93 ++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index eeaa071f31..609f44b9d1 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -778,58 +778,73 @@ wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen, return written; } - unsigned char c = *p; - unsigned len = tableUtf8Lengths[c]; - if ( !len ) + if ( out && !dstLen-- ) break; - if ( srcLen < len ) // the test works for wxNO_LEN too - break; + wxUint32 code; + unsigned char c = *p; - if ( srcLen != wxNO_LEN ) - srcLen -= len; + if ( c < 0x80 ) + { + if ( srcLen == 0 ) // the test works for wxNO_LEN too + break; - if ( out && !dstLen-- ) - break; + if ( srcLen != wxNO_LEN ) + srcLen--; + code = c; + } + else + { + unsigned len = tableUtf8Lengths[c]; + if ( !len ) + break; - // Char. number range | UTF-8 octet sequence - // (hexadecimal) | (binary) - // ----------------------+--------------------------------------------- - // 0000 0000 - 0000 007F | 0xxxxxxx - // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx - // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx - // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - // - // Code point value is stored in bits marked with 'x', lowest-order bit - // of the value on the right side in the diagram above. 
- // (from RFC 3629) + if ( srcLen < len ) // the test works for wxNO_LEN too + break; - // mask to extract lead byte's value ('x' bits above), by sequence length: - static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 }; + if ( srcLen != wxNO_LEN ) + srcLen -= len; - // mask and value of lead byte's most significant bits, by length: - static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 }; - static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 }; + // Char. number range | UTF-8 octet sequence + // (hexadecimal) | (binary) + // ----------------------+---------------------------------------- + // 0000 0000 - 0000 007F | 0xxxxxxx + // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx + // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // Code point value is stored in bits marked with 'x', + // lowest-order bit of the value on the right side in the diagram + // above. (from RFC 3629) - len--; // it's more convenient to work with 0-based length here + // mask to extract lead byte's value ('x' bits above), by sequence + // length: + static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 }; - // extract the lead byte's value bits: - if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] ) - break; + // mask and value of lead byte's most significant bits, by length: + static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 }; + static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 }; - wxUint32 code = c & leadValueMask[len]; + len--; // it's more convenient to work with 0-based length here - // all remaining bytes, if any, are handled in the same way regardless of - // sequence's length: - for ( ; len; --len ) - { - c = *++p; - if ( (c & 0xC0) != 0x80 ) - return wxCONV_FAILED; + // extract the lead byte's value bits: + if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] ) + break; - code <<= 6; - code |= c & 
0x3F; + code = c & leadValueMask[len]; + + // all remaining bytes, if any, are handled in the same way + // regardless of sequence's length: + for ( ; len; --len ) + { + c = *++p; + if ( (c & 0xC0) != 0x80 ) + return wxCONV_FAILED; + + code <<= 6; + code |= c & 0x3F; + } } #ifdef WC_UTF16 -- 2.45.2