From: Václav Slavík Date: Thu, 3 May 2007 11:05:04 +0000 (+0000) Subject: added code for optimized handling of UTF-8 locales: some string operations are more... X-Git-Url: https://git.saurik.com/wxWidgets.git/commitdiff_plain/111d99489d509bc96819877b88596474f3253859 added code for optimized handling of UTF-8 locales: some string operations are more efficient under it and it's possible to completely compile-out support for other locales if the target system is known to only use UTF-8 locales git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@45782 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- diff --git a/configure b/configure index 44f30a7654..ca04b7a9f9 100755 --- a/configure +++ b/configure @@ -1001,6 +1001,7 @@ Optional Features: --enable-mimetype use wxMimeTypesManager --enable-mslu use MS Layer for Unicode on Windows 9x (Win32 only) --enable-utf8 use UTF-8 representation for strings (Unix only) + --enable-utf8only only support UTF-8 locales in UTF-8 build (Unix only) --enable-snglinst use wxSingleInstanceChecker class --enable-std_iostreams use standard C++ stream classes --enable-std_string use standard C++ string classes @@ -2389,6 +2390,7 @@ if test $DEBUG_CONFIGURE = 1; then DEFAULT_wxUSE_UNICODE=no DEFAULT_wxUSE_UNICODE_MSLU=no DEFAULT_wxUSE_UNICODE_UTF8=no + DEFAULT_wxUSE_UNICODE_UTF8_LOCALE=no DEFAULT_wxUSE_WCSRTOMBS=no DEFAULT_wxUSE_PALETTE=no @@ -2616,6 +2618,7 @@ else DEFAULT_wxUSE_UNICODE=no DEFAULT_wxUSE_UNICODE_MSLU=yes DEFAULT_wxUSE_UNICODE_UTF8=no + DEFAULT_wxUSE_UNICODE_UTF8_LOCALE=no DEFAULT_wxUSE_WCSRTOMBS=no DEFAULT_wxUSE_PALETTE=yes @@ -6066,6 +6069,47 @@ echo "${ECHO_T}no" >&6 fi + enablestring= + echo "$as_me:$LINENO: checking for --${enablestring:-enable}-utf8only" >&5 +echo $ECHO_N "checking for --${enablestring:-enable}-utf8only... $ECHO_C" >&6 + no_cache=0 + # Check whether --enable-utf8only or --disable-utf8only was given. 
+if test "${enable_utf8only+set}" = set; then + enableval="$enable_utf8only" + + if test "$enableval" = yes; then + ac_cv_use_utf8only='wxUSE_UNICODE_UTF8_LOCALE=yes' + else + ac_cv_use_utf8only='wxUSE_UNICODE_UTF8_LOCALE=no' + fi + +else + + LINE=`grep "wxUSE_UNICODE_UTF8_LOCALE" ${wx_arg_cache_file}` + if test "x$LINE" != x ; then + eval "DEFAULT_$LINE" + else + no_cache=1 + fi + + ac_cv_use_utf8only='wxUSE_UNICODE_UTF8_LOCALE='$DEFAULT_wxUSE_UNICODE_UTF8_LOCALE + +fi; + + eval "$ac_cv_use_utf8only" + if test "$no_cache" != 1; then + echo $ac_cv_use_utf8only >> ${wx_arg_cache_file}.tmp + fi + + if test "$wxUSE_UNICODE_UTF8_LOCALE" = yes; then + echo "$as_me:$LINENO: result: yes" >&5 +echo "${ECHO_T}yes" >&6 + else + echo "$as_me:$LINENO: result: no" >&5 +echo "${ECHO_T}no" >&6 + fi + + enablestring= echo "$as_me:$LINENO: checking for --${enablestring:-enable}-snglinst" >&5 echo $ECHO_N "checking for --${enablestring:-enable}-snglinst... $ECHO_C" >&6 @@ -39098,7 +39142,6 @@ echo $ECHO_N "checking how many arguments gethostbyname_r() takes... $ECHO_C" >& else -################################################################ ac_cv_func_which_gethostbyname_r=unknown @@ -39340,7 +39383,6 @@ rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi -################################################################ fi @@ -39498,19 +39540,103 @@ _ACEOF fi fi - -echo "$as_me:$LINENO: checking how many arguments getservbyname_r() takes" >&5 -echo $ECHO_N "checking how many arguments getservbyname_r() takes... $ECHO_C" >&6 + echo "$as_me:$LINENO: checking for getservbyname_r" >&5 +echo $ECHO_N "checking for getservbyname_r... 
$ECHO_C" >&6 if test "${ac_cv_func_which_getservbyname_r+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cc -ac_cpp='$CXXCPP $CPPFLAGS' -ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_cxx_compiler_gnu +echo "$as_me:$LINENO: checking for getservbyname_r" >&5 +echo $ECHO_N "checking for getservbyname_r... $ECHO_C" >&6 +if test "${ac_cv_func_getservbyname_r+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +/* Define getservbyname_r to an innocuous variant, in case <limits.h> declares getservbyname_r. + For example, HP-UX 11i <limits.h> declares gettimeofday. */ +#define getservbyname_r innocuous_getservbyname_r + +/* System header to define __stub macros and hopefully few prototypes, + which can conflict with char getservbyname_r (); below. + Prefer <limits.h> to <assert.h> if __STDC__ is defined, since + <limits.h> exists even on freestanding compilers. */ + +#ifdef __STDC__ +# include <limits.h> +#else +# include <assert.h> +#endif + +#undef getservbyname_r + +/* Override any gcc2 internal prototype to avoid an error. */ +#ifdef __cplusplus +extern "C" +{ +#endif +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char getservbyname_r (); +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_getservbyname_r) || defined (__stub___getservbyname_r) +choke me +#else +char (*f) () = getservbyname_r; +#endif +#ifdef __cplusplus +} +#endif + +int +main () +{ +return f != getservbyname_r; + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext conftest$ac_exeext +if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 + (eval $ac_link) 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && + { ac_try='test -z "$ac_c_werror_flag" + || test ! -s conftest.err' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; } && + { ac_try='test -s conftest$ac_exeext' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + ac_cv_func_getservbyname_r=yes +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 +ac_cv_func_getservbyname_r=no +fi +rm -f conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +echo "$as_me:$LINENO: result: $ac_cv_func_getservbyname_r" >&5 +echo "${ECHO_T}$ac_cv_func_getservbyname_r" >&6 +if test $ac_cv_func_getservbyname_r = yes; then cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ @@ -39518,17 +39644,20 @@ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. 
*/ -#include <netdb.h> + +# include <netdb.h> + int main () { - char *name; - char *proto; - struct servent *se, *res; - char buffer[2048]; - int buflen = 2048; - (void) getservbyname_r(name, proto, se, buffer, buflen, &res) + + char *name; + char *proto; + struct servent *se; + struct servent_data data; + (void) getservbyname_r(name, proto, se, &data); + ; return 0; @@ -39543,7 +39672,7 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && - { ac_try='test -z "$ac_cxx_werror_flag" + { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 @@ -39556,29 +39685,31 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then - ac_cv_func_which_getservbyname_r=six + ac_cv_func_which_getservbyname_r=four else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ -#include <netdb.h> + +# include <netdb.h> + int main () { - char *name; - char *proto; - struct servent *se; - char buffer[2048]; - int buflen = 2048; - (void) getservbyname_r(name, proto, se, buffer, buflen) + char *name; + char *proto; + struct servent *se, *res; + char buffer[2048]; + int buflen = 2048; + (void) getservbyname_r(name, proto, se, buffer, buflen, &res) ; return 0; @@ -39593,7 +39724,7 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && - { ac_try='test -z "$ac_cxx_werror_flag" + { ac_try='test -z "$ac_c_werror_flag" || test ! 
-s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 @@ -39606,28 +39737,31 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then - ac_cv_func_which_getservbyname_r=five + ac_cv_func_which_getservbyname_r=six else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ -#include <netdb.h> + +# include <netdb.h> + int main () { - char *name; - char *proto; - struct servent *se; - struct servent_data data; - (void) getservbyname_r(name, proto, se, &data); + char *name; + char *proto; + struct servent *se; + char buffer[2048]; + int buflen = 2048; + (void) getservbyname_r(name, proto, se, buffer, buflen) ; return 0; @@ -39642,7 +39776,7 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && - { ac_try='test -z "$ac_cxx_werror_flag" + { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 @@ -39655,30 +39789,28 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then - ac_cv_func_which_getservbyname_r=four + ac_cv_func_which_getservbyname_r=five else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_func_which_getservbyname_r=no - fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext + + fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - +else + ac_cv_func_which_getservbyname_r=no +fi fi echo "$as_me:$LINENO: result: $ac_cv_func_which_getservbyname_r" >&5 @@ -39699,6 +39831,7 @@ elif test $ac_cv_func_which_getservbyname_r = four; then #define HAVE_FUNC_GETSERVBYNAME_R_4 1 _ACEOF + fi @@ -43472,6 +43605,13 @@ if test "$wxUSE_UNICODE" = "yes" -a "$wxUSE_UNICODE_UTF8" = "yes"; then #define wxUSE_UNICODE_UTF8 1 _ACEOF + + if test "$wxUSE_UNICODE_UTF8_LOCALE" = "yes"; then + cat >>confdefs.h <<\_ACEOF +#define wxUSE_UTF8_LOCALE_ONLY 1 +_ACEOF + + fi fi if test "$wxUSE_wxUSE_EXPERIMENTAL_PRINTF" = "yes"; then @@ -46067,7 +46207,10 @@ echo "${ECHO_T}$bakefile_cv_prog_makeisgnu" >&6 PLATFORM_BEOS=1 ;; * ) - ;; + { { echo "$as_me:$LINENO: error: Unknown platform: $BAKEFILE_FORCE_PLATFORM" >&5 +echo "$as_me: error: Unknown platform: $BAKEFILE_FORCE_PLATFORM" >&2;} + { (exit 1); exit 1; }; } + ;; esac fi @@ -48802,10 +48945,21 @@ echo "${ECHO_T}no" >&6 cppunit_major_min=`echo $cppunit_version_min | \ sed 's/\([0-9]*\).\([0-9]*\).\([0-9]*\)/\1/'` + if test "x${cppunit_major_min}" = "x" ; then + cppunit_major_min=0 + fi + cppunit_minor_min=`echo $cppunit_version_min | \ sed 's/\([0-9]*\).\([0-9]*\).\([0-9]*\)/\2/'` + if test "x${cppunit_minor_min}" = "x" ; then + cppunit_minor_min=0 + fi + cppunit_micro_min=`echo $cppunit_version_min | \ sed 
's/\([0-9]*\).\([0-9]*\).\([0-9]*\)/\3/'` + if test "x${cppunit_micro_min}" = "x" ; then + cppunit_micro_min=0 + fi cppunit_version_proper=`expr \ $cppunit_major_version \> $cppunit_major_min \| \ diff --git a/configure.in b/configure.in index 39103c8504..ce7e311a4f 100644 --- a/configure.in +++ b/configure.in @@ -578,6 +578,7 @@ if test $DEBUG_CONFIGURE = 1; then DEFAULT_wxUSE_UNICODE=no DEFAULT_wxUSE_UNICODE_MSLU=no DEFAULT_wxUSE_UNICODE_UTF8=no + DEFAULT_wxUSE_UNICODE_UTF8_LOCALE=no DEFAULT_wxUSE_WCSRTOMBS=no DEFAULT_wxUSE_PALETTE=no @@ -805,6 +806,7 @@ else DEFAULT_wxUSE_UNICODE=no DEFAULT_wxUSE_UNICODE_MSLU=yes DEFAULT_wxUSE_UNICODE_UTF8=no + DEFAULT_wxUSE_UNICODE_UTF8_LOCALE=no DEFAULT_wxUSE_WCSRTOMBS=no DEFAULT_wxUSE_PALETTE=yes @@ -993,6 +995,7 @@ WX_ARG_ENABLE(mimetype, [ --enable-mimetype use wxMimeTypesManager], WX_ARG_ENABLE(mslu, [ --enable-mslu use MS Layer for Unicode on Windows 9x (Win32 only)], wxUSE_UNICODE_MSLU) dnl FIXME-UTF8: make UTF8 automatic WX_ARG_ENABLE(utf8, [ --enable-utf8 use UTF-8 representation for strings (Unix only)], wxUSE_UNICODE_UTF8) +WX_ARG_ENABLE(utf8only, [ --enable-utf8only only support UTF-8 locales in UTF-8 build (Unix only)], wxUSE_UNICODE_UTF8_LOCALE) WX_ARG_ENABLE(snglinst, [ --enable-snglinst use wxSingleInstanceChecker class], wxUSE_SNGLINST_CHECKER) WX_ARG_ENABLE(std_iostreams, [ --enable-std_iostreams use standard C++ stream classes], wxUSE_STD_IOSTREAM) WX_ARG_ENABLE(std_string, [ --enable-std_string use standard C++ string classes], wxUSE_STD_STRING) @@ -6492,6 +6495,10 @@ fi if test "$wxUSE_UNICODE" = "yes" -a "$wxUSE_UNICODE_UTF8" = "yes"; then AC_DEFINE(wxUSE_UNICODE_UTF8) + + if test "$wxUSE_UNICODE_UTF8_LOCALE" = "yes"; then + AC_DEFINE(wxUSE_UTF8_LOCALE_ONLY) + fi fi if test "$wxUSE_wxUSE_EXPERIMENTAL_PRINTF" = "yes"; then diff --git a/include/wx/strconv.h b/include/wx/strconv.h index b9ea52f68e..4360b7f6a3 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -135,6 +135,12 @@ public: // encoding 
static size_t GetMaxMBNulLen() { return 4 /* for UTF-32 */; } +#if wxUSE_UNICODE_UTF8 + // return true if the converter's charset is UTF-8, i.e. char* strings + // decoded using this object can be directly copied to wxString's internal + // storage without converting to WC and then back to UTF-8 MB string + virtual bool IsUTF8() const { return false; } +#endif // The old conversion functions. The existing classes currently mostly // implement these ones but we're in transition to using To/FromWChar() @@ -175,6 +181,10 @@ public: virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; virtual wxMBConv *Clone() const { return new wxMBConvLibc; } + +#if wxUSE_UNICODE_UTF8 + virtual bool IsUTF8() const { return wxLocaleIsUtf8; } +#endif }; #ifdef __UNIX__ @@ -244,6 +254,8 @@ public: class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv { public: + // FIXME-UTF8: split this class into multiple classes, one strict and + // other lossy (PUA, OCTAL mappings) enum { MAP_INVALID_UTF8_NOT = 0, @@ -257,6 +269,12 @@ public: virtual wxMBConv *Clone() const { return new wxMBConvUTF8(m_options); } +#if wxUSE_UNICODE_UTF8 + // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't + // take the shortcut in that case + virtual bool IsUTF8() const { return m_options == MAP_INVALID_UTF8_NOT; } +#endif + private: int m_options; }; diff --git a/include/wx/string.h b/include/wx/string.h index 346316a492..063115b34e 100644 --- a/include/wx/string.h +++ b/include/wx/string.h @@ -201,7 +201,7 @@ public: const wchar_t* AsWChar() const; operator const wchar_t*() const { return AsWChar(); } -#if !wxUSE_UNICODE +#if !wxUSE_UNICODE || wxUSE_UTF8_LOCALE_ONLY inline #endif const char* AsChar() const; @@ -477,9 +477,6 @@ private: #else // wxUSE_UNICODE_UTF8 - // FIXME-UTF8: return as-is without copying under UTF8 locale, return - // converted string under other locales - needs wxCharBuffer - // changes static wxCharBuffer ImplStr(const char* str, const 
wxMBConv& conv = wxConvLibc) { return ConvertStr(str, npos, conv).data; } @@ -931,8 +928,7 @@ public: { return wxStdWideString(wc_str()); } #endif - #if !wxUSE_UNICODE && wxUSE_STL_BASED_WXSTRING - // FIXME-UTF8: do this in UTF8 build #if wxUSE_UTF8_LOCALE_ONLY, too + #if (!wxUSE_UNICODE || wxUSE_UTF8_LOCALE_ONLY) && wxUSE_STL_BASED_WXSTRING // wxStringImpl is std::string in the encoding we want operator const std::string&() const { return m_impl; } #else @@ -941,8 +937,7 @@ public: // FIXME-UTF8: broken for embedded NULs { return std::string(mb_str()); } #endif - -#endif // wxUSE_STD_STRING +#endif // wxUSE_STL // first valid index position const_iterator begin() const { return const_iterator(m_impl.begin()); } @@ -1161,7 +1156,13 @@ public: // type differs because a function may either return pointer to the buffer // directly or have to use intermediate buffer for translation. #if wxUSE_UNICODE + +#if wxUSE_UTF8_LOCALE_ONLY + const char* mb_str() const { return wx_str(); } + const wxCharBuffer mb_str(const wxMBConv& conv) const; +#else const wxCharBuffer mb_str(const wxMBConv& conv = wxConvLibc) const; +#endif const wxWX2MBbuf mbc_str() const { return mb_str(*wxConvCurrent); } @@ -2428,7 +2429,7 @@ private: T *m_buf; }; -#if wxUSE_UNICODE +#if wxUSE_UNICODE && !wxUSE_UTF8_LOCALE_ONLY ConvertedBuffer m_convertedToChar; #endif #if !wxUSE_UNICODE_WCHAR @@ -2821,10 +2822,10 @@ inline const wchar_t* wxCStrData::AsWChar() const } #endif // wxUSE_UNICODE_WCHAR -#if !wxUSE_UNICODE +#if !wxUSE_UNICODE || wxUSE_UTF8_LOCALE_ONLY inline const char* wxCStrData::AsChar() const { - return m_str->wx_str() + m_offset; + return wxStringOperations::AddToIter(m_str->wx_str(), m_offset); } #endif // !wxUSE_UNICODE diff --git a/include/wx/stringops.h b/include/wx/stringops.h index 2cfbe188e4..ed53532d9e 100644 --- a/include/wx/stringops.h +++ b/include/wx/stringops.h @@ -65,7 +65,8 @@ struct WXDLLIMPEXP_BASE wxStringOperationsWchar struct WXDLLIMPEXP_BASE wxStringOperationsUtf8 { // 
checks correctness of UTF-8 sequence - static bool IsValidUtf8String(const char *c); + static bool IsValidUtf8String(const char *c, + size_t len = wxStringImpl::npos); #ifdef __WXDEBUG__ static bool IsValidUtf8LeadByte(unsigned char c); #endif diff --git a/setup.h.in b/setup.h.in index ec9d560f63..14e379d9b5 100644 --- a/setup.h.in +++ b/setup.h.in @@ -562,6 +562,8 @@ #define wxUSE_UNICODE_UTF8 0 +#define wxUSE_UTF8_LOCALE_ONLY 0 + #define wxUSE_DC_CACHEING 0 #define wxUSE_GADGETS 0 diff --git a/src/common/string.cpp b/src/common/string.cpp index f9f389cfa2..daa4901016 100644 --- a/src/common/string.cpp +++ b/src/common/string.cpp @@ -220,9 +220,16 @@ wxString::~wxString() } #endif -#if wxUSE_UNICODE +#if wxUSE_UNICODE && !wxUSE_UTF8_LOCALE_ONLY const char* wxCStrData::AsChar() const { +#if wxUSE_UNICODE_UTF8 + if ( wxLocaleIsUtf8 ) + return AsInternal(); +#endif + // under non-UTF8 locales, we have to convert the internal UTF-8 + // representation using wxConvLibc and cache the result + wxString *str = wxConstCast(m_str, wxString); // convert the string: @@ -244,7 +251,7 @@ const char* wxCStrData::AsChar() const // and keep it: return str->m_convertedToChar + m_offset; } -#endif // wxUSE_UNICODE +#endif // wxUSE_UNICODE && !wxUSE_UTF8_LOCALE_ONLY #if !wxUSE_UNICODE_WCHAR const wchar_t* wxCStrData::AsWChar() const @@ -306,14 +313,23 @@ wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength, wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength, const wxMBConv& conv) { - // FIXME-UTF8: return as-is without copying under UTF8 locale, return - // converted string under other locales - needs wxCharBuffer - // changes - // anything to do? 
if ( !psz || nLength == 0 ) return SubstrBufFromMB("", 0); + // if psz is already in UTF-8, we don't have to do the roundtrip to + // wchar_t* and back: + if ( conv.IsUTF8() ) + { + // we need to validate the input because UTF8 iterators assume valid + // UTF-8 sequence and psz may be invalid: + if ( wxStringOperations::IsValidUtf8String(psz, nLength) ) + { + return SubstrBufFromMB(wxCharBuffer::CreateNonOwned(psz), nLength); + } + // else: do the roundtrip through wchar_t* + } + if ( nLength == npos ) nLength = wxNO_LEN; @@ -373,8 +389,9 @@ const wxWCharBuffer wxString::wc_str() const const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const { - // FIXME-UTF8: optimize the case when conv==wxConvUTF8 or wxConvLibc - // under UTF8 locale + if ( conv.IsUTF8() ) + return wxCharBuffer::CreateNonOwned(m_impl.c_str()); + // FIXME-UTF8: use wc_str() here once we have buffers with length size_t wcLen; diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp index ac0455da53..66a461a6bb 100644 --- a/src/common/stringops.cpp +++ b/src/common/stringops.cpp @@ -87,17 +87,26 @@ unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = { // U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | // -------------------+----------+----------+----------+----------+ -bool wxStringOperationsUtf8::IsValidUtf8String(const char *str) +bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len) { if ( !str ) return true; // empty string is UTF8 string const unsigned char *c = (const unsigned char*)str; + const unsigned char * const end = (len == wxStringImpl::npos) ? 
NULL : c + len; - for ( ; *c; ++c ) + for ( ; c != end && *c; ++c ) { unsigned char b = *c; + if ( end != NULL ) + { + // if the string is not NULL-terminated, verify we have enough + // bytes in it left for current character's encoding: + if ( c + ms_utf8IterTable[*c] > end ) + return false; + } + if ( b <= 0x7F ) // 00..7F continue; diff --git a/src/common/strvararg.cpp b/src/common/strvararg.cpp index f18e2f0bc1..dc59f13556 100644 --- a/src/common/strvararg.cpp +++ b/src/common/strvararg.cpp @@ -41,7 +41,7 @@ const wxStringCharType *wxArgNormalizerNative::get() const return m_value.AsInternal(); } -#if wxUSE_UNICODE_UTF8 +#if wxUSE_UNICODE_UTF8 && !wxUSE_UTF8_LOCALE_ONLY wxArgNormalizerWchar::wxArgNormalizerWchar(const wxString& s) : wxArgNormalizerWithBuffer(s.wc_str()) { @@ -51,7 +51,7 @@ wxArgNormalizerWchar::wxArgNormalizerWchar(const wxCStrData& : wxArgNormalizerWithBuffer(s.AsWCharBuf()) { } -#endif // wxUSE_UNICODE_UTF8 +#endif // wxUSE_UNICODE_UTF8 && !wxUSE_UTF8_LOCALE_ONLY wxString wxArgNormalizedString::GetString() const { diff --git a/src/common/unichar.cpp b/src/common/unichar.cpp index f533a9ce41..fa9890365f 100644 --- a/src/common/unichar.cpp +++ b/src/common/unichar.cpp @@ -41,10 +41,15 @@ wxUniChar::value_type wxUniChar::From8bit(char c) if ( (unsigned char)c < 0x80 ) return c; +#if wxUSE_UTF8_LOCALE_ONLY + wxFAIL_MSG( _T("invalid UTF-8 character") ); + return wxT('?'); // FIXME-UTF8: what to use as failure character? +#else wchar_t buf[2]; if ( wxConvLibc.ToWChar(buf, 2, &c, 1) != 2 ) return wxT('?'); // FIXME-UTF8: what to use as failure character? return buf[0]; +#endif } /* static */ @@ -54,11 +59,16 @@ char wxUniChar::To8bit(wxUniChar::value_type c) if ( c < 0x80 ) return c; +#if wxUSE_UTF8_LOCALE_ONLY + wxFAIL_MSG( _T("character cannot be converted to single UTF-8 byte") ); + return '?'; // FIXME-UTF8: what to use as failure character? 
+#else wchar_t in = c; char buf[2]; if ( wxConvLibc.FromWChar(buf, 2, &in, 1) != 2 ) return '?'; // FIXME-UTF8: what to use as failure character? return buf[0]; +#endif }