added code for optimized handling of UTF-8 locales: some string operations are more...

author Václav Slavík <vslavik@fastmail.fm>

Thu, 3 May 2007 11:05:04 +0000 (11:05 +0000)

committer Václav Slavík <vslavik@fastmail.fm>

Thu, 3 May 2007 11:05:04 +0000 (11:05 +0000)
author Václav Slavík <vslavik@fastmail.fm>
Thu, 3 May 2007 11:05:04 +0000 (11:05 +0000)
committer Václav Slavík <vslavik@fastmail.fm>
Thu, 3 May 2007 11:05:04 +0000 (11:05 +0000)
diff --git a/configure b/configure

index 44f30a7654255fa86f54727061b2d58b5c456bbf..ca04b7a9f929eb4b4129e3bc1beb592e03e1f1f8 100755 (executable)
--- a/configure
+++ b/configure
@@ -1001,6 +1001,7 @@ Optional Features:
    --enable-mimetype       use wxMimeTypesManager
    --enable-mslu           use MS Layer for Unicode on Windows 9x (Win32 only)
    --enable-utf8           use UTF-8 representation for strings (Unix only)
+  --enable-utf8only      only support UTF-8 locales in UTF-8 build (Unix only)
    --enable-snglinst       use wxSingleInstanceChecker class
    --enable-std_iostreams  use standard C++ stream classes
    --enable-std_string     use standard C++ string classes
@@ -2389,6 +2390,7 @@ if test $DEBUG_CONFIGURE = 1; then
    DEFAULT_wxUSE_UNICODE=no
    DEFAULT_wxUSE_UNICODE_MSLU=no
    DEFAULT_wxUSE_UNICODE_UTF8=no
+  DEFAULT_wxUSE_UNICODE_UTF8_LOCALE=no
    DEFAULT_wxUSE_WCSRTOMBS=no
  
    DEFAULT_wxUSE_PALETTE=no
@@ -2616,6 +2618,7 @@ else
    DEFAULT_wxUSE_UNICODE=no
    DEFAULT_wxUSE_UNICODE_MSLU=yes
    DEFAULT_wxUSE_UNICODE_UTF8=no
+  DEFAULT_wxUSE_UNICODE_UTF8_LOCALE=no
    DEFAULT_wxUSE_WCSRTOMBS=no
  
    DEFAULT_wxUSE_PALETTE=yes
@@ -6066,6 +6069,47 @@ echo "${ECHO_T}no" >&6
            fi
  
  
+          enablestring=
+          echo "$as_me:$LINENO: checking for --${enablestring:-enable}-utf8only" >&5
+echo $ECHO_N "checking for --${enablestring:-enable}-utf8only... $ECHO_C" >&6
+          no_cache=0
+          # Check whether --enable-utf8only or --disable-utf8only was given.
+if test "${enable_utf8only+set}" = set; then
+  enableval="$enable_utf8only"
+
+                          if test "$enableval" = yes; then
+                            ac_cv_use_utf8only='wxUSE_UNICODE_UTF8_LOCALE=yes'
+                          else
+                            ac_cv_use_utf8only='wxUSE_UNICODE_UTF8_LOCALE=no'
+                          fi
+
+else
+
+                          LINE=`grep "wxUSE_UNICODE_UTF8_LOCALE" ${wx_arg_cache_file}`
+                          if test "x$LINE" != x ; then
+                            eval "DEFAULT_$LINE"
+                          else
+                            no_cache=1
+                          fi
+
+                          ac_cv_use_utf8only='wxUSE_UNICODE_UTF8_LOCALE='$DEFAULT_wxUSE_UNICODE_UTF8_LOCALE
+
+fi;
+
+          eval "$ac_cv_use_utf8only"
+          if test "$no_cache" != 1; then
+            echo $ac_cv_use_utf8only >> ${wx_arg_cache_file}.tmp
+          fi
+
+          if test "$wxUSE_UNICODE_UTF8_LOCALE" = yes; then
+            echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6
+          else
+            echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6
+          fi
+
+
            enablestring=
            echo "$as_me:$LINENO: checking for --${enablestring:-enable}-snglinst" >&5
  echo $ECHO_N "checking for --${enablestring:-enable}-snglinst... $ECHO_C" >&6
@@ -39098,7 +39142,6 @@ echo $ECHO_N "checking how many arguments gethostbyname_r() takes... $ECHO_C" >&
  else
  
  
-################################################################
  
  ac_cv_func_which_gethostbyname_r=unknown
  
@@ -39340,7 +39383,6 @@ rm -f conftest.err conftest.$ac_objext conftest.$ac_ext
  
  fi
  
-################################################################
  
  
  fi
@@ -39498,19 +39540,103 @@ _ACEOF
  fi
  
        fi
-
-echo "$as_me:$LINENO: checking how many arguments getservbyname_r() takes" >&5
-echo $ECHO_N "checking how many arguments getservbyname_r() takes... $ECHO_C" >&6
+                              echo "$as_me:$LINENO: checking for getservbyname_r" >&5
+echo $ECHO_N "checking for getservbyname_r... $ECHO_C" >&6
  if test "${ac_cv_func_which_getservbyname_r+set}" = set; then
    echo $ECHO_N "(cached) $ECHO_C" >&6
  else
  
-                ac_ext=cc
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+echo "$as_me:$LINENO: checking for getservbyname_r" >&5
+echo $ECHO_N "checking for getservbyname_r... $ECHO_C" >&6
+if test "${ac_cv_func_getservbyname_r+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+/* Define getservbyname_r to an innocuous variant, in case <limits.h> declares getservbyname_r.
+   For example, HP-UX 11i <limits.h> declares gettimeofday.  */
+#define getservbyname_r innocuous_getservbyname_r
+
+/* System header to define __stub macros and hopefully few prototypes,
+    which can conflict with char getservbyname_r (); below.
+    Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+    <limits.h> exists even on freestanding compilers.  */
+
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+
+#undef getservbyname_r
+
+/* Override any gcc2 internal prototype to avoid an error.  */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+/* We use char because int might match the return type of a gcc2
+   builtin and then its argument prototype would still apply.  */
+char getservbyname_r ();
+/* The GNU C library defines this for functions which it implements
+    to always fail with ENOSYS.  Some functions are actually named
+    something starting with __ and the normal name is an alias.  */
+#if defined (__stub_getservbyname_r) || defined (__stub___getservbyname_r)
+choke me
+#else
+char (*f) () = getservbyname_r;
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+int
+main ()
+{
+return f != getservbyname_r;
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+  (eval $ac_link) 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+        { ac_try='test -z "$ac_c_werror_flag"
+                        || test ! -s conftest.err'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+        { ac_try='test -s conftest$ac_exeext'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_func_getservbyname_r=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
  
+ac_cv_func_getservbyname_r=no
+fi
+rm -f conftest.err conftest.$ac_objext \
+      conftest$ac_exeext conftest.$ac_ext
+fi
+echo "$as_me:$LINENO: result: $ac_cv_func_getservbyname_r" >&5
+echo "${ECHO_T}$ac_cv_func_getservbyname_r" >&6
+if test $ac_cv_func_getservbyname_r = yes; then
  
          cat >conftest.$ac_ext <<_ACEOF
  /* confdefs.h.  */
@@ -39518,17 +39644,20 @@ _ACEOF
  cat confdefs.h >>conftest.$ac_ext
  cat >>conftest.$ac_ext <<_ACEOF
  /* end confdefs.h.  */
-#include <netdb.h>
+
+#               include <netdb.h>
+
  int
  main ()
  {
  
-                char *name;
-                char *proto;
-                struct servent *se, *res;
-                char buffer[2048];
-                int buflen = 2048;
-                (void) getservbyname_r(name, proto, se, buffer, buflen, &res)
+
+        char *name;
+        char *proto;
+        struct servent *se;
+        struct servent_data data;
+        (void) getservbyname_r(name, proto, se, &data);
+
  
    ;
    return 0;
@@ -39543,7 +39672,7 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
    cat conftest.err >&5
    echo "$as_me:$LINENO: \$? = $ac_status" >&5
    (exit $ac_status); } &&
-        { ac_try='test -z "$ac_cxx_werror_flag"
+        { ac_try='test -z "$ac_c_werror_flag"
                          || test ! -s conftest.err'
    { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
    (eval $ac_try) 2>&5
@@ -39556,29 +39685,31 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
    ac_status=$?
    echo "$as_me:$LINENO: \$? = $ac_status" >&5
    (exit $ac_status); }; }; then
-  ac_cv_func_which_getservbyname_r=six
+  ac_cv_func_which_getservbyname_r=four
  else
    echo "$as_me: failed program was:" >&5
  sed 's/^/| /' conftest.$ac_ext >&5
  
  
-                cat >conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
  /* confdefs.h.  */
  _ACEOF
  cat confdefs.h >>conftest.$ac_ext
  cat >>conftest.$ac_ext <<_ACEOF
  /* end confdefs.h.  */
-#include <netdb.h>
+
+#   include <netdb.h>
+
  int
  main ()
  {
  
-                        char *name;
-                        char *proto;
-                        struct servent *se;
-                        char buffer[2048];
-                        int buflen = 2048;
-                        (void) getservbyname_r(name, proto, se, buffer, buflen)
+        char *name;
+        char *proto;
+        struct servent *se, *res;
+        char buffer[2048];
+        int buflen = 2048;
+        (void) getservbyname_r(name, proto, se, buffer, buflen, &res)
  
    ;
    return 0;
@@ -39593,7 +39724,7 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
    cat conftest.err >&5
    echo "$as_me:$LINENO: \$? = $ac_status" >&5
    (exit $ac_status); } &&
-        { ac_try='test -z "$ac_cxx_werror_flag"
+        { ac_try='test -z "$ac_c_werror_flag"
                          || test ! -s conftest.err'
    { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
    (eval $ac_try) 2>&5
@@ -39606,28 +39737,31 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
    ac_status=$?
    echo "$as_me:$LINENO: \$? = $ac_status" >&5
    (exit $ac_status); }; }; then
-  ac_cv_func_which_getservbyname_r=five
+  ac_cv_func_which_getservbyname_r=six
  else
    echo "$as_me: failed program was:" >&5
  sed 's/^/| /' conftest.$ac_ext >&5
  
  
-                        cat >conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
  /* confdefs.h.  */
  _ACEOF
  cat confdefs.h >>conftest.$ac_ext
  cat >>conftest.$ac_ext <<_ACEOF
  /* end confdefs.h.  */
-#include <netdb.h>
+
+#   include <netdb.h>
+
  int
  main ()
  {
  
-                                char *name;
-                                char *proto;
-                                struct servent *se;
-                                struct servent_data data;
-                                (void) getservbyname_r(name, proto, se, &data);
+        char *name;
+        char *proto;
+        struct servent *se;
+        char buffer[2048];
+        int buflen = 2048;
+        (void) getservbyname_r(name, proto, se, buffer, buflen)
  
    ;
    return 0;
@@ -39642,7 +39776,7 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
    cat conftest.err >&5
    echo "$as_me:$LINENO: \$? = $ac_status" >&5
    (exit $ac_status); } &&
-        { ac_try='test -z "$ac_cxx_werror_flag"
+        { ac_try='test -z "$ac_c_werror_flag"
                          || test ! -s conftest.err'
    { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
    (eval $ac_try) 2>&5
@@ -39655,30 +39789,28 @@ if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
    ac_status=$?
    echo "$as_me:$LINENO: \$? = $ac_status" >&5
    (exit $ac_status); }; }; then
-  ac_cv_func_which_getservbyname_r=four
+  ac_cv_func_which_getservbyname_r=five
  else
    echo "$as_me: failed program was:" >&5
  sed 's/^/| /' conftest.$ac_ext >&5
  
  ac_cv_func_which_getservbyname_r=no
-
  fi
  rm -f conftest.err conftest.$ac_objext conftest.$ac_ext
  
  
+
+
  fi
  rm -f conftest.err conftest.$ac_objext conftest.$ac_ext
  
  
  fi
  rm -f conftest.err conftest.$ac_objext conftest.$ac_ext
-        ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-
  
+else
+  ac_cv_func_which_getservbyname_r=no
+fi
  
  fi
  echo "$as_me:$LINENO: result: $ac_cv_func_which_getservbyname_r" >&5
@@ -39699,6 +39831,7 @@ elif test $ac_cv_func_which_getservbyname_r = four; then
  #define HAVE_FUNC_GETSERVBYNAME_R_4 1
  _ACEOF
  
+
  fi
  
  
@@ -43472,6 +43605,13 @@ if test "$wxUSE_UNICODE" = "yes" -a "$wxUSE_UNICODE_UTF8" = "yes"; then
  #define wxUSE_UNICODE_UTF8 1
  _ACEOF
  
+
+    if test "$wxUSE_UNICODE_UTF8_LOCALE" = "yes"; then
+        cat >>confdefs.h <<\_ACEOF
+#define wxUSE_UTF8_LOCALE_ONLY 1
+_ACEOF
+
+    fi
  fi
  
  if test "$wxUSE_wxUSE_EXPERIMENTAL_PRINTF" = "yes"; then
@@ -46067,7 +46207,10 @@ echo "${ECHO_T}$bakefile_cv_prog_makeisgnu" >&6
                  PLATFORM_BEOS=1
              ;;
              * )
-                                            ;;
+                { { echo "$as_me:$LINENO: error: Unknown platform: $BAKEFILE_FORCE_PLATFORM" >&5
+echo "$as_me: error: Unknown platform: $BAKEFILE_FORCE_PLATFORM" >&2;}
+   { (exit 1); exit 1; }; }
+            ;;
          esac
      fi
  
@@ -48802,10 +48945,21 @@ echo "${ECHO_T}no" >&6
  
      cppunit_major_min=`echo $cppunit_version_min | \
             sed 's/\([0-9]*\).\([0-9]*\).\([0-9]*\)/\1/'`
+    if test "x${cppunit_major_min}" = "x" ; then
+       cppunit_major_min=0
+    fi
+
      cppunit_minor_min=`echo $cppunit_version_min | \
             sed 's/\([0-9]*\).\([0-9]*\).\([0-9]*\)/\2/'`
+    if test "x${cppunit_minor_min}" = "x" ; then
+       cppunit_minor_min=0
+    fi
+
      cppunit_micro_min=`echo $cppunit_version_min | \
             sed 's/\([0-9]*\).\([0-9]*\).\([0-9]*\)/\3/'`
+    if test "x${cppunit_micro_min}" = "x" ; then
+       cppunit_micro_min=0
+    fi
  
      cppunit_version_proper=`expr \
          $cppunit_major_version \> $cppunit_major_min \| \
diff --git a/configure.in b/configure.in

index 39103c85044bb12a17adb5c45c03f79f5e04d499..ce7e311a4fd6995fcf0a636b0e722d4131288bea 100644 (file)
--- a/configure.in
+++ b/configure.in
@@ -578,6 +578,7 @@ if test $DEBUG_CONFIGURE = 1; then
    DEFAULT_wxUSE_UNICODE=no
    DEFAULT_wxUSE_UNICODE_MSLU=no
    DEFAULT_wxUSE_UNICODE_UTF8=no
+  DEFAULT_wxUSE_UNICODE_UTF8_LOCALE=no
    DEFAULT_wxUSE_WCSRTOMBS=no
  
    DEFAULT_wxUSE_PALETTE=no
@@ -805,6 +806,7 @@ else
    DEFAULT_wxUSE_UNICODE=no
    DEFAULT_wxUSE_UNICODE_MSLU=yes
    DEFAULT_wxUSE_UNICODE_UTF8=no
+  DEFAULT_wxUSE_UNICODE_UTF8_LOCALE=no
    DEFAULT_wxUSE_WCSRTOMBS=no
  
    DEFAULT_wxUSE_PALETTE=yes
@@ -993,6 +995,7 @@ WX_ARG_ENABLE(mimetype,      [  --enable-mimetype       use wxMimeTypesManager],
  WX_ARG_ENABLE(mslu,          [  --enable-mslu           use MS Layer for Unicode on Windows 9x (Win32 only)], wxUSE_UNICODE_MSLU)
  dnl FIXME-UTF8: make UTF8 automatic
  WX_ARG_ENABLE(utf8,          [  --enable-utf8           use UTF-8 representation for strings (Unix only)], wxUSE_UNICODE_UTF8)
+WX_ARG_ENABLE(utf8only,      [  --enable-utf8only      only support UTF-8 locales in UTF-8 build (Unix only)], wxUSE_UNICODE_UTF8_LOCALE)
  WX_ARG_ENABLE(snglinst,      [  --enable-snglinst       use wxSingleInstanceChecker class], wxUSE_SNGLINST_CHECKER)
  WX_ARG_ENABLE(std_iostreams, [  --enable-std_iostreams  use standard C++ stream classes], wxUSE_STD_IOSTREAM)
  WX_ARG_ENABLE(std_string,    [  --enable-std_string     use standard C++ string classes], wxUSE_STD_STRING)
@@ -6492,6 +6495,10 @@ fi
  
  if test "$wxUSE_UNICODE" = "yes" -a "$wxUSE_UNICODE_UTF8" = "yes"; then
      AC_DEFINE(wxUSE_UNICODE_UTF8)
+
+    if test "$wxUSE_UNICODE_UTF8_LOCALE" = "yes"; then
+        AC_DEFINE(wxUSE_UTF8_LOCALE_ONLY)
+    fi
  fi
  
  if test "$wxUSE_wxUSE_EXPERIMENTAL_PRINTF" = "yes"; then
diff --git a/include/wx/strconv.h b/include/wx/strconv.h

index b9ea52f68e7b943850cdd5778cbb067fe8f14344..4360b7f6a32928eeffb0ab7628a40f8a1c3eb38d 100644 (file)
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -135,6 +135,12 @@ public:
      // encoding
      static size_t GetMaxMBNulLen() { return 4 /* for UTF-32 */; }
  
+#if wxUSE_UNICODE_UTF8
+    // return true if the converter's charset is UTF-8, i.e. char* strings
+    // decoded using this object can be directly copied to wxString's internal
+    // storage without converting to WC and then back to UTF-8 MB string
+    virtual bool IsUTF8() const { return false; }
+#endif
  
      // The old conversion functions. The existing classes currently mostly
      // implement these ones but we're in transition to using To/FromWChar()
@@ -175,6 +181,10 @@ public:
      virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
  
      virtual wxMBConv *Clone() const { return new wxMBConvLibc; }
+
+#if wxUSE_UNICODE_UTF8
+    virtual bool IsUTF8() const { return wxLocaleIsUtf8; }
+#endif
  };
  
  #ifdef __UNIX__
@@ -244,6 +254,8 @@ public:
  class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
  {
  public:
+    // FIXME-UTF8: split this class into multiple classes, one strict and
+    //             the other lossy (PUA, OCTAL mappings)
      enum
      {
          MAP_INVALID_UTF8_NOT = 0,
@@ -257,6 +269,12 @@ public:
  
      virtual wxMBConv *Clone() const { return new wxMBConvUTF8(m_options); }
  
+#if wxUSE_UNICODE_UTF8
+    // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
+    //     take the shortcut in that case
+    virtual bool IsUTF8() const { return m_options == MAP_INVALID_UTF8_NOT; }
+#endif
+
  private:
      int m_options;
  };
diff --git a/include/wx/string.h b/include/wx/string.h

index 346316a4928fe647a55f002f30f5bda8d69b1421..063115b34e9ade00f0972f8c9c1ee27c022a6c49 100644 (file)
--- a/include/wx/string.h
+++ b/include/wx/string.h
@@ -201,7 +201,7 @@ public:
      const wchar_t* AsWChar() const;
      operator const wchar_t*() const { return AsWChar(); }
  
-#if !wxUSE_UNICODE
+#if !wxUSE_UNICODE || wxUSE_UTF8_LOCALE_ONLY
      inline
  #endif
      const char* AsChar() const;
@@ -477,9 +477,6 @@ private:
  
  #else // wxUSE_UNICODE_UTF8
  
-  // FIXME-UTF8: return as-is without copying under UTF8 locale, return
-  //             converted string under other locales - needs wxCharBuffer
-  //             changes
    static wxCharBuffer ImplStr(const char* str,
                                const wxMBConv& conv = wxConvLibc)
      { return ConvertStr(str, npos, conv).data; }
@@ -931,8 +928,7 @@ public:
          { return wxStdWideString(wc_str()); }
    #endif
  
-  #if !wxUSE_UNICODE && wxUSE_STL_BASED_WXSTRING
-    // FIXME-UTF8: do this in UTF8 build #if wxUSE_UTF8_LOCALE_ONLY, too
+  #if (!wxUSE_UNICODE || wxUSE_UTF8_LOCALE_ONLY) && wxUSE_STL_BASED_WXSTRING
      // wxStringImpl is std::string in the encoding we want
      operator const std::string&() const { return m_impl; }
    #else
@@ -941,8 +937,7 @@ public:
          // FIXME-UTF8: broken for embedded NULs
          { return std::string(mb_str()); }
    #endif
-
-#endif // wxUSE_STD_STRING
+#endif // wxUSE_STL
  
    // first valid index position
    const_iterator begin() const { return const_iterator(m_impl.begin()); }
@@ -1161,7 +1156,13 @@ public:
      // type differs because a function may either return pointer to the buffer
      // directly or have to use intermediate buffer for translation.
  #if wxUSE_UNICODE
+
+#if wxUSE_UTF8_LOCALE_ONLY
+    const char* mb_str() const { return wx_str(); }
+    const wxCharBuffer mb_str(const wxMBConv& conv) const;
+#else
      const wxCharBuffer mb_str(const wxMBConv& conv = wxConvLibc) const;
+#endif
  
      const wxWX2MBbuf mbc_str() const { return mb_str(*wxConvCurrent); }
  
@@ -2428,7 +2429,7 @@ private:
  
        T *m_buf;
    };
-#if wxUSE_UNICODE
+#if wxUSE_UNICODE && !wxUSE_UTF8_LOCALE_ONLY
    ConvertedBuffer<char> m_convertedToChar;
  #endif
  #if !wxUSE_UNICODE_WCHAR
@@ -2821,10 +2822,10 @@ inline const wchar_t* wxCStrData::AsWChar() const
  }
  #endif // wxUSE_UNICODE_WCHAR
  
-#if !wxUSE_UNICODE
+#if !wxUSE_UNICODE || wxUSE_UTF8_LOCALE_ONLY
  inline const char* wxCStrData::AsChar() const
  {
-    return m_str->wx_str() + m_offset;
+    return wxStringOperations::AddToIter(m_str->wx_str(), m_offset);
  }
  #endif // !wxUSE_UNICODE
  
diff --git a/include/wx/stringops.h b/include/wx/stringops.h

index 2cfbe188e40ad3a746ffa81f32ea9acfd9368c24..ed53532d9e0392665af8410c6e7026f317502ba6 100644 (file)
--- a/include/wx/stringops.h
+++ b/include/wx/stringops.h
@@ -65,7 +65,8 @@ struct WXDLLIMPEXP_BASE wxStringOperationsWchar
  struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
  {
      // checks correctness of UTF-8 sequence
-    static bool IsValidUtf8String(const char *c);
+    static bool IsValidUtf8String(const char *c,
+                                  size_t len = wxStringImpl::npos);
  #ifdef __WXDEBUG__
      static bool IsValidUtf8LeadByte(unsigned char c);
  #endif
diff --git a/setup.h.in b/setup.h.in

index ec9d560f633ee20b2a86154c53018738132abe5d..14e379d9b57ac9bdac82ca3486043b90336692bb 100644 (file)
--- a/setup.h.in
+++ b/setup.h.in
@@ -562,6 +562,8 @@
  
  #define wxUSE_UNICODE_UTF8 0
  
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
  #define wxUSE_DC_CACHEING 0
  
  #define wxUSE_GADGETS 0
diff --git a/src/common/string.cpp b/src/common/string.cpp

index f9f389cfa209b8a722cd905e2a109f1708a61aa2..daa4901016cbaee06499c06036851c21f672bdac 100644 (file)
--- a/src/common/string.cpp
+++ b/src/common/string.cpp
@@ -220,9 +220,16 @@ wxString::~wxString()
  }
  #endif
  
-#if wxUSE_UNICODE
+#if wxUSE_UNICODE && !wxUSE_UTF8_LOCALE_ONLY
  const char* wxCStrData::AsChar() const
  {
+#if wxUSE_UNICODE_UTF8
+    if ( wxLocaleIsUtf8 )
+        return AsInternal();
+#endif
+    // under non-UTF8 locales, we have to convert the internal UTF-8
+    // representation using wxConvLibc and cache the result
+
      wxString *str = wxConstCast(m_str, wxString);
  
      // convert the string:
@@ -244,7 +251,7 @@ const char* wxCStrData::AsChar() const
      // and keep it:
      return str->m_convertedToChar + m_offset;
  }
-#endif // wxUSE_UNICODE
+#endif // wxUSE_UNICODE && !wxUSE_UTF8_LOCALE_ONLY
  
  #if !wxUSE_UNICODE_WCHAR
  const wchar_t* wxCStrData::AsWChar() const
@@ -306,14 +313,23 @@ wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength,
  wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength,
                                                 const wxMBConv& conv)
  {
-    // FIXME-UTF8: return as-is without copying under UTF8 locale, return
-    //             converted string under other locales - needs wxCharBuffer
-    //             changes
-
      // anything to do?
      if ( !psz || nLength == 0 )
          return SubstrBufFromMB("", 0);
  
+    // if psz is already in UTF-8, we don't have to do the roundtrip to
+    // wchar_t* and back:
+    if ( conv.IsUTF8() )
+    {
+        // we need to validate the input because UTF8 iterators assume valid
+        // UTF-8 sequence and psz may be invalid:
+        if ( wxStringOperations::IsValidUtf8String(psz, nLength) )
+        {
+            return SubstrBufFromMB(wxCharBuffer::CreateNonOwned(psz), nLength);
+        }
+        // else: do the roundtrip through wchar_t*
+    }
+
      if ( nLength == npos )
          nLength = wxNO_LEN;
  
@@ -373,8 +389,9 @@ const wxWCharBuffer wxString::wc_str() const
  
  const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const
  {
-    // FIXME-UTF8: optimize the case when conv==wxConvUTF8 or wxConvLibc
-    //             under UTF8 locale
+    if ( conv.IsUTF8() )
+        return wxCharBuffer::CreateNonOwned(m_impl.c_str());
+
      // FIXME-UTF8: use wc_str() here once we have buffers with length
  
      size_t wcLen;
diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp

index ac0455da53cd8b75d8f6333f044be29f9ea7972d..66a461a6bb20f507b3cd058d7039705478953d75 100644 (file)
--- a/src/common/stringops.cpp
+++ b/src/common/stringops.cpp
@@ -87,17 +87,26 @@ unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
  // U+100000..U+10FFFF |  F4      |  80..8F  |  80..BF  |  80..BF  |
  // -------------------+----------+----------+----------+----------+
  
-bool wxStringOperationsUtf8::IsValidUtf8String(const char *str)
+bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
  {
      if ( !str )
          return true; // empty string is UTF8 string
  
      const unsigned char *c = (const unsigned char*)str;
+    const unsigned char * const end = (len == wxStringImpl::npos) ? NULL : c + len;
  
-    for ( ; *c; ++c )
+    for ( ; c != end && *c; ++c )
      {
          unsigned char b = *c;
  
+        if ( end != NULL )
+        {
+            // if the string is not NUL-terminated, verify we have enough
+            // bytes in it left for current character's encoding:
+            if ( c + ms_utf8IterTable[*c] > end )
+                return false;
+        }
+
          if ( b <= 0x7F ) // 00..7F
              continue;
  
diff --git a/src/common/strvararg.cpp b/src/common/strvararg.cpp

index f18e2f0bc1551a792d961b13dc4870086bc324df..dc59f135560423f2fe38936f0e21eaa093ce90d2 100644 (file)
--- a/src/common/strvararg.cpp
+++ b/src/common/strvararg.cpp
@@ -41,7 +41,7 @@ const wxStringCharType *wxArgNormalizerNative<const wxCStrData&>::get() const
      return m_value.AsInternal();
  }
  
-#if wxUSE_UNICODE_UTF8
+#if wxUSE_UNICODE_UTF8 && !wxUSE_UTF8_LOCALE_ONLY
  wxArgNormalizerWchar<const wxString&>::wxArgNormalizerWchar(const wxString& s)
      : wxArgNormalizerWithBuffer<wchar_t>(s.wc_str())
  {
@@ -51,7 +51,7 @@ wxArgNormalizerWchar<const wxCStrData&>::wxArgNormalizerWchar(const wxCStrData&
      : wxArgNormalizerWithBuffer<wchar_t>(s.AsWCharBuf())
  {
  }
-#endif // wxUSE_UNICODE_UTF8
+#endif // wxUSE_UNICODE_UTF8 && !wxUSE_UTF8_LOCALE_ONLY
  
  wxString wxArgNormalizedString::GetString() const
  {
diff --git a/src/common/unichar.cpp b/src/common/unichar.cpp

index f533a9ce417ae16e38df1309b46749e0f124540d..fa9890365fd572a55ffde76ae1b7647271ad1cbe 100644 (file)
--- a/src/common/unichar.cpp
+++ b/src/common/unichar.cpp
@@ -41,10 +41,15 @@ wxUniChar::value_type wxUniChar::From8bit(char c)
      if ( (unsigned char)c < 0x80 )
          return c;
  
+#if wxUSE_UTF8_LOCALE_ONLY
+    wxFAIL_MSG( _T("invalid UTF-8 character") );
+    return wxT('?'); // FIXME-UTF8: what to use as failure character?
+#else
      wchar_t buf[2];
      if ( wxConvLibc.ToWChar(buf, 2, &c, 1) != 2 )
          return wxT('?'); // FIXME-UTF8: what to use as failure character?
      return buf[0];
+#endif
  }
  
  /* static */
@@ -54,11 +59,16 @@ char wxUniChar::To8bit(wxUniChar::value_type c)
      if ( c < 0x80 )
          return c;
  
+#if wxUSE_UTF8_LOCALE_ONLY
+    wxFAIL_MSG( _T("character cannot be converted to single UTF-8 byte") );
+    return '?'; // FIXME-UTF8: what to use as failure character?
+#else
      wchar_t in = c;
      char buf[2];
      if ( wxConvLibc.FromWChar(buf, 2, &in, 1) != 2 )
          return '?'; // FIXME-UTF8: what to use as failure character?
      return buf[0];
+#endif
  }
author	Václav Slavík <vslavik@fastmail.fm>
	Thu, 3 May 2007 11:05:04 +0000 (11:05 +0000)
committer	Václav Slavík <vslavik@fastmail.fm>
	Thu, 3 May 2007 11:05:04 +0000 (11:05 +0000)
configure		patch \| blob \| blame \| history
configure.in		patch \| blob \| blame \| history
include/wx/strconv.h		patch \| blob \| blame \| history
include/wx/string.h		patch \| blob \| blame \| history
include/wx/stringops.h		patch \| blob \| blame \| history
setup.h.in		patch \| blob \| blame \| history
src/common/string.cpp		patch \| blob \| blame \| history
src/common/stringops.cpp		patch \| blob \| blame \| history
src/common/strvararg.cpp		patch \| blob \| blame \| history
src/common/unichar.cpp		patch \| blob \| blame \| history