fixed non-const wxString iterators to work when a character is changed to another...

author Václav Slavík <vslavik@fastmail.fm>

Tue, 15 May 2007 13:16:01 +0000 (13:16 +0000)

committer Václav Slavík <vslavik@fastmail.fm>

Tue, 15 May 2007 13:16:01 +0000 (13:16 +0000)
author Václav Slavík <vslavik@fastmail.fm>
Tue, 15 May 2007 13:16:01 +0000 (13:16 +0000)
committer Václav Slavík <vslavik@fastmail.fm>
Tue, 15 May 2007 13:16:01 +0000 (13:16 +0000)
diff --git a/include/wx/string.h b/include/wx/string.h

index 5bbf524adbbd53cba2569c5ba2370fe248d3b9f2..6bce6b892f543cb305d8fb22ad6aa4ca736c64a1 100644 (file)
--- a/include/wx/string.h
+++ b/include/wx/string.h
@@ -376,6 +376,23 @@ protected:
      #pragma warning (disable:4275)
  #endif
  
+#if wxUSE_UNICODE_UTF8
+// list node tracking one live iterator of a wxString; see the comment near wxString::iterator for why we need this
+struct WXDLLIMPEXP_BASE wxStringIteratorNode
+{
+    inline wxStringIteratorNode(const wxString *str,
+                                wxStringImpl::const_iterator *citer);
+    inline wxStringIteratorNode(const wxString *str,
+                                wxStringImpl::iterator *iter);
+    inline ~wxStringIteratorNode();
+
+    const wxString *m_str; // string whose iterator list this node is linked into
+    wxStringImpl::const_iterator *m_citer; // tracked const iterator (NULL when m_iter is used)
+    wxStringImpl::iterator *m_iter; // tracked non-const iterator (NULL when m_citer is used)
+    wxStringIteratorNode *m_prev, *m_next; // neighbours in the doubly-linked list (NULL at the ends)
+};
+#endif // wxUSE_UNICODE_UTF8
+
  class WXDLLIMPEXP_BASE wxString
  #ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
                                  : public wxStringPrintfMixin
@@ -519,7 +536,7 @@ private:
        if ( pos == 0 || pos == npos )
            return pos;
        else
-          return const_iterator(m_impl.begin() + pos) - begin();
+          return const_iterator(this, m_impl.begin() + pos) - begin();
    }
  #endif // !wxUSE_UNICODE_UTF8/wxUSE_UNICODE_UTF8
  
@@ -545,8 +562,7 @@ public:
    #define WX_STR_ITERATOR_TAG void /* dummy type */
  #endif
  
-  #define WX_STR_ITERATOR_IMPL(iterator_name, pointer_type,                 \
-                               reference_type, reference_ctor)              \
+  #define WX_STR_ITERATOR_IMPL(iterator_name, pointer_type, reference_type) \
        private:                                                              \
            typedef wxStringImpl::iterator_name underlying_iterator;          \
        public:                                                               \
@@ -556,7 +572,6 @@ public:
            typedef reference_type reference;                                 \
            typedef pointer_type pointer;                                     \
                                                                              \
-          reference operator*() const { return reference_ctor; }            \
            reference operator[](size_t n) const { return *(*this + n); }     \
                                                                              \
            iterator_name& operator++()                                       \
@@ -627,40 +642,89 @@ public:
    class const_iterator;
  
  #if wxUSE_UNICODE_UTF8
+  // NB: In UTF-8 build, (non-const) iterator needs to keep reference
+  //     to the underlying wxStringImpl, because UTF-8 is variable-length
+  //     encoding and changing the value pointed to by an iterator (using
+  //     its operator*) requires calling wxStringImpl::replace() if the old
+  //     and new values differ in their encoding's length.
+  //
+  //     Furthermore, the replace() call may invalidate all iterators for the
+  //     string, so we have to keep track of outstanding iterators and update
+  //     them if replace() happens.
+  //
+  //     This is implemented by maintaining a linked list of iterators for every
+  //     string and traversing it in wxUniCharRef::operator=(). Head of the
+  //     list is stored in wxString. (FIXME-UTF8)
+
    class iterator
    {
-      // NB: In UTF-8 build, (non-const) iterator needs to keep reference
-      //     to the underlying wxStringImpl, because UTF-8 is variable-length
-      //     encoding and changing the value pointer to by an iterator using
-      //     its operator* requires calling wxStringImpl::replace() if the old
-      //     and new values differ in their encoding's length.
-
-      WX_STR_ITERATOR_IMPL(iterator, wxChar*, wxUniCharRef,
-                           wxUniCharRef::CreateForString(m_str, m_cur));
+      WX_STR_ITERATOR_IMPL(iterator, wxChar*, wxUniCharRef);
  
    public:
-      iterator(const iterator& i) : m_cur(i.m_cur), m_str(i.m_str) {}
+      iterator(const iterator& i)
+          : m_cur(i.m_cur), m_node(i.str(), &m_cur) {}
+
+      reference operator*()
+        { return wxUniCharRef::CreateForString(m_node, m_cur); }
  
        iterator operator+(int n) const
-        { return iterator(m_str, wxStringOperations::AddToIter(m_cur, n)); }
+        { return iterator(str(), wxStringOperations::AddToIter(m_cur, n)); }
        iterator operator+(size_t n) const
-        { return iterator(m_str, wxStringOperations::AddToIter(m_cur, (int)n)); }
+        { return iterator(str(), wxStringOperations::AddToIter(m_cur, (int)n)); }
        iterator operator-(int n) const
-        { return iterator(m_str, wxStringOperations::AddToIter(m_cur, -n)); }
+        { return iterator(str(), wxStringOperations::AddToIter(m_cur, -n)); }
        iterator operator-(size_t n) const
-        { return iterator(m_str, wxStringOperations::AddToIter(m_cur, -(int)n)); }
+        { return iterator(str(), wxStringOperations::AddToIter(m_cur, -(int)n)); }
  
    private:
        iterator(wxString *str, underlying_iterator ptr)
-          : m_cur(ptr), m_str(str->m_impl) {}
-      iterator(wxStringImpl& str, underlying_iterator ptr)
-          : m_cur(ptr), m_str(str) {}
+          : m_cur(ptr), m_node(str, &m_cur) {}
+      iterator(wxString& str, underlying_iterator ptr)
+          : m_cur(ptr), m_node(&str, &m_cur) {}
  
-      wxStringImpl& m_str;
+      wxString* str() const { return wx_const_cast(wxString*, m_node.m_str); }
+
+      wxStringIteratorNode m_node;
  
        friend class const_iterator;
    };
  
+  class const_iterator
+  {
+      // NB: reference_type is intentionally a value, not a reference: the
+      //     character may be encoded differently in wxString data:
+      WX_STR_ITERATOR_IMPL(const_iterator, const wxChar*, wxUniChar);
+
+  public:
+      const_iterator(const const_iterator& i)
+          : m_cur(i.m_cur), m_node(i.str(), &m_cur) {} // the copy gets its own node pointing at its own m_cur
+      const_iterator(const iterator& i)
+          : m_cur(i.m_cur), m_node(i.str(), &m_cur) {} // conversion from non-const iterator, same registration
+
+      reference operator*() const
+        { return wxStringOperations::DecodeChar(m_cur); } // result type is wxUniChar, see NB above
+
+      const_iterator operator+(int n) const
+        { return const_iterator(str(), wxStringOperations::AddToIter(m_cur, n)); }
+      const_iterator operator+(size_t n) const
+        { return const_iterator(str(), wxStringOperations::AddToIter(m_cur, (int)n)); }
+      const_iterator operator-(int n) const
+        { return const_iterator(str(), wxStringOperations::AddToIter(m_cur, -n)); }
+      const_iterator operator-(size_t n) const
+        { return const_iterator(str(), wxStringOperations::AddToIter(m_cur, -(int)n)); }
+
+  private:
+      // for internal wxString use only:
+      const_iterator(const wxString *str, underlying_iterator ptr)
+          : m_cur(ptr), m_node(str, &m_cur) {} // m_node links this iterator into str's iterator list
+      const_iterator(const wxString& str, underlying_iterator ptr)
+          : m_cur(ptr), m_node(&str, &m_cur) {} // same as above, taking the string by reference
+
+      const wxString* str() const { return m_node.m_str; } // string this iterator was created for
+
+      wxStringIteratorNode m_node; // list node registering m_cur with the string
+  };
+
    size_t IterToImplPos(wxString::iterator i) const
      { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); }
  
@@ -668,12 +732,14 @@ public:
  
    class iterator
    {
-      WX_STR_ITERATOR_IMPL(iterator, wxChar*, wxUniCharRef,
-                           wxUniCharRef::CreateForString(m_cur));
+      WX_STR_ITERATOR_IMPL(iterator, wxChar*, wxUniCharRef);
  
    public:
        iterator(const iterator& i) : m_cur(i.m_cur) {}
  
+      reference operator*()
+        { return wxUniCharRef::CreateForString(m_cur); }
+
        iterator operator+(int n) const
          { return iterator(wxStringOperations::AddToIter(m_cur, n)); }
        iterator operator+(size_t n) const
@@ -690,19 +756,20 @@ public:
  
        friend class const_iterator;
    };
-#endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8
  
    class const_iterator
    {
        // NB: reference_type is intentionally value, not reference, the character
        //     may be encoded differently in wxString data:
-      WX_STR_ITERATOR_IMPL(const_iterator, const wxChar*, wxUniChar,
-                           wxStringOperations::DecodeChar(m_cur));
+      WX_STR_ITERATOR_IMPL(const_iterator, const wxChar*, wxUniChar);
  
    public:
        const_iterator(const const_iterator& i) : m_cur(i.m_cur) {}
        const_iterator(const iterator& i) : m_cur(i.m_cur) {}
  
+      reference operator*() const
+        { return wxStringOperations::DecodeChar(m_cur); }
+
        const_iterator operator+(int n) const
          { return const_iterator(wxStringOperations::AddToIter(m_cur, n)); }
        const_iterator operator+(size_t n) const
@@ -715,7 +782,10 @@ public:
    private:
        // for internal wxString use only:
        const_iterator(underlying_iterator ptr) : m_cur(ptr) {}
+      const_iterator(const wxString *WXUNUSED(str), underlying_iterator ptr)
+          : m_cur(ptr) {}
    };
+#endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8
  
    #undef WX_STR_ITERATOR_TAG
    #undef WX_STR_ITERATOR_IMPL
@@ -800,7 +870,8 @@ private:
    // wxCStrData) to an iterator into the string
    static const_iterator CreateConstIterator(const wxCStrData& data)
    {
-      return const_iterator(data.m_str->begin() + data.m_offset);
+      return const_iterator(data.m_str,
+                            (data.m_str->begin() + data.m_offset).impl());
    }
  
    // in UTF-8 STL build, creation from std::string requires conversion under
@@ -948,10 +1019,10 @@ public:
  #endif // wxUSE_STL
  
    // first valid index position
-  const_iterator begin() const { return const_iterator(m_impl.begin()); }
+  const_iterator begin() const { return const_iterator(this, m_impl.begin()); }
    iterator begin() { return iterator(this, m_impl.begin()); }
    // position one after the last valid one
-  const_iterator end() const { return const_iterator(m_impl.end()); }
+  const_iterator end() const { return const_iterator(this, m_impl.end()); }
    iterator end() { return iterator(this, m_impl.end()); }
  
    // first element of the reversed string
@@ -2485,6 +2556,23 @@ private:
    ConvertedBuffer<wchar_t> m_convertedToWChar;
  #endif
  
+#if wxUSE_UNICODE_UTF8
+  // FIXME-UTF8: (try to) move this elsewhere (TLS) or solve differently
+  //             assigning to the character pointed to by wxString::iterator
+  //             may change the underlying wxStringImpl iterator, so we have
+  //             to keep track of all iterators and update them as necessary:
+  struct wxStringIteratorNodeHead
+  {
+      wxStringIteratorNodeHead() : ptr(NULL) {}
+      wxStringIteratorNode *ptr; // first node of the iterator list, NULL when the list is empty
+  };
+
+  wxStringIteratorNodeHead m_iterators;
+
+  friend struct WXDLLIMPEXP_BASE wxStringIteratorNode; // defined as a struct: keep the class-key consistent
+  friend class WXDLLIMPEXP_BASE wxUniCharRef;
+#endif // wxUSE_UNICODE_UTF8
+
    friend class WXDLLIMPEXP_BASE wxCStrData;
    friend class wxImplStringBuffer;
    friend class wxImplStringBufferLength;
@@ -2966,6 +3054,48 @@ inline wxWCharBuffer::wxWCharBuffer(const wxCStrData& cstr)
  {
  }
  
+#if wxUSE_UNICODE_UTF8
+// ----------------------------------------------------------------------------
+// implementation of wxStringIteratorNode inline methods
+// ----------------------------------------------------------------------------
+
+wxStringIteratorNode::wxStringIteratorNode(const wxString *str,
+                                           wxStringImpl::const_iterator *citer)
+    : m_str(str),
+      m_citer(citer),
+      m_iter(NULL), // this overload tracks a const iterator only
+      m_prev(NULL), // the new node becomes the list head, so it has no predecessor
+      m_next(str->m_iterators.ptr) // the previous head (possibly NULL) follows the new node
+{
+    wx_const_cast(wxString*, m_str)->m_iterators.ptr = this; // the list head is modified even through a const string
+    if ( m_next )
+        m_next->m_prev = this; // back-link from the previous head
+}
+
+wxStringIteratorNode::wxStringIteratorNode(const wxString *str,
+                                           wxStringImpl::iterator *iter)
+    : m_str(str),
+      m_citer(NULL), // this overload tracks a non-const iterator only
+      m_iter(iter),
+      m_prev(NULL), // the new node becomes the list head, so it has no predecessor
+      m_next(str->m_iterators.ptr) // the previous head (possibly NULL) follows the new node
+{
+    wx_const_cast(wxString*, m_str)->m_iterators.ptr = this; // the list head is modified even through a const string
+    if ( m_next)
+        m_next->m_prev = this; // back-link from the previous head
+}
+
+wxStringIteratorNode::~wxStringIteratorNode()
+{
+    if ( m_next )
+        m_next->m_prev = m_prev; // detach from the successor
+    if ( m_prev )
+        m_prev->m_next = m_next; // detach from the predecessor
+    else // first in the list
+        wx_const_cast(wxString*, m_str)->m_iterators.ptr = m_next; // the successor (or NULL) becomes the new head
+}
+#endif // wxUSE_UNICODE_UTF8
+
  #if WXWIN_COMPATIBILITY_2_8
      // lot of code out there doesn't explicitly include wx/wxchar.h, but uses
      // CRT wrappers that are now declared in wx/wxcrt.h and wx/wxcrtvararg.h,
diff --git a/include/wx/unichar.h b/include/wx/unichar.h

index 31794161aa1fbb393b5db3618e158a792e5287de..2d6603f4c517b3d72d4141f0fc5dec3ed3d94a56 100644 (file)
--- a/include/wx/unichar.h
+++ b/include/wx/unichar.h
@@ -39,6 +39,7 @@
  #endif
  
  class WXDLLIMPEXP_BASE wxUniCharRef;
+struct WXDLLIMPEXP_BASE wxStringIteratorNode; // defined as a struct in wx/string.h, so forward-declare it as one
  
  // This class represents single Unicode character. It can be converted to
  // and from char or wchar_t and implements commonly used character operations.
@@ -157,7 +158,7 @@ private:
  
      // create the reference
  #if wxUSE_UNICODE_UTF8
-    wxUniCharRef(wxStringImpl& str, iterator pos) : m_str(str), m_pos(pos) {}
+    wxUniCharRef(wxStringIteratorNode& node, iterator pos) : m_node(node), m_pos(pos) {}
  #else
      wxUniCharRef(iterator pos) : m_pos(pos) {}
  #endif
@@ -169,8 +170,8 @@ public:
      //     that must be used explicitly (this is more than using 'explicit'
      //     keyword on ctor!):
  #if wxUSE_UNICODE_UTF8
-    static wxUniCharRef CreateForString(wxStringImpl& str, iterator pos)
-        { return wxUniCharRef(str, pos); }
+    static wxUniCharRef CreateForString(wxStringIteratorNode& node, iterator pos)
+        { return wxUniCharRef(node, pos); }
  #else
      static wxUniCharRef CreateForString(iterator pos)
          { return wxUniCharRef(pos); }
@@ -249,7 +250,7 @@ private:
  private:
      // reference to the string and pointer to the character in string
  #if wxUSE_UNICODE_UTF8
-    wxStringImpl& m_str;
+    wxStringIteratorNode& m_node;
  #endif
      iterator m_pos;
  };
diff --git a/src/common/unichar.cpp b/src/common/unichar.cpp

index fa9890365fd572a55ffde76ae1b7647271ad1cbe..b83a0817a3f51e22c0d646d57cf141bd7551a74c 100644 (file)
--- a/src/common/unichar.cpp
+++ b/src/common/unichar.cpp
@@ -90,18 +90,78 @@ wxUniCharRef& wxUniCharRef::operator=(const wxUniChar& c)
  
      if ( lenNew == lenOld )
      {
+        // this is the simpler case: if the new value's UTF-8 code has the
+        // same length, we can just replace it:
+
          iterator pos(m_pos);
          for ( size_t i = 0; i < lenNew; ++i, ++pos )
              *pos = utf[i];
      }
      else
      {
-        size_t idx = m_pos - m_str.begin();
-
-        m_str.replace(m_pos, m_pos + lenOld, utf, lenNew);
-
-        // this is needed to keep m_pos valid:
-        m_pos = m_str.begin() + idx;
+        // the worse case is when the new value has either longer or shorter
+        // code -- in that case, we have to use wxStringImpl::replace() and
+        // this invalidates all iterators, so we have to update them too:
+
+        wxString& str = *wx_const_cast(wxString*, m_node.m_str);
+        wxStringImpl& strimpl = str.m_impl;
+
+        int iterDiff = lenNew - lenOld;
+        size_t posIdx = m_pos - strimpl.begin();
+
+        // compute positions of outstanding iterators for this string after the
+        // replacement is done (there is only a small number of iterators at
+        // any time, so we use an array on the stack to avoid unneeded
+        // allocation):
+        static const size_t STATIC_SIZE = 32;
+        size_t indexes_a[STATIC_SIZE];
+        size_t *indexes = indexes_a;
+        size_t iterNum = 0;
+        wxStringIteratorNode *it;
+        for ( it = str.m_iterators.ptr; it; it = it->m_next, ++iterNum )
+        {
+            wxASSERT( it->m_iter || it->m_citer );
+
+            if ( iterNum == STATIC_SIZE )
+            {
+                wxLogTrace( _T("utf8"), _T("unexpectedly many iterators") );
+
+                size_t total = iterNum + 1;
+                for ( wxStringIteratorNode *it2 = it; it2; it2 = it2->m_next )
+                    total++;
+                indexes = new size_t[total];
+                memcpy(indexes, indexes_a, sizeof(size_t) * STATIC_SIZE);
+            }
+
+            size_t idx = it->m_iter
+                         ? (*it->m_iter - strimpl.begin())
+                         : (*it->m_citer - strimpl.begin());
+
+            if ( idx > posIdx )
+                idx += iterDiff;
+
+            indexes[iterNum] = idx;
+        }
+
+        // update the string:
+        strimpl.replace(m_pos, m_pos + lenOld, utf, lenNew);
+
+        // finally, set the iterators to valid values again (note that this
+        // updates m_pos as well):
+        size_t i;
+        for ( i = 0, it = str.m_iterators.ptr; it; it = it->m_next, ++i )
+        {
+            wxASSERT( i < iterNum );
+            wxASSERT( it->m_iter || it->m_citer );
+
+            if ( it->m_iter )
+                *it->m_iter = strimpl.begin() + indexes[i];
+            else // it->m_citer
+                *it->m_citer = strimpl.begin() + indexes[i];
+        }
+
+        if ( indexes != indexes_a )
+            delete[] indexes;
      }
  
      return *this;
diff --git a/tests/strings/unicode.cpp b/tests/strings/unicode.cpp

index dc28e5c8c6ad8cb6acf80e99f7b6c9bb8577a211..4f58739545802bc2caf7a0b24f24517500710207 100644 (file)
--- a/tests/strings/unicode.cpp
+++ b/tests/strings/unicode.cpp
@@ -61,6 +61,9 @@ private:
          CPPUNIT_TEST( ConversionUTF32 );
          CPPUNIT_TEST( IsConvOk );
  #endif // wxUSE_WCHAR_T
+#if wxUSE_UNICODE
+        CPPUNIT_TEST( Iteration );
+#endif
      CPPUNIT_TEST_SUITE_END();
  
      void ToFromAscii();
@@ -73,6 +76,9 @@ private:
      void ConversionUTF16();
      void ConversionUTF32();
      void IsConvOk();
+#if wxUSE_UNICODE
+    void Iteration();
+#endif
  
      // test if converting s using the given encoding gives ws and vice versa
      //
@@ -355,3 +361,56 @@ void UnicodeTestCase::IsConvOk()
  
  #endif // wxUSE_WCHAR_T
  
+#if wxUSE_UNICODE
+void UnicodeTestCase::Iteration()
+{
+    // "czech" in Czech ("cestina"):
+    static const char *textUTF8 = "\304\215e\305\241tina"; // U+010D and U+0161 are two bytes each here
+    static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};
+
+    wxString text(wxString::FromUTF8(textUTF8));
+    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );
+
+    // verify the string was decoded correctly:
+    {
+        size_t idx = 0;
+        for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
+        {
+            CPPUNIT_ASSERT( *i == textUTF16[idx] );
+        }
+    }
+
+    // overwrite the string with something that is shorter in UTF-8:
+    {
+        for ( wxString::iterator i = text.begin(); i != text.end(); ++i )
+            *i = 'x'; // writes through the iterator while the loop keeps using it
+    }
+
+    // restore the original text now, checking that end iterators obtained beforehand still equal text.end():
+    {
+        wxString::iterator end1 = text.end(); // taken before any character is changed
+        wxString::const_iterator end2 = text.end(); // const flavour of the same check
+
+        size_t idx = 0;
+        for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx )
+        {
+            *i = textUTF16[idx];
+
+            CPPUNIT_ASSERT( end1 == text.end() );
+            CPPUNIT_ASSERT( end2 == text.end() );
+        }
+
+        CPPUNIT_ASSERT( end1 == text.end() );
+        CPPUNIT_ASSERT( end2 == text.end() );
+    }
+
+    // and verify it again:
+    {
+        size_t idx = 0;
+        for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
+        {
+            CPPUNIT_ASSERT( *i == textUTF16[idx] );
+        }
+    }
+}
+#endif // wxUSE_UNICODE
author	Václav Slavík <vslavik@fastmail.fm>
	Tue, 15 May 2007 13:16:01 +0000 (13:16 +0000)
committer	Václav Slavík <vslavik@fastmail.fm>
	Tue, 15 May 2007 13:16:01 +0000 (13:16 +0000)
include/wx/string.h		patch \| blob \| blame \| history
include/wx/unichar.h		patch \| blob \| blame \| history
src/common/unichar.cpp		patch \| blob \| blame \| history
tests/strings/unicode.cpp		patch \| blob \| blame \| history