include/wx/stringops.h

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        wx/stringops.h
   3 // Purpose:     implementation of wxString primitive operations
   4 // Author:      Vaclav Slavik
   5 // Modified by:
   6 // Created:     2007-04-16
   7 // Copyright:   (c) 2007 REA Elektronik GmbH
   8 // Licence:     wxWindows licence
   9 ///////////////////////////////////////////////////////////////////////////////
  10
  11 #ifndef _WX_WXSTRINGOPS_H__
  12 #define _WX_WXSTRINGOPS_H__
  13
  14 #include "wx/chartype.h"
  15 #include "wx/stringimpl.h"
  16 #include "wx/unichar.h"
  17 #include "wx/buffer.h"
  18
  19 // This header contains wxStringOperations "namespace" class that implements
  20 // elementary operations on string data as static methods; wxString methods and
  21 // iterators are implemented in terms of it. Two implementations are available,
  22 // one for UTF-8 encoded char* string and one for "raw" wchar_t* strings (or
  23 // char* in ANSI build).
  24
  25 // FIXME-UTF8: only wchar after we remove ANSI build
  26 #if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
  27 struct WXDLLIMPEXP_BASE wxStringOperationsWchar
  28 {
  29     // moves the iterator to the next Unicode character
  30     template <typename Iterator>
  31     static void IncIter(Iterator& i) { ++i; }
  32
  33     // moves the iterator to the previous Unicode character
  34     template <typename Iterator>
  35     static void DecIter(Iterator& i) { --i; }
  36
  37     // moves the iterator by n Unicode characters
  38     template <typename Iterator>
  39     static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
  40         { return i + n; }
  41
  42     // returns distance of the two iterators in Unicode characters
  43     template <typename Iterator>
  44     static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2)
  45         { return i1 - i2; }
  46
  47     // encodes the character to a form used to represent it in internal
  48     // representation (returns a string in UTF8 version)
  49     static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; }
  50
  51     static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
  52         { return *i; }
  53 };
  54 #endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
  55
  56
  57 #if wxUSE_UNICODE_UTF8
  58 struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
  59 {
  60     // checks correctness of UTF-8 sequence
  61     static bool IsValidUtf8String(const char *c,
  62                                   size_t len = wxStringImpl::npos);
  63     static bool IsValidUtf8LeadByte(unsigned char c)
  64     {
  65         return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
  66     }
  67
  68     // table of offsets to skip forward when iterating over UTF-8 sequence
  69     static const unsigned char ms_utf8IterTable[256];
  70
  71
  72     template<typename Iterator>
  73     static void IncIter(Iterator& i)
  74     {
  75         wxASSERT( IsValidUtf8LeadByte(*i) );
  76         i += ms_utf8IterTable[(unsigned char)*i];
  77     }
  78
  79     template<typename Iterator>
  80     static void DecIter(Iterator& i)
  81     {
  82         wxASSERT( IsValidUtf8LeadByte(*i) );
  83
  84         // Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
  85         // binary), so we just have to go back until we hit a byte that is
  86         // either < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in
  87         // binary; this includes some invalid values, but we can ignore it
  88         // here, because we assume valid UTF-8 input for the purpose of
  89         // efficient implementation).
  90         --i;
  91         while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
  92             --i;
  93     }
  94
  95     template<typename Iterator>
  96     static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
  97     {
  98         Iterator out(i);
  99
 100         if ( n > 0 )
 101         {
 102             for ( ptrdiff_t j = 0; j < n; ++j )
 103                 IncIter(out);
 104         }
 105         else if ( n < 0 )
 106         {
 107             for ( ptrdiff_t j = 0; j > n; --j )
 108                 DecIter(out);
 109         }
 110
 111         return out;
 112     }
 113
 114     template<typename Iterator>
 115     static ptrdiff_t DiffIters(Iterator i1, Iterator i2)
 116     {
 117         ptrdiff_t dist = 0;
 118
 119         if ( i1 < i2 )
 120         {
 121             while ( i1 != i2 )
 122             {
 123                 IncIter(i1);
 124                 dist--;
 125             }
 126         }
 127         else if ( i2 < i1 )
 128         {
 129             while ( i2 != i1 )
 130             {
 131                 IncIter(i2);
 132                 dist++;
 133             }
 134         }
 135
 136         return dist;
 137     }
 138
 139     // encodes the character as UTF-8:
 140     typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
 141     static Utf8CharBuffer EncodeChar(const wxUniChar& ch)
 142         { return ch.AsUTF8(); }
 143
 144     // returns n copies of ch encoded in UTF-8 string
 145     static wxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
 146
 147     // returns the length of UTF-8 encoding of the character with lead byte 'c'
 148     static size_t GetUtf8CharLength(char c)
 149     {
 150         wxASSERT( IsValidUtf8LeadByte(c) );
 151         return ms_utf8IterTable[(unsigned char)c];
 152     }
 153
 154     // decodes single UTF-8 character from UTF-8 string
 155     static wxUniChar DecodeChar(wxStringImpl::const_iterator i)
 156     {
 157         if ( (unsigned char)*i < 0x80 )
 158             return (int)*i;
 159         return DecodeNonAsciiChar(i);
 160     }
 161
 162 private:
 163     static wxUniChar DecodeNonAsciiChar(wxStringImpl::const_iterator i);
 164 };
 165 #endif // wxUSE_UNICODE_UTF8
 166
 167
 168 #if wxUSE_UNICODE_UTF8
 169 typedef wxStringOperationsUtf8 wxStringOperations;
 170 #else
 171 typedef wxStringOperationsWchar wxStringOperations;
 172 #endif
 173
  174 #endif  // _WX_WXSTRINGOPS_H__