include/wx/stringops.h

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        wx/stringops.h
   3 // Purpose:     implementation of wxString primitive operations
   4 // Author:      Vaclav Slavik
   5 // Modified by:
   6 // Created:     2007-04-16
   7 // RCS-ID:      $Id$
   8 // Copyright:   (c) 2007 REA Elektronik GmbH
   9 // Licence:     wxWindows licence
  10 ///////////////////////////////////////////////////////////////////////////////
  11
  12 #ifndef _WX_WXSTRINGOPS_H__
  13 #define _WX_WXSTRINGOPS_H__
  14
  15 #include "wx/chartype.h"
  16 #include "wx/stringimpl.h"
  17 #include "wx/unichar.h"
  18 #include "wx/buffer.h"
  19
  20 // This header contains wxStringOperations "namespace" class that implements
  21 // elementary operations on string data as static methods; wxString methods and
  22 // iterators are implemented in terms of it. Two implementations are available,
   23 // one for UTF-8 encoded char* strings and one for "raw" wchar_t* strings (or
  24 // char* in ANSI build).
  25
  26 // FIXME-UTF8: only wchar after we remove ANSI build
  27 #if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
  28 struct WXDLLIMPEXP_BASE wxStringOperationsWchar
  29 {
  30     // moves the iterator to the next Unicode character
  31     template <typename Iterator>
  32     static void IncIter(Iterator& i) { ++i; }
  33
  34     // moves the iterator to the previous Unicode character
  35     template <typename Iterator>
  36     static void DecIter(Iterator& i) { --i; }
  37
  38     // moves the iterator by n Unicode characters
  39     template <typename Iterator>
  40     static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
  41         { return i + n; }
  42
  43     // returns distance of the two iterators in Unicode characters
  44     template <typename Iterator>
  45     static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2)
  46         { return i1 - i2; }
  47
  48     // encodes the character to a form used to represent it in internal
  49     // representation (returns a string in UTF8 version)
  50     static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; }
  51
  52     static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
  53         { return *i; }
  54 };
  55 #endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
  56
  57
  58 #if wxUSE_UNICODE_UTF8
  59 struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
  60 {
  61     // checks correctness of UTF-8 sequence
  62     static bool IsValidUtf8String(const char *c,
  63                                   size_t len = wxStringImpl::npos);
  64     static bool IsValidUtf8LeadByte(unsigned char c)
  65     {
  66         return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
  67     }
  68
  69     // table of offsets to skip forward when iterating over UTF-8 sequence
  70     static const unsigned char ms_utf8IterTable[256];
  71
  72
  73     template<typename Iterator>
  74     static void IncIter(Iterator& i)
  75     {
  76         wxASSERT( IsValidUtf8LeadByte(*i) );
  77         i += ms_utf8IterTable[(unsigned char)*i];
  78     }
  79
  80     template<typename Iterator>
  81     static void DecIter(Iterator& i)
  82     {
  83         wxASSERT( IsValidUtf8LeadByte(*i) );
  84
  85         // Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
  86         // binary), so we just have to go back until we hit a byte that is
  87         // either < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in
  88         // binary; this includes some invalid values, but we can ignore it
  89         // here, because we assume valid UTF-8 input for the purpose of
  90         // efficient implementation).
  91         --i;
  92         while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
  93             --i;
  94     }
  95
  96     template<typename Iterator>
  97     static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
  98     {
  99         Iterator out(i);
 100
 101         if ( n > 0 )
 102         {
 103             for ( ptrdiff_t j = 0; j < n; ++j )
 104                 IncIter(out);
 105         }
 106         else if ( n < 0 )
 107         {
 108             for ( ptrdiff_t j = 0; j > n; --j )
 109                 DecIter(out);
 110         }
 111
 112         return out;
 113     }
 114
 115     template<typename Iterator>
 116     static ptrdiff_t DiffIters(Iterator i1, Iterator i2)
 117     {
 118         ptrdiff_t dist = 0;
 119
 120         if ( i1 < i2 )
 121         {
 122             while ( i1 != i2 )
 123             {
 124                 IncIter(i1);
 125                 dist--;
 126             }
 127         }
 128         else if ( i2 < i1 )
 129         {
 130             while ( i2 != i1 )
 131             {
 132                 IncIter(i2);
 133                 dist++;
 134             }
 135         }
 136
 137         return dist;
 138     }
 139
 140     // encodes the character as UTF-8:
 141     typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
 142     static Utf8CharBuffer EncodeChar(const wxUniChar& ch)
 143         { return ch.AsUTF8(); }
 144
 145     // returns n copies of ch encoded in UTF-8 string
 146     static wxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
 147
 148     // returns the length of UTF-8 encoding of the character with lead byte 'c'
 149     static size_t GetUtf8CharLength(char c)
 150     {
 151         wxASSERT( IsValidUtf8LeadByte(c) );
 152         return ms_utf8IterTable[(unsigned char)c];
 153     }
 154
 155     // decodes single UTF-8 character from UTF-8 string
 156     static wxUniChar DecodeChar(wxStringImpl::const_iterator i)
 157     {
 158         if ( (unsigned char)*i < 0x80 )
 159             return (int)*i;
 160         return DecodeNonAsciiChar(i);
 161     }
 162
 163 private:
 164     static wxUniChar DecodeNonAsciiChar(wxStringImpl::const_iterator i);
 165 };
 166 #endif // wxUSE_UNICODE_UTF8
 167
 168
 169 #if wxUSE_UNICODE_UTF8
 170 typedef wxStringOperationsUtf8 wxStringOperations;
 171 #else
 172 typedef wxStringOperationsWchar wxStringOperations;
 173 #endif
 174
  175 #endif  // _WX_WXSTRINGOPS_H__