| 1 | /////////////////////////////////////////////////////////////////////////////// |
| 2 | // Name: wx/stringops.h |
| 3 | // Purpose: implementation of wxString primitive operations |
| 4 | // Author: Vaclav Slavik |
| 5 | // Modified by: |
| 6 | // Created: 2007-04-16 |
| 7 | // RCS-ID: $Id$ |
| 8 | // Copyright: (c) 2007 REA Elektronik GmbH |
| 9 | // Licence: wxWindows licence |
| 10 | /////////////////////////////////////////////////////////////////////////////// |
| 11 | |
| 12 | #ifndef _WX_WXSTRINGOPS_H__ |
| 13 | #define _WX_WXSTRINGOPS_H__ |
| 14 | |
| 15 | #include "wx/chartype.h" |
| 16 | #include "wx/stringimpl.h" |
| 17 | #include "wx/unichar.h" |
| 18 | #include "wx/buffer.h" |
| 19 | |
// This header contains the wxStringOperations "namespace" class that implements
// elementary operations on string data as static methods; wxString methods and
// iterators are implemented in terms of it. Two implementations are available,
// one for UTF-8 encoded char* strings and one for "raw" wchar_t* strings (or
// char* in ANSI build).
| 25 | |
| 26 | // FIXME-UTF8: only wchar after we remove ANSI build |
| 27 | #if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE |
| 28 | struct WXDLLIMPEXP_BASE wxStringOperationsWchar |
| 29 | { |
| 30 | // moves the iterator to the next Unicode character |
| 31 | template <typename Iterator> |
| 32 | static void IncIter(Iterator& i) { ++i; } |
| 33 | |
| 34 | // moves the iterator to the previous Unicode character |
| 35 | template <typename Iterator> |
| 36 | static void DecIter(Iterator& i) { --i; } |
| 37 | |
| 38 | // moves the iterator by n Unicode characters |
| 39 | template <typename Iterator> |
| 40 | static Iterator AddToIter(const Iterator& i, ptrdiff_t n) |
| 41 | { return i + n; } |
| 42 | |
| 43 | // returns distance of the two iterators in Unicode characters |
| 44 | template <typename Iterator> |
| 45 | static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2) |
| 46 | { return i1 - i2; } |
| 47 | |
| 48 | // encodes the character to a form used to represent it in internal |
| 49 | // representation (returns a string in UTF8 version) |
| 50 | static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; } |
| 51 | |
| 52 | static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i) |
| 53 | { return *i; } |
| 54 | }; |
| 55 | #endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE |
| 56 | |
| 57 | |
| 58 | #if wxUSE_UNICODE_UTF8 |
| 59 | struct WXDLLIMPEXP_BASE wxStringOperationsUtf8 |
| 60 | { |
| 61 | // checks correctness of UTF-8 sequence |
| 62 | static bool IsValidUtf8String(const char *c, |
| 63 | size_t len = wxStringImpl::npos); |
| 64 | static bool IsValidUtf8LeadByte(unsigned char c) |
| 65 | { |
| 66 | return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4); |
| 67 | } |
| 68 | |
| 69 | // table of offsets to skip forward when iterating over UTF-8 sequence |
| 70 | static const unsigned char ms_utf8IterTable[256]; |
| 71 | |
| 72 | |
| 73 | template<typename Iterator> |
| 74 | static void IncIter(Iterator& i) |
| 75 | { |
| 76 | wxASSERT( IsValidUtf8LeadByte(*i) ); |
| 77 | i += ms_utf8IterTable[(unsigned char)*i]; |
| 78 | } |
| 79 | |
| 80 | template<typename Iterator> |
| 81 | static void DecIter(Iterator& i) |
| 82 | { |
| 83 | wxASSERT( IsValidUtf8LeadByte(*i) ); |
| 84 | |
| 85 | // Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in |
| 86 | // binary), so we just have to go back until we hit a byte that is |
| 87 | // either < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in |
| 88 | // binary; this includes some invalid values, but we can ignore it |
| 89 | // here, because we assume valid UTF-8 input for the purpose of |
| 90 | // efficient implementation). |
| 91 | --i; |
| 92 | while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ ) |
| 93 | --i; |
| 94 | } |
| 95 | |
| 96 | template<typename Iterator> |
| 97 | static Iterator AddToIter(const Iterator& i, ptrdiff_t n) |
| 98 | { |
| 99 | Iterator out(i); |
| 100 | |
| 101 | if ( n > 0 ) |
| 102 | { |
| 103 | for ( ptrdiff_t j = 0; j < n; ++j ) |
| 104 | IncIter(out); |
| 105 | } |
| 106 | else if ( n < 0 ) |
| 107 | { |
| 108 | for ( ptrdiff_t j = 0; j > n; --j ) |
| 109 | DecIter(out); |
| 110 | } |
| 111 | |
| 112 | return out; |
| 113 | } |
| 114 | |
| 115 | template<typename Iterator> |
| 116 | static ptrdiff_t DiffIters(Iterator i1, Iterator i2) |
| 117 | { |
| 118 | ptrdiff_t dist = 0; |
| 119 | |
| 120 | if ( i1 < i2 ) |
| 121 | { |
| 122 | while ( i1 != i2 ) |
| 123 | { |
| 124 | IncIter(i1); |
| 125 | dist--; |
| 126 | } |
| 127 | } |
| 128 | else if ( i2 < i1 ) |
| 129 | { |
| 130 | while ( i2 != i1 ) |
| 131 | { |
| 132 | IncIter(i2); |
| 133 | dist++; |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | return dist; |
| 138 | } |
| 139 | |
| 140 | // encodes the character as UTF-8: |
| 141 | typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer; |
| 142 | static Utf8CharBuffer EncodeChar(const wxUniChar& ch) |
| 143 | { return ch.AsUTF8(); } |
| 144 | |
| 145 | // returns n copies of ch encoded in UTF-8 string |
| 146 | static wxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch); |
| 147 | |
| 148 | // returns the length of UTF-8 encoding of the character with lead byte 'c' |
| 149 | static size_t GetUtf8CharLength(char c) |
| 150 | { |
| 151 | wxASSERT( IsValidUtf8LeadByte(c) ); |
| 152 | return ms_utf8IterTable[(unsigned char)c]; |
| 153 | } |
| 154 | |
| 155 | // decodes single UTF-8 character from UTF-8 string |
| 156 | static wxUniChar DecodeChar(wxStringImpl::const_iterator i) |
| 157 | { |
| 158 | if ( (unsigned char)*i < 0x80 ) |
| 159 | return (int)*i; |
| 160 | return DecodeNonAsciiChar(i); |
| 161 | } |
| 162 | |
| 163 | private: |
| 164 | static wxUniChar DecodeNonAsciiChar(wxStringImpl::const_iterator i); |
| 165 | }; |
| 166 | #endif // wxUSE_UNICODE_UTF8 |
| 167 | |
| 168 | |
| 169 | #if wxUSE_UNICODE_UTF8 |
| 170 | typedef wxStringOperationsUtf8 wxStringOperations; |
| 171 | #else |
| 172 | typedef wxStringOperationsWchar wxStringOperations; |
| 173 | #endif |
| 174 | |
#endif // _WX_WXSTRINGOPS_H__