]> git.saurik.com Git - wxWidgets.git/blob - src/stc/scintilla/src/UniConversion.cxx
Czech translations update from Zbyněk Schwarz.
[wxWidgets.git] / src / stc / scintilla / src / UniConversion.cxx
// Scintilla source code edit control
/** @file UniConversion.cxx
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
7
8 #include <stdlib.h>
9
10 #include "UniConversion.h"
11
// Boundaries of the UTF-16 surrogate code unit range used by the conversion
// functions in this file: any code unit in
// SURROGATE_LEAD_FIRST..SURROGATE_TRAIL_LAST is treated as half of a surrogate
// pair, and SURROGATE_TRAIL_FIRST is the base added to the low 10 bits when a
// trail surrogate is generated.
enum { SURROGATE_LEAD_FIRST = 0xD800 };
enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
enum { SURROGATE_TRAIL_LAST = 0xDFFF };
15
16 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
17 unsigned int len = 0;
18 for (unsigned int i = 0; i < tlen && uptr[i];) {
19 unsigned int uch = uptr[i];
20 if (uch < 0x80) {
21 len++;
22 } else if (uch < 0x800) {
23 len += 2;
24 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
25 (uch <= SURROGATE_TRAIL_LAST)) {
26 len += 4;
27 i++;
28 } else {
29 len += 3;
30 }
31 i++;
32 }
33 return len;
34 }
35
36 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
37 int k = 0;
38 for (unsigned int i = 0; i < tlen && uptr[i];) {
39 unsigned int uch = uptr[i];
40 if (uch < 0x80) {
41 putf[k++] = static_cast<char>(uch);
42 } else if (uch < 0x800) {
43 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
44 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
45 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
46 (uch <= SURROGATE_TRAIL_LAST)) {
47 // Half a surrogate pair
48 i++;
49 unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
50 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
51 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
52 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
53 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
54 } else {
55 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
56 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
57 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
58 }
59 i++;
60 }
61 putf[len] = '\0';
62 }
63
/**
 * Return the length in bytes of a UTF-8 sequence judged from its first byte.
 *
 * @param ch  First byte of the sequence.
 * @return 1 for bytes below 0x80, 2 for bytes below 0xE0, 3 for bytes below
 *         0xF0 and 4 for everything else.
 *
 * No validation is performed: bytes in 0x80..0xBF report 2 and every byte
 * from 0xF0 upwards reports 4.
 */
unsigned int UTF8CharLength(unsigned char ch) {
	if (ch < 0x80) {
		return 1;
	} else if (ch < 0xE0) {	// 0x80 + 0x40 + 0x20
		return 2;
	} else if (ch < 0xF0) {	// 0x80 + 0x40 + 0x20 + 0x10
		return 3;
	} else {
		return 4;
	}
}
75
/**
 * Count the UTF-16 code units needed to hold a UTF-8 string.
 *
 * @param s    UTF-8 bytes; only the first byte of each sequence is read.
 * @param len  Number of bytes in s.
 * @return Number of UTF-16 code units: one per sequence, two for sequences
 *         whose first byte is 0xF0 or above (they need a surrogate pair).
 *
 * The input is not validated and embedded NUL bytes do not stop the count.
 */
unsigned int UTF16Length(const char *s, unsigned int len) {
	unsigned int ulen = 0;
	for (unsigned int i = 0; i < len;) {
		const unsigned char ch = static_cast<unsigned char>(s[i]);
		unsigned int charLen;
		if (ch < 0x80) {
			charLen = 1;
		} else if (ch < 0x80 + 0x40 + 0x20) {
			charLen = 2;
		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
			charLen = 3;
		} else {
			charLen = 4;
			// Outside the BMP so an extra code unit is needed
			ulen++;
		}
		i += charLen;
		ulen++;
	}
	return ulen;
}
96
97 unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
98 unsigned int ui=0;
99 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
100 unsigned int i=0;
101 while ((i<len) && (ui<tlen)) {
102 unsigned char ch = us[i++];
103 if (ch < 0x80) {
104 tbuf[ui] = ch;
105 } else if (ch < 0x80 + 0x40 + 0x20) {
106 tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
107 ch = us[i++];
108 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
109 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
110 tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
111 ch = us[i++];
112 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
113 ch = us[i++];
114 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
115 } else {
116 // Outside the BMP so need two surrogates
117 int val = (ch & 0x7) << 18;
118 ch = us[i++];
119 val += (ch & 0x3F) << 12;
120 ch = us[i++];
121 val += (ch & 0x3F) << 6;
122 ch = us[i++];
123 val += (ch & 0x3F);
124 tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
125 ui++;
126 tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
127 }
128 ui++;
129 }
130 return ui;
131 }
132
// Table indexed by byte value (0..255) giving the length in bytes of the
// UTF-8 sequence introduced by that byte. The entries are zero until
// UTF8BytesOfLeadInitialise() has filled them in.
int UTF8BytesOfLead[256];
// Set once the table has been filled so the work is done only once.
static bool initialisedBytesOfLead = false;
135
// Return the length in bytes of the UTF-8 sequence introduced by leadByte.
// Values that cannot start a multi-byte sequence (below 0xC2, or 0xF5 and
// above) yield 1 so that they are treated as isolated bytes.
static int BytesFromLead(int leadByte) {
	if (leadByte < 0xC2) {
		// Single byte or invalid
		return 1;
	} else if (leadByte < 0xE0) {
		return 2;
	} else if (leadByte < 0xF0) {
		return 3;
	} else if (leadByte < 0xF5) {
		return 4;
	} else {
		// Characters longer than 4 bytes not possible in current UTF-8
		return 1;
	}
}
151
152 void UTF8BytesOfLeadInitialise() {
153 if (!initialisedBytesOfLead) {
154 for (int i=0;i<256;i++) {
155 UTF8BytesOfLead[i] = BytesFromLead(i);
156 }
157 initialisedBytesOfLead = true;
158 }
159 }
160
161 // Return both the width of the first character in the string and a status
162 // saying whether it is valid or invalid.
163 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
164 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
165 // reasonably treated as code points in some circumstances. They will, however,
166 // not have associated glyphs.
167 int UTF8Classify(const unsigned char *us, int len) {
168 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
169 if (*us < 0x80) {
170 // Single bytes easy
171 return 1;
172 } else if (*us > 0xf4) {
173 // Characters longer than 4 bytes not possible in current UTF-8
174 return UTF8MaskInvalid | 1;
175 } else if (*us >= 0xf0) {
176 // 4 bytes
177 if (len < 4)
178 return UTF8MaskInvalid | 1;
179 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
180 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
181 // *FFFE or *FFFF non-character
182 return UTF8MaskInvalid | 4;
183 }
184 if (*us == 0xf4) {
185 // Check if encoding a value beyond the last Unicode character 10FFFF
186 if (us[1] > 0x8f) {
187 return UTF8MaskInvalid | 1;
188 } else if (us[1] == 0x8f) {
189 if (us[2] > 0xbf) {
190 return UTF8MaskInvalid | 1;
191 } else if (us[2] == 0xbf) {
192 if (us[3] > 0xbf) {
193 return UTF8MaskInvalid | 1;
194 }
195 }
196 }
197 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
198 // Overlong
199 return UTF8MaskInvalid | 1;
200 }
201 return 4;
202 } else {
203 return UTF8MaskInvalid | 1;
204 }
205 } else if (*us >= 0xe0) {
206 // 3 bytes
207 if (len < 3)
208 return UTF8MaskInvalid | 1;
209 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
210 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
211 // Overlong
212 return UTF8MaskInvalid | 1;
213 }
214 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
215 // Surrogate
216 return UTF8MaskInvalid | 1;
217 }
218 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
219 // U+FFFE non-character - 3 bytes long
220 return UTF8MaskInvalid | 3;
221 }
222 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
223 // U+FFFF non-character - 3 bytes long
224 return UTF8MaskInvalid | 3;
225 }
226 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
227 // U+FDD0 .. U+FDEF
228 return UTF8MaskInvalid | 3;
229 }
230 return 3;
231 } else {
232 return UTF8MaskInvalid | 1;
233 }
234 } else if (*us >= 0xc2) {
235 // 2 bytes
236 if (len < 2)
237 return UTF8MaskInvalid | 1;
238 if (UTF8IsTrailByte(us[1])) {
239 return 2;
240 } else {
241 return UTF8MaskInvalid | 1;
242 }
243 } else {
244 // 0xc0 .. 0xc1 is overlong encoding
245 // 0x80 .. 0xbf is trail byte
246 return UTF8MaskInvalid | 1;
247 }
248 }