]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f | 3 | /******************************************************************** |
51004dcb | 4 | * COPYRIGHT: |
b331163b | 5 | * Copyright (c) 1998-2014, International Business Machines Corporation and |
b75a7d8f A |
6 | * others. All Rights Reserved. |
7 | ********************************************************************/ | |
8 | /* | |
51004dcb | 9 | * File utf8tst.c |
b75a7d8f A |
10 | * |
11 | * Modification History: | |
12 | * | |
13 | * Date Name Description | |
51004dcb | 14 | * 07/24/2000 Madhu Creation |
b75a7d8f A |
15 | ******************************************************************************* |
16 | */ | |
17 | ||
18 | #include "unicode/utypes.h" | |
19 | #include "unicode/utf8.h" | |
0f5d89e8 | 20 | #include "unicode/utf_old.h" |
b75a7d8f A |
21 | #include "cmemory.h" |
22 | #include "cintltst.h" | |
23 | ||
73c04bcf A |
24 | /* lenient UTF-8 ------------------------------------------------------------ */ |
25 | ||
26 | /* | |
27 | * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate | |
28 | * code points with their "natural" encoding. | |
29 | * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of | |
30 | * single surrogates. | |
31 | * | |
32 | * This is not conformant with UTF-8. | |
33 | * | |
34 | * Supplementary code points may be encoded as pairs of 3-byte sequences, but | |
35 | * the macros below do not attempt to assemble such pairs. | |
36 | */ | |
37 | ||
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 *
 * Reads the byte s[i] into c and post-increments i. A byte below 0x80 is the
 * result as is. A byte for which U8_IS_LEAD() is true is passed on to
 * utf8_nextCharSafeBody() with -2 as its final argument; any other byte
 * yields U_SENTINEL.
 *
 * s and c are evaluated more than once; i and c must be modifiable lvalues.
 * Every macro argument is parenthesized where it is used so that pointer
 * expressions such as "p + 1" are cast and forwarded as a whole.
 */
#define L8_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
73c04bcf | 48 | |
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 *
 * Pre-decrements i and reads the byte s[i] into c. A byte below 0x80 is the
 * result as is. A byte in 0x80..0xbf is passed on to utf8_prevCharSafeBody()
 * with -2 as its final argument; any byte above 0xbf yields U_SENTINEL.
 *
 * s and c are evaluated more than once; i and c must be modifiable lvalues.
 * Every macro argument is parenthesized where it is used so that pointer
 * expressions such as "p + 1" are cast and forwarded as a whole.
 */
#define L8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
73c04bcf A |
59 | |
60 | /* -------------------------------------------------------------------------- */ | |
61 | ||
0f5d89e8 A |
/*
 * Error-value macros that originate from the obsolete unicode/utf_old.h.
 * Some old test data in this file still refers to them, so each one gets a
 * fallback definition here unless it is already defined.
 */
#if !defined(UTF8_ERROR_VALUE_1)
#   define UTF8_ERROR_VALUE_1 0x15
#endif

#if !defined(UTF8_ERROR_VALUE_2)
#   define UTF8_ERROR_VALUE_2 0x9f
#endif

#if !defined(UTF_ERROR_VALUE)
#   define UTF_ERROR_VALUE 0xffff
#endif

/*
 * Nonzero when the low 16 bits of c are fffe or ffff, or when c equals one of
 * the two UTF-8 error values above. The argument is evaluated up to three times.
 */
#if !defined(UTF_IS_ERROR)
#   define UTF_IS_ERROR(c) \
        (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
#endif
76 | ||
77 | #if !U_HIDE_OBSOLETE_UTF_OLD_H | |
/*
 * Passes each of the first len bytes of uchars to log_err(), formatted as
 * "0xNN " (two hex digits followed by a space). Does nothing when len<=0.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    const uint8_t *cursor=uchars;
    int16_t remaining;
    for(remaining=len; remaining>0; --remaining){
        log_err("0x%02x ", *cursor);
        ++cursor;
    }
}
84 | #endif | |
b75a7d8f A |
85 | |
86 | static void TestCodeUnitValues(void); | |
87 | static void TestCharLength(void); | |
88 | static void TestGetChar(void); | |
89 | static void TestNextPrevChar(void); | |
51004dcb A |
90 | static void TestNulTerminated(void); |
91 | static void TestNextPrevNonCharacters(void); | |
92 | static void TestNextPrevCharUnsafe(void); | |
b75a7d8f | 93 | static void TestFwdBack(void); |
51004dcb | 94 | static void TestFwdBackUnsafe(void); |
b75a7d8f | 95 | static void TestSetChar(void); |
51004dcb | 96 | static void TestSetCharUnsafe(void); |
0f5d89e8 | 97 | static void TestTruncateIfIncomplete(void); |
b75a7d8f A |
98 | static void TestAppendChar(void); |
99 | static void TestAppend(void); | |
73c04bcf | 100 | static void TestSurrogates(void); |
b75a7d8f A |
101 | |
102 | void addUTF8Test(TestNode** root); | |
103 | ||
104 | void | |
105 | addUTF8Test(TestNode** root) | |
106 | { | |
51004dcb A |
107 | addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues"); |
108 | addTest(root, &TestCharLength, "utf8tst/TestCharLength"); | |
109 | addTest(root, &TestGetChar, "utf8tst/TestGetChar"); | |
110 | addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar"); | |
111 | addTest(root, &TestNulTerminated, "utf8tst/TestNulTerminated"); | |
112 | addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacters"); | |
113 | addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe"); | |
114 | addTest(root, &TestFwdBack, "utf8tst/TestFwdBack"); | |
115 | addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe"); | |
116 | addTest(root, &TestSetChar, "utf8tst/TestSetChar"); | |
117 | addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe"); | |
0f5d89e8 | 118 | addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete"); |
51004dcb A |
119 | addTest(root, &TestAppendChar, "utf8tst/TestAppendChar"); |
120 | addTest(root, &TestAppend, "utf8tst/TestAppend"); | |
121 | addTest(root, &TestSurrogates, "utf8tst/TestSurrogates"); | |
b75a7d8f A |
122 | } |
123 | ||
/*
 * Checks the single/lead/trail byte classification macros against a fixed
 * table of bytes. Both the U8_IS_* macros and, unless
 * U_HIDE_OBSOLETE_UTF_OLD_H is set, the obsolete UTF8_IS_* macros must report
 * exactly the class the table position implies.
 */
static void TestCodeUnitValues()
{
    /* Positions 0..3: single bytes; 4..7: lead bytes; 8..11: trail bytes. */
    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};

    int16_t pos;
    for(pos=0; pos<UPRV_LENGTHOF(codeunit); ++pos) {
        const uint8_t unit=codeunit[pos];
        const int single=U8_IS_SINGLE(unit) ? 1 : 0;
        const int lead=U8_IS_LEAD(unit) ? 1 : 0;
        const int trail=U8_IS_TRAIL(unit) ? 1 : 0;
        /* Without the obsolete header these simply mirror the U8_IS_* answers. */
        int oldSingle=single;
        int oldLead=lead;
        int oldTrail=trail;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
        oldSingle=UTF8_IS_SINGLE(unit) ? 1 : 0;
        oldLead=UTF8_IS_LEAD(unit) ? 1 : 0;
        oldTrail=UTF8_IS_TRAIL(unit) ? 1 : 0;
#endif
        log_verbose("Testing code unit value of %x\n", unit);

        /* The table is laid out in groups of four bytes per class. */
        switch(pos/4) {
        case 0:
            if(!single || !oldSingle || lead || oldLead || trail || oldTrail) {
                log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
                        unit, single ? 'y' : 'n', lead ? 'y' : 'n', trail ? 'y' : 'n');
            }
            break;
        case 1:
            if(!lead || !oldLead || single || oldSingle || trail || oldTrail) {
                log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
                        unit, single ? 'y' : 'n', lead ? 'y' : 'n', trail ? 'y' : 'n');
            }
            break;
        case 2:
            if(!trail || !oldTrail || single || oldSingle || lead || oldLead) {
                log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
                        unit, single ? 'y' : 'n', lead ? 'y' : 'n', trail ? 'y' : 'n');
            }
            break;
        default:
            break;
        }
    }
}
162 | ||
163 | static void TestCharLength() | |
164 | { | |
165 | static const uint32_t codepoint[]={ | |
166 | 1, 0x0061, | |
167 | 1, 0x007f, | |
168 | 2, 0x016f, | |
169 | 2, 0x07ff, | |
170 | 3, 0x0865, | |
171 | 3, 0x20ac, | |
172 | 4, 0x20402, | |
173 | 4, 0x23456, | |
174 | 4, 0x24506, | |
175 | 4, 0x20402, | |
176 | 4, 0x10402, | |
177 | 3, 0xd7ff, | |
178 | 3, 0xe000, | |
51004dcb | 179 | |
b75a7d8f | 180 | }; |
51004dcb | 181 | |
b75a7d8f | 182 | int16_t i; |
0f5d89e8 | 183 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
b75a7d8f | 184 | UBool multiple; |
0f5d89e8 | 185 | #endif |
b331163b | 186 | for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){ |
b75a7d8f | 187 | UChar32 c=codepoint[i+1]; |
0f5d89e8 A |
188 | if( |
189 | #if !U_HIDE_OBSOLETE_UTF_OLD_H | |
190 | UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || | |
191 | #endif | |
192 | U8_LENGTH(c) != (uint16_t)codepoint[i]) { | |
193 | log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], U8_LENGTH(c)); | |
b75a7d8f | 194 | }else{ |
0f5d89e8 | 195 | log_verbose("The no: of code units for %lx is %d\n",c, U8_LENGTH(c)); |
b75a7d8f | 196 | } |
0f5d89e8 | 197 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
b75a7d8f A |
198 | multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE); |
199 | if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){ | |
200 | log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c); | |
201 | } | |
0f5d89e8 | 202 | #endif |
b75a7d8f A |
203 | } |
204 | } | |
205 | ||
206 | static void TestGetChar() | |
207 | { | |
208 | static const uint8_t input[]={ | |
209 | /* code unit,*/ | |
210 | 0x61, | |
211 | 0x7f, | |
212 | 0xe4, | |
51004dcb | 213 | 0xba, |
b75a7d8f | 214 | 0x8c, |
51004dcb A |
215 | 0xF0, |
216 | 0x90, | |
217 | 0x90, | |
b75a7d8f A |
218 | 0x81, |
219 | 0xc0, | |
220 | 0x65, | |
221 | 0x31, | |
222 | 0x9a, | |
223 | 0xc9 | |
224 | }; | |
225 | static const UChar32 result[]={ | |
51004dcb A |
226 | /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */ |
227 | 0x61, 0x61, 0x61, | |
228 | 0x7f, 0x7f, 0x7f, | |
b75a7d8f A |
229 | 0x4e8c, 0x4e8c, 0x4e8c, |
230 | 0x4e8c, 0x4e8c, 0x4e8c , | |
231 | 0x4e8c, 0x4e8c, 0x4e8c, | |
232 | 0x10401, 0x10401, 0x10401 , | |
233 | 0x10401, 0x10401, 0x10401 , | |
234 | 0x10401, 0x10401, 0x10401 , | |
235 | 0x10401, 0x10401, 0x10401, | |
0f5d89e8 | 236 | -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, |
51004dcb A |
237 | 0x65, 0x65, 0x65, |
238 | 0x31, 0x31, 0x31, | |
0f5d89e8 A |
239 | -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, |
240 | -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1 | |
b75a7d8f A |
241 | }; |
242 | uint16_t i=0; | |
51004dcb | 243 | UChar32 c, expected; |
b75a7d8f A |
244 | uint32_t offset=0; |
245 | ||
246 | for(offset=0; offset<sizeof(input); offset++) { | |
0f5d89e8 A |
247 | expected = result[i]; |
248 | if (expected >= 0 && offset < sizeof(input) - 1) { | |
249 | #if !U_HIDE_OBSOLETE_UTF_OLD_H | |
b75a7d8f | 250 | UTF8_GET_CHAR_UNSAFE(input, offset, c); |
0f5d89e8 A |
251 | if(c != expected) { |
252 | log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", | |
253 | offset, expected, c); | |
51004dcb | 254 | |
b75a7d8f | 255 | } |
0f5d89e8 | 256 | #endif |
b75a7d8f | 257 | U8_GET_UNSAFE(input, offset, c); |
0f5d89e8 A |
258 | if(c != expected) { |
259 | log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", | |
260 | offset, expected, c); | |
51004dcb | 261 | |
b75a7d8f A |
262 | } |
263 | } | |
51004dcb | 264 | expected=result[i+1]; |
0f5d89e8 A |
265 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
266 | UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE); | |
51004dcb A |
267 | if(c != expected){ |
268 | log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
269 | } | |
0f5d89e8 | 270 | #endif |
b75a7d8f | 271 | U8_GET(input, 0, offset, sizeof(input), c); |
51004dcb A |
272 | if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } |
273 | if(c != expected){ | |
274 | log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
b75a7d8f A |
275 | } |
276 | ||
51004dcb A |
277 | U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c); |
278 | if(expected<0) { expected=0xfffd; } | |
279 | if(c != expected){ | |
280 | log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
b75a7d8f | 281 | } |
0f5d89e8 | 282 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
b75a7d8f A |
283 | UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE); |
284 | if(c != result[i+2]){ | |
285 | log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c); | |
286 | } | |
0f5d89e8 | 287 | #endif |
51004dcb | 288 | i=(uint16_t)(i+3); |
b75a7d8f A |
289 | } |
290 | } | |
291 | ||
51004dcb | 292 | static void TestNextPrevChar() { |
0f5d89e8 A |
293 | static const uint8_t input[]={ |
294 | 0x61, | |
295 | 0xf0, 0x90, 0x90, 0x81, | |
296 | 0xc0, 0x80, // non-shortest form | |
297 | 0xf3, 0xbe, // truncated | |
298 | 0xc2, // truncated | |
299 | 0x61, | |
300 | 0x81, 0x90, 0x90, 0xf0, // "backwards" sequence | |
301 | 0x00 | |
302 | }; | |
b75a7d8f | 303 | static const UChar32 result[]={ |
0f5d89e8 A |
304 | /* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */ |
305 | 0x0061, 0x0061, 0x0000, 0x0000, | |
306 | 0x10401, 0x10401, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
307 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
308 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
309 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
310 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x61, 0x61, | |
311 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
312 | UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, | |
313 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
314 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
315 | 0x61, 0x61, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
316 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, | |
317 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE, | |
318 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, | |
319 | UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
320 | 0x0000, 0x0000, 0x0061, 0x0061 | |
b75a7d8f A |
321 | }; |
322 | static const int32_t movedOffset[]={ | |
0f5d89e8 A |
323 | /* next_safe prev_safe_s */ |
324 | 1, 15, | |
325 | 5, 14, | |
326 | 3, 13, | |
327 | 4, 12, | |
328 | 5, 11, | |
329 | 6, 10, | |
330 | 7, 9, | |
331 | 9, 7, | |
332 | 9, 7, | |
333 | 10, 6, | |
334 | 11, 5, | |
335 | 12, 1, | |
336 | 13, 1, | |
337 | 14, 1, | |
338 | 15, 1, | |
339 | 16, 0, | |
b75a7d8f A |
340 | }; |
341 | ||
51004dcb | 342 | UChar32 c, expected; |
0f5d89e8 | 343 | uint32_t i=0, j=0; |
b75a7d8f A |
344 | uint32_t offset=0; |
345 | int32_t setOffset=0; | |
346 | for(offset=0; offset<sizeof(input); offset++){ | |
0f5d89e8 A |
347 | expected=result[i]; // next_safe_ns |
348 | #if !U_HIDE_OBSOLETE_UTF_OLD_H | |
349 | setOffset=offset; | |
350 | UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE); | |
351 | if(setOffset != movedOffset[j]) { | |
352 | log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", | |
353 | offset, movedOffset[j], setOffset); | |
354 | } | |
355 | if(c != expected) { | |
356 | log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
357 | } | |
358 | #endif | |
359 | setOffset=offset; | |
360 | U8_NEXT(input, setOffset, sizeof(input), c); | |
361 | if(setOffset != movedOffset[j]) { | |
362 | log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", | |
363 | offset, movedOffset[j], setOffset); | |
51004dcb | 364 | } |
51004dcb | 365 | if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } |
0f5d89e8 A |
366 | if(c != expected) { |
367 | log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
51004dcb A |
368 | } |
369 | ||
370 | setOffset=offset; | |
371 | U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c); | |
0f5d89e8 | 372 | if(setOffset != movedOffset[j]) { |
51004dcb | 373 | log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", |
0f5d89e8 | 374 | offset, movedOffset[j], setOffset); |
51004dcb A |
375 | } |
376 | if(expected<0) { expected=0xfffd; } | |
0f5d89e8 A |
377 | if(c != expected) { |
378 | log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
51004dcb | 379 | } |
0f5d89e8 A |
380 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
381 | setOffset=offset; | |
382 | UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE); | |
383 | if(setOffset != movedOffset[j]) { | |
384 | log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", | |
385 | offset, movedOffset[j], setOffset); | |
386 | } | |
387 | expected=result[i+1]; // next_safe_s | |
388 | if(c != expected) { | |
389 | log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n", | |
390 | offset, expected, c); | |
391 | } | |
392 | #endif | |
393 | i=i+4; | |
394 | j=j+2; | |
b75a7d8f A |
395 | } |
396 | ||
0f5d89e8 | 397 | i=j=0; |
b75a7d8f | 398 | for(offset=sizeof(input); offset > 0; --offset){ |
0f5d89e8 A |
399 | expected=result[i+2]; // prev_safe_ns |
400 | #if !U_HIDE_OBSOLETE_UTF_OLD_H | |
401 | setOffset=offset; | |
402 | UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); | |
403 | if(setOffset != movedOffset[j+1]) { | |
404 | log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", | |
405 | offset, movedOffset[j+1], setOffset); | |
406 | } | |
407 | if(c != expected) { | |
408 | log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
409 | } | |
410 | #endif | |
411 | setOffset=offset; | |
412 | U8_PREV(input, 0, setOffset, c); | |
413 | if(setOffset != movedOffset[j+1]) { | |
414 | log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", | |
415 | offset, movedOffset[j+1], setOffset); | |
51004dcb | 416 | } |
51004dcb | 417 | if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } |
0f5d89e8 A |
418 | if(c != expected) { |
419 | log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
51004dcb A |
420 | } |
421 | ||
422 | setOffset=offset; | |
423 | U8_PREV_OR_FFFD(input, 0, setOffset, c); | |
0f5d89e8 | 424 | if(setOffset != movedOffset[j+1]) { |
51004dcb | 425 | log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", |
0f5d89e8 | 426 | offset, movedOffset[j+1], setOffset); |
51004dcb A |
427 | } |
428 | if(expected<0) { expected=0xfffd; } | |
0f5d89e8 A |
429 | if(c != expected) { |
430 | log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); | |
51004dcb | 431 | } |
0f5d89e8 A |
432 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
433 | setOffset=offset; | |
434 | UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); | |
435 | if(setOffset != movedOffset[j+1]) { | |
436 | log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", | |
437 | offset, movedOffset[j+1], setOffset); | |
438 | } | |
439 | expected=result[i+3]; // prev_safe_s | |
440 | if(c != expected) { | |
441 | log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n", | |
442 | offset, expected, c); | |
443 | } | |
444 | #endif | |
445 | i=i+4; | |
446 | j=j+2; | |
b75a7d8f | 447 | } |
51004dcb | 448 | } |
b75a7d8f | 449 | |
51004dcb A |
450 | /* keep this in sync with utf16tst.c's TestNulTerminated() */ |
451 | static void TestNulTerminated() { | |
452 | static const uint8_t input[]={ | |
453 | /* 0 */ 0x61, | |
454 | /* 1 */ 0xf0, 0x90, 0x90, 0x81, | |
0f5d89e8 A |
455 | /* 5 */ 0xc0, |
456 | /* 6 */ 0x80, | |
51004dcb A |
457 | /* 7 */ 0xdf, 0x80, |
458 | /* 9 */ 0xc2, | |
459 | /* 10 */ 0x62, | |
0f5d89e8 A |
460 | /* 11 */ 0xfd, |
461 | /* 12 */ 0xbe, | |
51004dcb A |
462 | /* 13 */ 0xe0, 0xa0, 0x80, |
463 | /* 16 */ 0xe2, 0x82, 0xac, | |
464 | /* 19 */ 0xf0, 0x90, 0x90, | |
465 | /* 22 */ 0x00 | |
466 | /* 23 */ | |
467 | }; | |
468 | static const UChar32 result[]={ | |
469 | 0x61, | |
470 | 0x10401, | |
0f5d89e8 A |
471 | U_SENTINEL, // C0 not a lead byte |
472 | U_SENTINEL, // 80 | |
51004dcb | 473 | 0x7c0, |
0f5d89e8 | 474 | U_SENTINEL, // C2 |
51004dcb | 475 | 0x62, |
0f5d89e8 A |
476 | U_SENTINEL, // FD not a lead byte |
477 | U_SENTINEL, // BE | |
51004dcb A |
478 | 0x800, |
479 | 0x20ac, | |
0f5d89e8 | 480 | U_SENTINEL, // truncated F0 90 90 |
51004dcb A |
481 | 0 |
482 | }; | |
483 | ||
484 | UChar32 c, c2, expected; | |
485 | int32_t i0, i=0, j, k, expectedIndex; | |
486 | int32_t cpIndex=0; | |
487 | do { | |
488 | i0=i; | |
489 | U8_NEXT(input, i, -1, c); | |
490 | expected=result[cpIndex]; | |
491 | if(c!=expected) { | |
492 | log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected); | |
493 | } | |
494 | j=i0; | |
495 | U8_NEXT_OR_FFFD(input, j, -1, c); | |
496 | if(expected<0) { expected=0xfffd; } | |
497 | if(c!=expected) { | |
498 | log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected); | |
499 | } | |
500 | if(j!=i) { | |
501 | log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i); | |
b75a7d8f | 502 | } |
51004dcb A |
503 | j=i0; |
504 | U8_FWD_1(input, j, -1); | |
505 | if(j!=i) { | |
506 | log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i); | |
507 | } | |
508 | ++cpIndex; | |
509 | /* | |
510 | * Move by this many code points from the start. | |
511 | * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary. | |
512 | */ | |
513 | expectedIndex= (c==0) ? i-1 : i; | |
514 | k=0; | |
515 | U8_FWD_N(input, k, -1, cpIndex); | |
516 | if(k!=expectedIndex) { | |
517 | log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex); | |
518 | } | |
519 | } while(c!=0); | |
520 | ||
521 | i=0; | |
522 | do { | |
523 | j=i0=i; | |
524 | U8_NEXT(input, i, -1, c); | |
525 | do { | |
526 | U8_GET(input, 0, j, -1, c2); | |
527 | if(c2!=c) { | |
528 | log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j); | |
529 | } | |
530 | U8_GET_OR_FFFD(input, 0, j, -1, c2); | |
531 | expected= (c>=0) ? c : 0xfffd; | |
532 | if(c2!=expected) { | |
533 | log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j); | |
534 | } | |
535 | /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */ | |
536 | k=j+1; | |
537 | U8_SET_CP_LIMIT(input, 0, k, -1); | |
538 | if(k!=i) { | |
539 | log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k); | |
b75a7d8f | 540 | } |
51004dcb A |
541 | } while(++j<i); |
542 | } while(c!=0); | |
543 | } | |
544 | ||
545 | static void TestNextPrevNonCharacters() { | |
546 | /* test non-characters */ | |
547 | static const uint8_t nonChars[]={ | |
548 | 0xef, 0xb7, 0x90, /* U+fdd0 */ | |
549 | 0xef, 0xbf, 0xbf, /* U+feff */ | |
550 | 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */ | |
551 | 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */ | |
552 | 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */ | |
553 | }; | |
554 | ||
555 | UChar32 ch; | |
556 | int32_t idx; | |
557 | ||
558 | for(idx=0; idx<(int32_t)sizeof(nonChars);) { | |
559 | U8_NEXT(nonChars, idx, sizeof(nonChars), ch); | |
560 | if(!U_IS_UNICODE_NONCHAR(ch)) { | |
561 | log_err("U8_NEXT(before %d) failed to read a non-character\n", idx); | |
562 | } | |
563 | } | |
564 | for(idx=(int32_t)sizeof(nonChars); idx>0;) { | |
565 | U8_PREV(nonChars, 0, idx, ch); | |
566 | if(!U_IS_UNICODE_NONCHAR(ch)) { | |
567 | log_err("U8_PREV(at %d) failed to read a non-character\n", idx); | |
b75a7d8f A |
568 | } |
569 | } | |
0f5d89e8 A |
570 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
571 | for(idx=0; idx<(int32_t)sizeof(nonChars);) { | |
572 | UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff; | |
573 | UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE); | |
574 | if(ch!=expected) { | |
575 | log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx); | |
576 | } | |
577 | } | |
578 | for(idx=(int32_t)sizeof(nonChars); idx>0;) { | |
579 | UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE); | |
580 | UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff; | |
581 | if(ch!=expected) { | |
582 | log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx); | |
583 | } | |
584 | } | |
585 | #endif | |
b75a7d8f A |
586 | } |
587 | ||
51004dcb A |
588 | static void TestNextPrevCharUnsafe() { |
589 | /* | |
590 | * Use a (mostly) well-formed UTF-8 string and test at code point boundaries. | |
591 | * The behavior of _UNSAFE macros for ill-formed strings is undefined. | |
592 | */ | |
593 | static const uint8_t input[]={ | |
594 | 0x61, | |
595 | 0xf0, 0x90, 0x90, 0x81, | |
596 | 0xc0, 0x80, /* non-shortest form */ | |
597 | 0xe2, 0x82, 0xac, | |
598 | 0xc2, 0xa1, | |
599 | 0xf4, 0x8f, 0xbf, 0xbf, | |
600 | 0x00 | |
601 | }; | |
602 | static const UChar32 codePoints[]={ | |
603 | 0x61, | |
604 | 0x10401, | |
0f5d89e8 | 605 | -1, |
51004dcb A |
606 | 0x20ac, |
607 | 0xa1, | |
608 | 0x10ffff, | |
609 | 0 | |
610 | }; | |
611 | ||
0f5d89e8 | 612 | UChar32 c, expected; |
51004dcb A |
613 | int32_t i; |
614 | uint32_t offset; | |
0f5d89e8 | 615 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
51004dcb A |
616 | for(i=0, offset=0; offset<sizeof(input); ++i) { |
617 | UTF8_NEXT_CHAR_UNSAFE(input, offset, c); | |
0f5d89e8 A |
618 | expected = codePoints[i]; |
619 | if(expected >= 0 && c != expected) { | |
51004dcb | 620 | log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", |
0f5d89e8 A |
621 | offset, expected, c); |
622 | } | |
623 | if(offset==6) { | |
624 | // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes | |
625 | // while the new one skips C0 80 together. | |
626 | ++offset; | |
51004dcb A |
627 | } |
628 | } | |
0f5d89e8 | 629 | #endif |
51004dcb A |
630 | for(i=0, offset=0; offset<sizeof(input); ++i) { |
631 | U8_NEXT_UNSAFE(input, offset, c); | |
0f5d89e8 A |
632 | expected = codePoints[i]; |
633 | if(expected >= 0 && c != expected) { | |
51004dcb | 634 | log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", |
0f5d89e8 | 635 | offset, expected, c); |
51004dcb A |
636 | } |
637 | } | |
0f5d89e8 | 638 | #if !U_HIDE_OBSOLETE_UTF_OLD_H |
b331163b | 639 | for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ |
0f5d89e8 A |
640 | UTF8_PREV_CHAR_UNSAFE(input, offset, c); |
641 | expected = codePoints[i]; | |
642 | if(expected >= 0 && c != expected) { | |
643 | log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", | |
644 | offset, expected, c); | |
645 | } | |
51004dcb | 646 | } |
0f5d89e8 | 647 | #endif |
b331163b | 648 | for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ |
0f5d89e8 A |
649 | U8_PREV_UNSAFE(input, offset, c); |
650 | expected = codePoints[i]; | |
651 | if(expected >= 0 && c != expected) { | |
652 | log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", | |
653 | offset, expected, c); | |
654 | } | |
51004dcb A |
655 | } |
656 | } | |
657 | ||
/*
 * Moves an offset through a fixed, partly ill-formed UTF-8 buffer with
 * U8_FWD_1, U8_BACK_1, U8_FWD_N and U8_BACK_N (plus the obsolete
 * UTF8_*_SAFE counterparts unless U_HIDE_OBSOLETE_UTF_OLD_H is set) and
 * compares the offset after every step with the expected tables.
 *
 * The values handed to log_err() are cast to the exact types that the
 * %d / %ld conversions expect; passing a uint32_t for %d or %ld through a
 * variable argument list is a type mismatch.
 */
static void TestFwdBack() {
    static const uint8_t input[]={
        0x61,
        0xF0, 0x90, 0x90, 0x81,
        0xff,
        0x62,
        0xc0,
        0x80,
        0x7f,
        0x8f,
        0xc0,
        0x63,
        0x81,
        0x90,
        0x90,
        0xF0,
        0x00
    };
    /* Expected offsets after each single step forward / backward. */
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};

    /* Step counts for the _N macros and the expected offsets after each call. */
    static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], (int)offsafe);
        }
        i++;
    }
#endif
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], (int)offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], (int)offsafe);
        }
        i++;
    }
#endif
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], (int)offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* The _N passes keep offsafe between calls, so the steps accumulate. */
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", (int)i, fwd_N_safe[i], (int)offsafe);
        }

    }
#endif
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", (int)i, fwd_N_safe[i], (int)offsafe);
        }

    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", (int)i, back_N_safe[i], (long)offsafe);
        }
    }
#endif
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", (int)i, back_N_safe[i], (long)offsafe);
        }
    }
}
759 | ||
0f5d89e8 A |
760 | /** |
761 | * Ticket #13636 - Visual Studio 2017 has problems optimizing this function. | |
762 | * As a workaround, we will turn off optimization just for this function on VS2017 and above. | |
763 | */ | |
764 | #if defined(_MSC_VER) && (_MSC_VER > 1900) | |
765 | #pragma optimize( "", off ) | |
766 | #endif | |
767 | ||
51004dcb A |
/*
 * Steps through a (mostly) well-formed UTF-8 byte string with the _UNSAFE
 * forward/backward macros and reports every step that does not land on the
 * expected boundary.
 * Only code point boundaries are used as starting points because the
 * behavior of the _UNSAFE macros for ill-formed strings is undefined.
 */
static void TestFwdBackUnsafe() {
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80, /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    /* going forward, the unsafe macros skip only the C0 byte */
    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
    /* going backward, the unsafe macros skip C0 80 together */
    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    const int32_t inputLength=UPRV_LENGTHOF(input);
    const int32_t fwdCount=UPRV_LENGTHOF(boundaries);
    const int32_t backCount=UPRV_LENGTHOF(backBoundaries);
    int32_t pos;
    int32_t step;
    int32_t expectedIndex;

    /* single steps forward: the k-th step must reach boundaries[k] */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    step=1;
    pos=0;
    while(pos<inputLength) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
#endif
    step=1;
    pos=0;
    while(pos<inputLength) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }

    /* single steps backward, starting from the end of the input */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    expectedIndex=backCount-2;
    pos=inputLength;
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[expectedIndex]) {
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[expectedIndex], pos);
        }
        --expectedIndex;
    }
#endif
    expectedIndex=backCount-2;
    pos=inputLength;
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[expectedIndex]) {
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[expectedIndex], pos);
        }
        --expectedIndex;
    }

    /* n steps forward from the start must reach boundaries[n] */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    step=0;
    while(step<fwdCount) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
#endif
    step=0;
    while(step<fwdCount) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }

    /* n steps backward from the end must reach the n-th boundary from the end */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(expectedIndex=backCount-1; expectedIndex>=0; --expectedIndex) {
        step=backCount-1-expectedIndex;
        pos=inputLength;
        UTF8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[expectedIndex]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[expectedIndex], pos);
        }
    }
#endif
    for(expectedIndex=backCount-1; expectedIndex>=0; --expectedIndex) {
        step=backCount-1-expectedIndex;
        pos=inputLength;
        U8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[expectedIndex]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[expectedIndex], pos);
        }
    }
}
852 | ||
0f5d89e8 A |
853 | /** |
854 | * Ticket #13636 - Turn optimization back on. | |
855 | */ | |
856 | #if defined(_MSC_VER) && (_MSC_VER > 1900) | |
857 | #pragma optimize( "", on ) | |
858 | #endif | |
859 | ||
/*
 * Checks the bounds-checked "set to code point start/limit" macros at every
 * offset of a byte string that mixes valid sequences, stray trail bytes and
 * truncated lead bytes.
 *
 * start_safe[k] / limit_safe[k] hold the offset expected after adjusting
 * offset k. The start macros are only exercised for offsets inside the
 * input (the surrounding if excludes offset==length); the limit macros are
 * also exercised at offset==length.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    static const int16_t limit_safe[]
        = {0, 1, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };

    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[offset]){
                /* %ld requires long arguments; cast the narrower integers explicitly */
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[offset]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[offset], (long)setOffset);
            }
        }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, (int32_t)sizeof(input));
        if(setOffset != limit_safe[offset]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[offset], (long)setOffset);
        }
#endif
        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, (int32_t)sizeof(input));
        if(setOffset != limit_safe[offset]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[offset], (long)setOffset);
        }
    }
}
b75a7d8f | 901 | |
51004dcb A |
/*
 * Checks the unchecked ("unsafe") "set to code point start/limit" macros at
 * every offset of the input.
 *
 * start_unsafe[k] / limit_unsafe[k] hold the offset expected after adjusting
 * offset k. The start macros are skipped at offset==length and the limit
 * macros are skipped at offset 0, so that the macros are not applied at the
 * very ends of the array.
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 9, 9, 12, 12, 12, 15 };
    static const int16_t limit_unsafe[]
        = {0, 1, 4, 4, 4, 5, 6, 7, 9, 9, 10, 10, 10, 15, 15, 15, 16 };

    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                /* %ld requires long arguments; cast the narrower integers explicitly */
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
        }

        if (offset != 0) { /* Can't have it go off the end of the array */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
        }
    }
}
946 | ||
0f5d89e8 A |
/*
 * Table-driven check of U8_TRUNCATE_IF_INCOMPLETE(): each entry pairs a
 * NUL-terminated byte string with the length expected after the macro has
 * been applied to the whole string.
 */
static void TestTruncateIfIncomplete() {
    // Difference from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length].
    // Therefore, if the last byte is a lead byte, then this macro truncates
    // even if the byte at the input index cannot continue a valid sequence
    // (including when that is not a trail byte).
    // On the other hand, if the last byte is a trail byte, then the two macros behave the same.
    static const struct {
        const char *s;
        int32_t expected;
    } cases[] = {
        /* length 0 and 1 */
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        /* length 2 */
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        /* length 3 */
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        /* length 4 */
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    const int32_t caseCount = UPRV_LENGTHOF(cases);
    int32_t index = 0;

    while (index < caseCount) {
        const char *text = cases[index].s;
        const int32_t fullLength = (int32_t)strlen(text);
        int32_t truncated = fullLength;

        U8_TRUNCATE_IF_INCOMPLETE(text, 0, truncated);
        if (truncated != cases[index].expected) {
            log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                    (int)index, (int)fullLength, (int)cases[index].expected, (int)truncated);
        }
        ++index;
    }
}
1004 | ||
/*
 * Exercises the obsolete UTF8_APPEND_CHAR_UNSAFE / UTF8_APPEND_CHAR_SAFE
 * macros. Each test case copies the 11-byte template s into a scratch
 * buffer, appends one code point at a given position, and then compares
 * both the advanced offset and the first 11 buffer bytes with the
 * expected values.
 *
 * The tables test[], movedOffset[] and result[] are parallel: the first half
 * of each holds the cases for the unsafe macro, the second half the cases
 * for the safe macro.
 *
 * The whole body is compiled out when U_HIDE_OBSOLETE_UTF_OLD_H is set.
 */
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
        /* append-position(unsafe), CHAR to be appended */
        0,  0x10401,
        2,  0x0028,
        2,  0x007f,
        3,  0xd801,
        1,  0x20402,
        8,  0x10401,
        5,  0xc0,
        5,  0xc1,
        5,  0xfd,
        6,  0x80,
        6,  0x81,
        6,  0xbf,
        7,  0xfe,

        /* append-position(safe), CHAR to be appended */
        0,  0x10401,
        2,  0x0028,
        3,  0x7f,
        3,  0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,  0x20402,
        9,  0x10401,
        5,  0xc0,
        5,  0xc1,
        5,  0xfd,
        6,  0x80,
        6,  0x81,
        6,  0xbf,
        7,  0xfe,

    };
    static const uint16_t movedOffset[]={
        /* offset-moved-to(unsafe) */
        4,              /*for append-pos: 0 , CHAR 0x10401*/
        3,
        3,
        6,
        5,
        12,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

        /* offset-moved-to(safe) */
        4,              /*for append-pos: 0, CHAR 0x10401*/
        3,
        4,
        6,
        5,
        11,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    /*
     * test[] holds (position, code point) pairs; the first half of the pairs
     * belongs to the unsafe macro, so the number of unsafe cases is a quarter
     * of the number of array elements.
     */
    const uint16_t unsafeCount=(uint16_t)(UPRV_LENGTHOF(test)/4);
    uint16_t i, count=0;
    /* one byte larger than s: the unsafe append at position 8 writes 4 bytes */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=UPRV_LENGTHOF(s);
    for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){
        uprv_memcpy(str, s, size);
        offset=test[i];
        if(count<unsafeCount){
            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
            if(offset != movedOffset[count]){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    count, movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot:      ");
                printUChars(str, size);
                log_err("\n");
            }
        }else{
            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
            if(offset != movedOffset[count]){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    count, movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot:     ");
                printUChars(str, size);
                log_err("\n");
            }
        }
        count++;
    }
#endif
}
1173 | ||
1174 | static void TestAppend() { | |
1175 | static const UChar32 codePoints[]={ | |
1176 | 0x61, 0xdf, 0x901, 0x3040, | |
1177 | 0xac00, 0xd800, 0xdbff, 0xdcde, | |
1178 | 0xdffd, 0xe000, 0xffff, 0x10000, | |
1179 | 0x12345, 0xe0021, 0x10ffff, 0x110000, | |
1180 | 0x234567, 0x7fffffff, -1, -1000, | |
1181 | 0, 0x400 | |
1182 | }; | |
1183 | static const uint8_t expectUnsafe[]={ | |
1184 | 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, | |
1185 | 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e, | |
1186 | 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80, | |
1187 | 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */ | |
1188 | /* none from this line */ | |
1189 | 0, 0xd0, 0x80 | |
1190 | }, expectSafe[]={ | |
1191 | 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, | |
1192 | 0xea, 0xb0, 0x80, /* no surrogates */ | |
1193 | /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80, | |
1194 | 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */ | |
1195 | /* none from this line */ | |
1196 | 0, 0xd0, 0x80 | |
1197 | }; | |
1198 | ||
1199 | uint8_t buffer[100]; | |
1200 | UChar32 c; | |
1201 | int32_t i, length; | |
1202 | UBool isError, expectIsError, wrongIsError; | |
1203 | ||
1204 | length=0; | |
b331163b | 1205 | for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) { |
b75a7d8f A |
1206 | c=codePoints[i]; |
1207 | if(c<0 || 0x10ffff<c) { | |
1208 | continue; /* skip non-code points for U8_APPEND_UNSAFE */ | |
1209 | } | |
1210 | ||
1211 | U8_APPEND_UNSAFE(buffer, length, c); | |
1212 | } | |
b331163b | 1213 | if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) { |
b75a7d8f A |
1214 | log_err("U8_APPEND_UNSAFE did not generate the expected output\n"); |
1215 | } | |
1216 | ||
1217 | length=0; | |
1218 | wrongIsError=FALSE; | |
b331163b | 1219 | for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) { |
b75a7d8f A |
1220 | c=codePoints[i]; |
1221 | expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c); | |
1222 | isError=FALSE; | |
1223 | ||
b331163b | 1224 | U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError); |
b75a7d8f A |
1225 | wrongIsError|= isError!=expectIsError; |
1226 | } | |
1227 | if(wrongIsError) { | |
1228 | log_err("U8_APPEND did not set isError correctly\n"); | |
1229 | } | |
b331163b | 1230 | if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) { |
b75a7d8f A |
1231 | log_err("U8_APPEND did not generate the expected output\n"); |
1232 | } | |
1233 | } | |
1234 | ||
73c04bcf A |
1235 | static void |
1236 | TestSurrogates() { | |
1237 | static const uint8_t b[]={ | |
1238 | 0xc3, 0x9f, /* 00DF */ | |
1239 | 0xed, 0x9f, 0xbf, /* D7FF */ | |
1240 | 0xed, 0xa0, 0x81, /* D801 */ | |
1241 | 0xed, 0xbf, 0xbe, /* DFFE */ | |
1242 | 0xee, 0x80, 0x80, /* E000 */ | |
1243 | 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */ | |
1244 | }; | |
1245 | static const UChar32 cp[]={ | |
1246 | 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe | |
1247 | }; | |
1248 | ||
1249 | UChar32 cu, cs, cl; | |
1250 | int32_t i, j, k, iu, is, il, length; | |
1251 | ||
1252 | k=0; /* index into cp[] */ | |
b331163b | 1253 | length=UPRV_LENGTHOF(b); |
73c04bcf A |
1254 | for(i=0; i<length;) { |
1255 | j=i; | |
1256 | U8_NEXT_UNSAFE(b, j, cu); | |
1257 | iu=j; | |
1258 | ||
1259 | j=i; | |
1260 | U8_NEXT(b, j, length, cs); | |
1261 | is=j; | |
1262 | ||
1263 | j=i; | |
1264 | L8_NEXT(b, j, length, cl); | |
1265 | il=j; | |
1266 | ||
1267 | if(cu!=cp[k]) { | |
1268 | log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]); | |
1269 | } | |
1270 | ||
1271 | /* U8_NEXT() returns <0 for surrogate code points */ | |
1272 | if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) { | |
1273 | log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu); | |
1274 | } | |
1275 | ||
1276 | /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */ | |
1277 | if(cl!=cu) { | |
1278 | log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu); | |
1279 | } | |
1280 | ||
0f5d89e8 A |
1281 | // U8_NEXT() skips only the first byte of a surrogate byte sequence. |
1282 | if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) { | |
1283 | log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i); | |
1284 | } | |
1285 | if(il!=iu) { | |
1286 | log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i); | |
73c04bcf A |
1287 | } |
1288 | ||
1289 | ++k; /* next code point */ | |
1290 | i=iu; /* advance by one UTF-8 sequence */ | |
1291 | } | |
1292 | ||
1293 | while(i>0) { | |
1294 | --k; /* previous code point */ | |
1295 | ||
1296 | j=i; | |
1297 | U8_PREV_UNSAFE(b, j, cu); | |
1298 | iu=j; | |
1299 | ||
1300 | j=i; | |
1301 | U8_PREV(b, 0, j, cs); | |
1302 | is=j; | |
1303 | ||
1304 | j=i; | |
1305 | L8_PREV(b, 0, j, cl); | |
1306 | il=j; | |
1307 | ||
1308 | if(cu!=cp[k]) { | |
1309 | log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]); | |
1310 | } | |
1311 | ||
1312 | /* U8_PREV() returns <0 for surrogate code points */ | |
1313 | if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) { | |
1314 | log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu); | |
1315 | } | |
1316 | ||
1317 | /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */ | |
1318 | if(cl!=cu) { | |
1319 | log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu); | |
1320 | } | |
1321 | ||
0f5d89e8 A |
1322 | // U8_PREV() skips only the last byte of a surrogate byte sequence. |
1323 | if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) { | |
1324 | log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i); | |
1325 | } | |
1326 | if(il !=iu) { | |
1327 | log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i); | |
73c04bcf A |
1328 | } |
1329 | ||
1330 | i=iu; /* go back by one UTF-8 sequence */ | |
1331 | } | |
1332 | } |