]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/convtest.cpp
ICU-59131.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / convtest.cpp
... / ...
CommitLineData
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2003-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: convtest.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2003jul15
16* created by: Markus W. Scherer
17*
18* Test file for data-driven conversion tests.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_LEGACY_CONVERSION
/*
 * Note: Turning off all of convtest.cpp when UCONFIG_NO_LEGACY_CONVERSION is set
 * is slightly unnecessary - it removes tests for Unicode charsets
 * like UTF-8 that should work.
 * However, there is no easy way for the test to detect whether a test case
 * is for a Unicode charset, so it would be difficult to only exclude those.
 * Also, regular testing of ICU is done with all modules on, therefore
 * not testing conversion for a custom configuration like this should be ok.
 */
33
34#include "unicode/ucnv.h"
35#include "unicode/unistr.h"
36#include "unicode/parsepos.h"
37#include "unicode/uniset.h"
38#include "unicode/ustring.h"
39#include "unicode/ures.h"
40#include "convtest.h"
41#include "cmemory.h"
42#include "unicode/tstdtmod.h"
43#include <string.h>
44#include <stdlib.h>
45
// First character of a "callback" string in the conversion test data.
// The data-driven tests switch on it to pick the converter callback to install.
enum {
    SUB_CB='?',     // selects the SUBSTITUTE callback
    SKIP_CB='0',    // selects the SKIP callback
    STOP_CB='.',    // selects the STOP callback
    ESC_CB='&'      // selects the ESCAPE callback
};
53
54ConversionTest::ConversionTest() {
55 UErrorCode errorCode=U_ZERO_ERROR;
56 utf8Cnv=ucnv_open("UTF-8", &errorCode);
57 ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
58 if(U_FAILURE(errorCode)) {
59 errln("unable to open UTF-8 converter");
60 }
61}
62
63ConversionTest::~ConversionTest() {
64 ucnv_close(utf8Cnv);
65}
66
67void
68ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
69 if (exec) logln("TestSuite ConversionTest: ");
70 switch (index) {
71#if !UCONFIG_NO_FILE_IO
72 case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
73 case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
74 case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
75 case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break;
76#else
77 case 0:
78 case 1:
79 case 2:
80 case 3: name="skip"; break;
81#endif
82 case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
83 default: name=""; break; //needed to end loop
84 }
85}
86
87// test data interface ----------------------------------------------------- ***
88
89void
90ConversionTest::TestToUnicode() {
91 ConversionCase cc;
92 char charset[100], cbopt[4];
93 const char *option;
94 UnicodeString s, unicode;
95 int32_t offsetsLength;
96 UConverterToUCallback callback;
97
98 TestDataModule *dataModule;
99 TestData *testData;
100 const DataMap *testCase;
101 UErrorCode errorCode;
102 int32_t i;
103
104 errorCode=U_ZERO_ERROR;
105 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
106 if(U_SUCCESS(errorCode)) {
107 testData=dataModule->createTestData("toUnicode", errorCode);
108 if(U_SUCCESS(errorCode)) {
109 for(i=0; testData->nextCase(testCase, errorCode); ++i) {
110 if(U_FAILURE(errorCode)) {
111 errln("error retrieving conversion/toUnicode test case %d - %s",
112 i, u_errorName(errorCode));
113 errorCode=U_ZERO_ERROR;
114 continue;
115 }
116
117 cc.caseNr=i;
118
119 s=testCase->getString("charset", errorCode);
120 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
121 cc.charset=charset;
122
123 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
124 unicode=testCase->getString("unicode", errorCode);
125 cc.unicode=unicode.getBuffer();
126 cc.unicodeLength=unicode.length();
127
128 offsetsLength=0;
129 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
130 if(offsetsLength==0) {
131 cc.offsets=NULL;
132 } else if(offsetsLength!=unicode.length()) {
133 errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length",
134 i, unicode.length(), offsetsLength);
135 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
136 }
137
138 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
139 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
140
141 s=testCase->getString("errorCode", errorCode);
142 if(s==UNICODE_STRING("invalid", 7)) {
143 cc.outErrorCode=U_INVALID_CHAR_FOUND;
144 } else if(s==UNICODE_STRING("illegal", 7)) {
145 cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
146 } else if(s==UNICODE_STRING("truncated", 9)) {
147 cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
148 } else if(s==UNICODE_STRING("illesc", 6)) {
149 cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
150 } else if(s==UNICODE_STRING("unsuppesc", 9)) {
151 cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE;
152 } else {
153 cc.outErrorCode=U_ZERO_ERROR;
154 }
155
156 s=testCase->getString("callback", errorCode);
157 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
158 cc.cbopt=cbopt;
159 switch(cbopt[0]) {
160 case SUB_CB:
161 callback=UCNV_TO_U_CALLBACK_SUBSTITUTE;
162 break;
163 case SKIP_CB:
164 callback=UCNV_TO_U_CALLBACK_SKIP;
165 break;
166 case STOP_CB:
167 callback=UCNV_TO_U_CALLBACK_STOP;
168 break;
169 case ESC_CB:
170 callback=UCNV_TO_U_CALLBACK_ESCAPE;
171 break;
172 default:
173 callback=NULL;
174 break;
175 }
176 option=callback==NULL ? cbopt : cbopt+1;
177 if(*option==0) {
178 option=NULL;
179 }
180
181 cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode);
182
183 if(U_FAILURE(errorCode)) {
184 errln("error parsing conversion/toUnicode test case %d - %s",
185 i, u_errorName(errorCode));
186 errorCode=U_ZERO_ERROR;
187 } else {
188 logln("TestToUnicode[%d] %s", i, charset);
189 ToUnicodeCase(cc, callback, option);
190 }
191 }
192 delete testData;
193 }
194 delete dataModule;
195 }
196 else {
197 dataerrln("Could not load test conversion data");
198 }
199}
200
201void
202ConversionTest::TestFromUnicode() {
203 ConversionCase cc;
204 char charset[100], cbopt[4];
205 const char *option;
206 UnicodeString s, unicode, invalidUChars;
207 int32_t offsetsLength, index;
208 UConverterFromUCallback callback;
209
210 TestDataModule *dataModule;
211 TestData *testData;
212 const DataMap *testCase;
213 const UChar *p;
214 UErrorCode errorCode;
215 int32_t i, length;
216
217 errorCode=U_ZERO_ERROR;
218 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
219 if(U_SUCCESS(errorCode)) {
220 testData=dataModule->createTestData("fromUnicode", errorCode);
221 if(U_SUCCESS(errorCode)) {
222 for(i=0; testData->nextCase(testCase, errorCode); ++i) {
223 if(U_FAILURE(errorCode)) {
224 errln("error retrieving conversion/fromUnicode test case %d - %s",
225 i, u_errorName(errorCode));
226 errorCode=U_ZERO_ERROR;
227 continue;
228 }
229
230 cc.caseNr=i;
231
232 s=testCase->getString("charset", errorCode);
233 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
234 cc.charset=charset;
235
236 unicode=testCase->getString("unicode", errorCode);
237 cc.unicode=unicode.getBuffer();
238 cc.unicodeLength=unicode.length();
239 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
240
241 offsetsLength=0;
242 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
243 if(offsetsLength==0) {
244 cc.offsets=NULL;
245 } else if(offsetsLength!=cc.bytesLength) {
246 errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length",
247 i, cc.bytesLength, offsetsLength);
248 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
249 }
250
251 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
252 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
253
254 s=testCase->getString("errorCode", errorCode);
255 if(s==UNICODE_STRING("invalid", 7)) {
256 cc.outErrorCode=U_INVALID_CHAR_FOUND;
257 } else if(s==UNICODE_STRING("illegal", 7)) {
258 cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
259 } else if(s==UNICODE_STRING("truncated", 9)) {
260 cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
261 } else {
262 cc.outErrorCode=U_ZERO_ERROR;
263 }
264
265 s=testCase->getString("callback", errorCode);
266 cc.setSub=0; // default: no subchar
267
268 if((index=s.indexOf((UChar)0))>0) {
269 // read NUL-separated subchar first, if any
270 // copy the subchar from Latin-1 characters
271 // start after the NUL
272 p=s.getTerminatedBuffer();
273 length=index+1;
274 p+=length;
275 length=s.length()-length;
276 if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) {
277 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
278 } else {
279 int32_t j;
280
281 for(j=0; j<length; ++j) {
282 cc.subchar[j]=(char)p[j];
283 }
284 // NUL-terminate the subchar
285 cc.subchar[j]=0;
286 cc.setSub=1;
287 }
288
289 // remove the NUL and subchar from s
290 s.truncate(index);
291 } else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
292 // read a substitution string, separated by an equal sign
293 p=s.getBuffer()+index+1;
294 length=s.length()-(index+1);
295 if(length<0 || length>=UPRV_LENGTHOF(cc.subString)) {
296 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
297 } else {
298 u_memcpy(cc.subString, p, length);
299 // NUL-terminate the subString
300 cc.subString[length]=0;
301 cc.setSub=-1;
302 }
303
304 // remove the equal sign and subString from s
305 s.truncate(index);
306 }
307
308 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
309 cc.cbopt=cbopt;
310 switch(cbopt[0]) {
311 case SUB_CB:
312 callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE;
313 break;
314 case SKIP_CB:
315 callback=UCNV_FROM_U_CALLBACK_SKIP;
316 break;
317 case STOP_CB:
318 callback=UCNV_FROM_U_CALLBACK_STOP;
319 break;
320 case ESC_CB:
321 callback=UCNV_FROM_U_CALLBACK_ESCAPE;
322 break;
323 default:
324 callback=NULL;
325 break;
326 }
327 option=callback==NULL ? cbopt : cbopt+1;
328 if(*option==0) {
329 option=NULL;
330 }
331
332 invalidUChars=testCase->getString("invalidUChars", errorCode);
333 cc.invalidUChars=invalidUChars.getBuffer();
334 cc.invalidLength=invalidUChars.length();
335
336 if(U_FAILURE(errorCode)) {
337 errln("error parsing conversion/fromUnicode test case %d - %s",
338 i, u_errorName(errorCode));
339 errorCode=U_ZERO_ERROR;
340 } else {
341 logln("TestFromUnicode[%d] %s", i, charset);
342 FromUnicodeCase(cc, callback, option);
343 }
344 }
345 delete testData;
346 }
347 delete dataModule;
348 }
349 else {
350 dataerrln("Could not load test conversion data");
351 }
352}
353
354static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e };
355
356void
357ConversionTest::TestGetUnicodeSet() {
358 char charset[100];
359 UnicodeString s, map, mapnot;
360 int32_t which;
361
362 ParsePosition pos;
363 UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
364 UnicodeSet *cnvSetPtr = &cnvSet;
365 LocalUConverterPointer cnv;
366
367 TestDataModule *dataModule;
368 TestData *testData;
369 const DataMap *testCase;
370 UErrorCode errorCode;
371 int32_t i;
372
373 errorCode=U_ZERO_ERROR;
374 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
375 if(U_SUCCESS(errorCode)) {
376 testData=dataModule->createTestData("getUnicodeSet", errorCode);
377 if(U_SUCCESS(errorCode)) {
378 for(i=0; testData->nextCase(testCase, errorCode); ++i) {
379 if(U_FAILURE(errorCode)) {
380 errln("error retrieving conversion/getUnicodeSet test case %d - %s",
381 i, u_errorName(errorCode));
382 errorCode=U_ZERO_ERROR;
383 continue;
384 }
385
386 s=testCase->getString("charset", errorCode);
387 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
388
389 map=testCase->getString("map", errorCode);
390 mapnot=testCase->getString("mapnot", errorCode);
391
392 which=testCase->getInt28("which", errorCode);
393
394 if(U_FAILURE(errorCode)) {
395 errln("error parsing conversion/getUnicodeSet test case %d - %s",
396 i, u_errorName(errorCode));
397 errorCode=U_ZERO_ERROR;
398 continue;
399 }
400
401 // test this test case
402 mapSet.clear();
403 mapnotSet.clear();
404
405 pos.setIndex(0);
406 mapSet.applyPattern(map, pos, 0, NULL, errorCode);
407 if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
408 errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
409 " error index %d index %d U+%04x",
410 i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
411 errorCode=U_ZERO_ERROR;
412 continue;
413 }
414
415 pos.setIndex(0);
416 mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
417 if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
418 errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
419 " error index %d index %d U+%04x",
420 i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
421 errorCode=U_ZERO_ERROR;
422 continue;
423 }
424
425 logln("TestGetUnicodeSet[%d] %s", i, charset);
426
427 cnv.adoptInstead(cnv_open(charset, errorCode));
428 if(U_FAILURE(errorCode)) {
429 errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
430 charset, i, u_errorName(errorCode));
431 errorCode=U_ZERO_ERROR;
432 continue;
433 }
434
435 ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode);
436
437 if(U_FAILURE(errorCode)) {
438 errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
439 charset, i, u_errorName(errorCode));
440 errorCode=U_ZERO_ERROR;
441 continue;
442 }
443
444 // are there items that must be in cnvSet but are not?
445 (diffSet=mapSet).removeAll(cnvSet);
446 if(!diffSet.isEmpty()) {
447 diffSet.toPattern(s, TRUE);
448 if(s.length()>100) {
449 s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
450 }
451 errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
452 charset, i);
453 errln(s);
454 }
455
456 // are there items that must not be in cnvSet but are?
457 (diffSet=mapnotSet).retainAll(cnvSet);
458 if(!diffSet.isEmpty()) {
459 diffSet.toPattern(s, TRUE);
460 if(s.length()>100) {
461 s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
462 }
463 errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
464 charset, i);
465 errln(s);
466 }
467 }
468 delete testData;
469 }
470 delete dataModule;
471 }
472 else {
473 dataerrln("Could not load test conversion data");
474 }
475}
476
477U_CDECL_BEGIN
478static void U_CALLCONV
479getUnicodeSetCallback(const void *context,
480 UConverterFromUnicodeArgs * /*fromUArgs*/,
481 const UChar* /*codeUnits*/,
482 int32_t /*length*/,
483 UChar32 codePoint,
484 UConverterCallbackReason reason,
485 UErrorCode *pErrorCode) {
486 if(reason<=UCNV_IRREGULAR) {
487 ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point
488 *pErrorCode=U_ZERO_ERROR; // skip
489 } // else ignore the reset, close and clone calls.
490}
491U_CDECL_END
492
493// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
494void
495ConversionTest::TestGetUnicodeSet2() {
496 // Build a string with all code points.
497 UChar32 cpLimit;
498 int32_t s0Length;
499 if(quick) {
500 cpLimit=s0Length=0x10000; // BMP only
501 } else {
502 cpLimit=0x110000;
503 s0Length=0x10000+0x200000; // BMP + surrogate pairs
504 }
505 UChar *s0=new UChar[s0Length];
506 if(s0==NULL) {
507 return;
508 }
509 UChar *s=s0;
510 UChar32 c;
511 UChar c2;
512 // low BMP
513 for(c=0; c<=0xd7ff; ++c) {
514 *s++=(UChar)c;
515 }
516 // trail surrogates
517 for(c=0xdc00; c<=0xdfff; ++c) {
518 *s++=(UChar)c;
519 }
520 // lead surrogates
521 // (after trails so that there is not even one surrogate pair in between)
522 for(c=0xd800; c<=0xdbff; ++c) {
523 *s++=(UChar)c;
524 }
525 // high BMP
526 for(c=0xe000; c<=0xffff; ++c) {
527 *s++=(UChar)c;
528 }
529 // supplementary code points = surrogate pairs
530 if(cpLimit==0x110000) {
531 for(c=0xd800; c<=0xdbff; ++c) {
532 for(c2=0xdc00; c2<=0xdfff; ++c2) {
533 *s++=(UChar)c;
534 *s++=c2;
535 }
536 }
537 }
538
539 static const char *const cnvNames[]={
540 "UTF-8",
541 "UTF-7",
542 "UTF-16",
543 "US-ASCII",
544 "ISO-8859-1",
545 "windows-1252",
546 "Shift-JIS",
547 "ibm-1390", // EBCDIC_STATEFUL table
548 "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
549 "HZ",
550 "ISO-2022-JP",
551 "JIS7",
552 "ISO-2022-CN",
553 "ISO-2022-CN-EXT",
554 "LMBCS"
555 };
556 LocalUConverterPointer cnv;
557 char buffer[1024];
558 int32_t i;
559 for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) {
560 UErrorCode errorCode=U_ZERO_ERROR;
561 cnv.adoptInstead(cnv_open(cnvNames[i], errorCode));
562 if(U_FAILURE(errorCode)) {
563 errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
564 continue;
565 }
566 UnicodeSet expected;
567 ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
568 if(U_FAILURE(errorCode)) {
569 errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
570 continue;
571 }
572 UConverterUnicodeSet which;
573 for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
574 if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
575 ucnv_setFallback(cnv.getAlias(), TRUE);
576 }
577 expected.add(0, cpLimit-1);
578 s=s0;
579 UBool flush;
580 do {
581 char *t=buffer;
582 flush=(UBool)(s==s0+s0Length);
583 ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
584 if(U_FAILURE(errorCode)) {
585 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
586 errorCode=U_ZERO_ERROR;
587 continue;
588 } else {
589 break; // unexpected error, should not occur
590 }
591 }
592 } while(!flush);
593 UnicodeSet set;
594 ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode);
595 if(cpLimit<0x110000) {
596 set.remove(cpLimit, 0x10ffff);
597 }
598 if(which==UCNV_ROUNDTRIP_SET) {
599 // ignore PUA code points because they will be converted even if they
600 // are fallbacks and when other fallbacks are turned off,
601 // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
602 expected.remove(0xe000, 0xf8ff);
603 expected.remove(0xf0000, 0xffffd);
604 expected.remove(0x100000, 0x10fffd);
605 set.remove(0xe000, 0xf8ff);
606 set.remove(0xf0000, 0xffffd);
607 set.remove(0x100000, 0x10fffd);
608 }
609 if(set!=expected) {
610 // First try to see if we have different sets because ucnv_getUnicodeSet()
611 // added strings: The above conversion method does not tell us what strings might be convertible.
612 // Remove strings from the set and compare again.
613 // Unfortunately, there are no good, direct set methods for finding out whether there are strings
614 // in the set, nor for enumerating or removing just them.
615 // Intersect all code points with the set. The intersection will not contain strings.
616 UnicodeSet temp(0, 0x10ffff);
617 temp.retainAll(set);
618 set=temp;
619 }
620 if(set!=expected) {
621 UnicodeSet diffSet;
622 UnicodeString out;
623
624 // are there items that must be in the set but are not?
625 (diffSet=expected).removeAll(set);
626 if(!diffSet.isEmpty()) {
627 diffSet.toPattern(out, TRUE);
628 if(out.length()>100) {
629 out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
630 }
631 errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
632 cnvNames[i], which);
633 errln(out);
634 }
635
636 // are there items that must not be in the set but are?
637 (diffSet=set).removeAll(expected);
638 if(!diffSet.isEmpty()) {
639 diffSet.toPattern(out, TRUE);
640 if(out.length()>100) {
641 out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
642 }
643 errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
644 cnvNames[i], which);
645 errln(out);
646 }
647 }
648 }
649 }
650
651 delete [] s0;
652}
653
654// Test all codepoints which has the default ignorable Unicode property are ignored if they have no mapping
655// If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated
656void
657ConversionTest::TestDefaultIgnorableCallback() {
658 UErrorCode status = U_ZERO_ERROR;
659 const char *cnv_name = "euc-jp-2007";
660 const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]";
661 const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]";
662
663 UnicodeSet *set_ignorable = new UnicodeSet(pattern_ignorable, status);
664 if (U_FAILURE(status)) {
665 dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status));
666 return;
667 }
668
669 UnicodeSet *set_not_ignorable = new UnicodeSet(pattern_not_ignorable, status);
670 if (U_FAILURE(status)) {
671 dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status));
672 return;
673 }
674
675 UConverter *cnv = cnv_open(cnv_name, status);
676 if (U_FAILURE(status)) {
677 dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status));
678 return;
679 }
680
681 // set callback for the converter
682 ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
683
684 UChar32 input[1];
685 char output[10];
686 int32_t outputLength;
687
688 // test default ignorables are ignored
689 int size = set_ignorable->size();
690 for (int i = 0; i < size; i++) {
691 status = U_ZERO_ERROR;
692 outputLength= 0;
693
694 input[0] = set_ignorable->charAt(i);
695
696 outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
697 if (U_FAILURE(status) || outputLength != 0) {
698 errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status));
699 }
700 }
701
702 // test non-ignorables are not ignored
703 size = set_not_ignorable->size();
704 for (int i = 0; i < size; i++) {
705 status = U_ZERO_ERROR;
706 outputLength= 0;
707
708 input[0] = set_not_ignorable->charAt(i);
709
710 if (input[0] == 0) {
711 continue;
712 }
713
714 outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
715 if (U_FAILURE(status) || outputLength <= 0) {
716 errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status));
717 }
718 }
719
720 ucnv_close(cnv);
721 delete set_not_ignorable;
722 delete set_ignorable;
723}
724
725// open testdata or ICU data converter ------------------------------------- ***
726
727UConverter *
728ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
729 if(name!=NULL && *name=='+') {
730 // Converter names that start with '+' are ignored in ICU4J tests.
731 ++name;
732 }
733 if(name!=NULL && *name=='*') {
734 /* loadTestData(): set the data directory */
735 return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
736 } else {
737 return ucnv_open(name, &errorCode);
738 }
739}
740
741// output helpers ---------------------------------------------------------- ***
742
// Returns the lowercase hexadecimal digit character for a value 0..15.
static inline char
hexDigit(uint8_t digit) {
    if(digit<10) {
        return (char)('0'+digit);
    }
    return (char)('a'+(digit-10));
}
747
// Writes the bytes as two-digit lowercase hex numbers separated by single spaces,
// NUL-terminates the output and returns the pointer just past the NUL.
// A non-positive length writes only the NUL.
static char *
printBytes(const uint8_t *bytes, int32_t length, char *out) {
    static const char digits[]="0123456789abcdef";

    for(int32_t i=0; i<length; ++i) {
        if(i>0) {
            *out++=' ';
        }
        const uint8_t b=bytes[i];
        *out++=digits[b>>4];
        *out++=digits[b&0xf];
    }
    *out++=0;
    return out;
}
769
770static char *
771printUnicode(const UChar *unicode, int32_t length, char *out) {
772 UChar32 c;
773 int32_t i;
774
775 for(i=0; i<length;) {
776 if(i>0) {
777 *out++=' ';
778 }
779 U16_NEXT(unicode, i, length, c);
780 // write 4..6 digits
781 if(c>=0x100000) {
782 *out++='1';
783 }
784 if(c>=0x10000) {
785 *out++=hexDigit((uint8_t)((c>>16)&0xf));
786 }
787 *out++=hexDigit((uint8_t)((c>>12)&0xf));
788 *out++=hexDigit((uint8_t)((c>>8)&0xf));
789 *out++=hexDigit((uint8_t)((c>>4)&0xf));
790 *out++=hexDigit((uint8_t)(c&0xf));
791 }
792 *out++=0;
793 return out;
794}
795
// Writes each offset as exactly two characters, separated by single spaces,
// NUL-terminates the output and returns the pointer just past the NUL.
// Values outside -9..99 are shown as "-x" (too small) or "xx" (too large).
// A NULL offsets pointer is treated like an empty array.
static char *
printOffsets(const int32_t *offsets, int32_t length, char *out) {
    const int32_t count= offsets!=NULL ? length : 0;

    for(int32_t idx=0; idx<count; ++idx) {
        if(idx!=0) {
            *out++=' ';
        }

        const int32_t value=offsets[idx];
        char first, second;
        if(value>99) {
            first='x';
            second='x';
        } else if(value>=10) {
            first=(char)('0'+value/10);
            second=(char)('0'+value%10);
        } else if(value>=0) {
            // single digit: pad with a space
            first=' ';
            second=(char)('0'+value);
        } else if(value>=-9) {
            first='-';
            second=(char)('0'-value);
        } else /* value<-9 */ {
            first='-';
            second='x';
        }
        *out++=first;
        *out++=second;
    }
    *out++=0;
    return out;
}
828
829// toUnicode test worker functions ----------------------------------------- ***
830
831static int32_t
832stepToUnicode(ConversionCase &cc, UConverter *cnv,
833 UChar *result, int32_t resultCapacity,
834 int32_t *resultOffsets, /* also resultCapacity */
835 int32_t step,
836 UErrorCode *pErrorCode) {
837 const char *source, *sourceLimit, *bytesLimit;
838 UChar *target, *targetLimit, *resultLimit;
839 UBool flush;
840
841 source=(const char *)cc.bytes;
842 target=result;
843 bytesLimit=source+cc.bytesLength;
844 resultLimit=result+resultCapacity;
845
846 if(step>=0) {
847 // call ucnv_toUnicode() with in/out buffers no larger than (step) at a time
848 // move only one buffer (in vs. out) at a time to be extra mean
849 // step==0 performs bulk conversion and generates offsets
850
851 // initialize the partial limits for the loop
852 if(step==0) {
853 // use the entire buffers
854 sourceLimit=bytesLimit;
855 targetLimit=resultLimit;
856 flush=cc.finalFlush;
857 } else {
858 // start with empty partial buffers
859 sourceLimit=source;
860 targetLimit=target;
861 flush=FALSE;
862
863 // output offsets only for bulk conversion
864 resultOffsets=NULL;
865 }
866
867 for(;;) {
868 // resetting the opposite conversion direction must not affect this one
869 ucnv_resetFromUnicode(cnv);
870
871 // convert
872 ucnv_toUnicode(cnv,
873 &target, targetLimit,
874 &source, sourceLimit,
875 resultOffsets,
876 flush, pErrorCode);
877
878 // check pointers and errors
879 if(source>sourceLimit || target>targetLimit) {
880 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
881 break;
882 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
883 if(target!=targetLimit) {
884 // buffer overflow must only be set when the target is filled
885 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
886 break;
887 } else if(targetLimit==resultLimit) {
888 // not just a partial overflow
889 break;
890 }
891
892 // the partial target is filled, set a new limit, reset the error and continue
893 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
894 *pErrorCode=U_ZERO_ERROR;
895 } else if(U_FAILURE(*pErrorCode)) {
896 // some other error occurred, done
897 break;
898 } else {
899 if(source!=sourceLimit) {
900 // when no error occurs, then the input must be consumed
901 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
902 break;
903 }
904
905 if(sourceLimit==bytesLimit) {
906 // we are done
907 break;
908 }
909
910 // the partial conversion succeeded, set a new limit and continue
911 sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit;
912 flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit);
913 }
914 }
915 } else /* step<0 */ {
916 /*
917 * step==-1: call only ucnv_getNextUChar()
918 * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
919 * if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
920 * else give it at most (-step-2)/2 bytes
921 */
922 UChar32 c;
923
924 // end the loop by getting an index out of bounds error
925 for(;;) {
926 // resetting the opposite conversion direction must not affect this one
927 ucnv_resetFromUnicode(cnv);
928
929 // convert
930 if((step&1)!=0 /* odd: -1, -3, -5, ... */) {
931 sourceLimit=source; // use sourceLimit not as a real limit
932 // but to remember the pre-getNextUChar source pointer
933 c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode);
934
935 // check pointers and errors
936 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
937 if(source!=bytesLimit) {
938 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
939 } else {
940 *pErrorCode=U_ZERO_ERROR;
941 }
942 break;
943 } else if(U_FAILURE(*pErrorCode)) {
944 break;
945 }
946 // source may not move if c is from previous overflow
947
948 if(target==resultLimit) {
949 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
950 break;
951 }
952 if(c<=0xffff) {
953 *target++=(UChar)c;
954 } else {
955 *target++=U16_LEAD(c);
956 if(target==resultLimit) {
957 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
958 break;
959 }
960 *target++=U16_TRAIL(c);
961 }
962
963 // alternate between -n-1 and -n but leave -1 alone
964 if(step<-1) {
965 ++step;
966 }
967 } else /* step is even */ {
968 // allow only one UChar output
969 targetLimit=target<resultLimit ? target+1 : resultLimit;
970
971 // as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit)
972 // and never output offsets
973 if(step==-2) {
974 sourceLimit=bytesLimit;
975 } else {
976 sourceLimit=source+(-step-2)/2;
977 if(sourceLimit>bytesLimit) {
978 sourceLimit=bytesLimit;
979 }
980 }
981
982 ucnv_toUnicode(cnv,
983 &target, targetLimit,
984 &source, sourceLimit,
985 NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode);
986
987 // check pointers and errors
988 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
989 if(target!=targetLimit) {
990 // buffer overflow must only be set when the target is filled
991 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
992 break;
993 } else if(targetLimit==resultLimit) {
994 // not just a partial overflow
995 break;
996 }
997
998 // the partial target is filled, set a new limit and continue
999 *pErrorCode=U_ZERO_ERROR;
1000 } else if(U_FAILURE(*pErrorCode)) {
1001 // some other error occurred, done
1002 break;
1003 } else {
1004 if(source!=sourceLimit) {
1005 // when no error occurs, then the input must be consumed
1006 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1007 break;
1008 }
1009
1010 // we are done (flush==TRUE) but we continue, to get the index out of bounds error above
1011 }
1012
1013 --step;
1014 }
1015 }
1016 }
1017
1018 return (int32_t)(target-result);
1019}
1020
1021UBool
1022ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) {
1023 // open the converter
1024 IcuTestErrorCode errorCode(*this, "ToUnicodeCase");
1025 LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode));
1026 // with no data, the above crashes with "pointer being freed was not allocated" for charset "x11-compound-text", see #13078
1027 if(errorCode.isFailure()) {
1028 errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1029 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName());
1030 errorCode.reset();
1031 return FALSE;
1032 }
1033
1034 // set the callback
1035 if(callback!=NULL) {
1036 ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode);
1037 if(U_FAILURE(errorCode)) {
1038 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s",
1039 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1040 return FALSE;
1041 }
1042 }
1043
1044 int32_t resultOffsets[256];
1045 UChar result[256];
1046 int32_t resultLength;
1047 UBool ok;
1048
1049 static const struct {
1050 int32_t step;
1051 const char *name;
1052 } steps[]={
1053 { 0, "bulk" }, // must be first for offsets to be checked
1054 { 1, "step=1" },
1055 { 3, "step=3" },
1056 { 7, "step=7" },
1057 { -1, "getNext" },
1058 { -2, "toU(bulk)+getNext" },
1059 { -3, "getNext+toU(bulk)" },
1060 { -4, "toU(1)+getNext" },
1061 { -5, "getNext+toU(1)" },
1062 { -12, "toU(5)+getNext" },
1063 { -13, "getNext+toU(5)" },
1064 };
1065 int32_t i, step;
1066
1067 ok=TRUE;
1068 for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1069 step=steps[i].step;
1070 if(step<0 && !cc.finalFlush) {
1071 // skip ucnv_getNextUChar() if !finalFlush because
1072 // ucnv_getNextUChar() always implies flush
1073 continue;
1074 }
1075 if(step!=0) {
1076 // bulk test is first, then offsets are not checked any more
1077 cc.offsets=NULL;
1078 }
1079 else {
1080 memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1081 }
1082 memset(result, -1, UPRV_LENGTHOF(result));
1083 errorCode.reset();
1084 resultLength=stepToUnicode(cc, cnv.getAlias(),
1085 result, UPRV_LENGTHOF(result),
1086 step==0 ? resultOffsets : NULL,
1087 step, errorCode);
1088 ok=checkToUnicode(
1089 cc, cnv.getAlias(), steps[i].name,
1090 result, resultLength,
1091 cc.offsets!=NULL ? resultOffsets : NULL,
1092 errorCode);
1093 if(errorCode.isFailure() || !cc.finalFlush) {
1094 // reset if an error occurred or we did not flush
1095 // otherwise do nothing to make sure that flushing resets
1096 ucnv_resetToUnicode(cnv.getAlias());
1097 }
1098 if (cc.offsets != NULL && resultOffsets[resultLength] != -1) {
1099 errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1100 cc.caseNr, cc.charset, resultLength);
1101 }
1102 if (result[resultLength] != (UChar)-1) {
1103 errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d",
1104 cc.caseNr, cc.charset, resultLength);
1105 }
1106 }
1107
1108 // not a real loop, just a convenience for breaking out of the block
1109 while(ok && cc.finalFlush) {
1110 // test ucnv_toUChars()
1111 memset(result, 0, sizeof(result));
1112
1113 errorCode.reset();
1114 resultLength=ucnv_toUChars(cnv.getAlias(),
1115 result, UPRV_LENGTHOF(result),
1116 (const char *)cc.bytes, cc.bytesLength,
1117 errorCode);
1118 ok=checkToUnicode(
1119 cc, cnv.getAlias(), "toUChars",
1120 result, resultLength,
1121 NULL,
1122 errorCode);
1123 if(!ok) {
1124 break;
1125 }
1126
1127 // test preflighting
1128 // keep the correct result for simple checking
1129 errorCode.reset();
1130 resultLength=ucnv_toUChars(cnv.getAlias(),
1131 NULL, 0,
1132 (const char *)cc.bytes, cc.bytesLength,
1133 errorCode);
1134 if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING || errorCode.get()==U_BUFFER_OVERFLOW_ERROR) {
1135 errorCode.reset();
1136 }
1137 ok=checkToUnicode(
1138 cc, cnv.getAlias(), "preflight toUChars",
1139 result, resultLength,
1140 NULL,
1141 errorCode);
1142 break;
1143 }
1144
1145 errorCode.reset(); // all errors have already been reported
1146 return ok;
1147}
1148
1149UBool
1150ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1151 const UChar *result, int32_t resultLength,
1152 const int32_t *resultOffsets,
1153 UErrorCode resultErrorCode) {
1154 char resultInvalidChars[8];
1155 int8_t resultInvalidLength;
1156 UErrorCode errorCode;
1157
1158 const char *msg;
1159
1160 // reset the message; NULL will mean "ok"
1161 msg=NULL;
1162
1163 errorCode=U_ZERO_ERROR;
1164 resultInvalidLength=sizeof(resultInvalidChars);
1165 ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode);
1166 if(U_FAILURE(errorCode)) {
1167 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s",
1168 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1169 return FALSE;
1170 }
1171
1172 // check everything that might have gone wrong
1173 if(cc.unicodeLength!=resultLength) {
1174 msg="wrong result length";
1175 } else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) {
1176 msg="wrong result string";
1177 } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLength*sizeof(*cc.offsets))) {
1178 msg="wrong offsets";
1179 } else if(cc.outErrorCode!=resultErrorCode) {
1180 msg="wrong error code";
1181 } else if(cc.invalidLength!=resultInvalidLength) {
1182 msg="wrong length of last invalid input";
1183 } else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) {
1184 msg="wrong last invalid input";
1185 }
1186
1187 if(msg==NULL) {
1188 return TRUE;
1189 } else {
1190 char buffer[2000]; // one buffer for all strings
1191 char *s, *bytesString, *unicodeString, *resultString,
1192 *offsetsString, *resultOffsetsString,
1193 *invalidCharsString, *resultInvalidCharsString;
1194
1195 bytesString=s=buffer;
1196 s=printBytes(cc.bytes, cc.bytesLength, bytesString);
1197 s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s);
1198 s=printUnicode(result, resultLength, resultString=s);
1199 s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s);
1200 s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1201 s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s);
1202 s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s);
1203
1204 if((s-buffer)>(int32_t)sizeof(buffer)) {
1205 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n",
1206 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1207 exit(1);
1208 }
1209
1210 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1211 " bytes <%s>[%d]\n"
1212 " expected <%s>[%d]\n"
1213 " result <%s>[%d]\n"
1214 " offsets <%s>\n"
1215 " result offsets <%s>\n"
1216 " error code expected %s got %s\n"
1217 " invalidChars expected <%s> got <%s>\n",
1218 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1219 bytesString, cc.bytesLength,
1220 unicodeString, cc.unicodeLength,
1221 resultString, resultLength,
1222 offsetsString,
1223 resultOffsetsString,
1224 u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1225 invalidCharsString, resultInvalidCharsString);
1226
1227 return FALSE;
1228 }
1229}
1230
1231// fromUnicode test worker functions --------------------------------------- ***
1232
1233static int32_t
1234stepFromUTF8(ConversionCase &cc,
1235 UConverter *utf8Cnv, UConverter *cnv,
1236 char *result, int32_t resultCapacity,
1237 int32_t step,
1238 UErrorCode *pErrorCode) {
1239 const char *source, *sourceLimit, *utf8Limit;
1240 UChar pivotBuffer[32];
1241 UChar *pivotSource, *pivotTarget, *pivotLimit;
1242 char *target, *targetLimit, *resultLimit;
1243 UBool flush;
1244
1245 source=cc.utf8;
1246 pivotSource=pivotTarget=pivotBuffer;
1247 target=result;
1248 utf8Limit=source+cc.utf8Length;
1249 resultLimit=result+resultCapacity;
1250
1251 // call ucnv_convertEx() with in/out buffers no larger than (step) at a time
1252 // move only one buffer (in vs. out) at a time to be extra mean
1253 // step==0 performs bulk conversion
1254
1255 // initialize the partial limits for the loop
1256 if(step==0) {
1257 // use the entire buffers
1258 sourceLimit=utf8Limit;
1259 targetLimit=resultLimit;
1260 flush=cc.finalFlush;
1261
1262 pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer);
1263 } else {
1264 // start with empty partial buffers
1265 sourceLimit=source;
1266 targetLimit=target;
1267 flush=FALSE;
1268
1269 // empty pivot is not allowed, make it of length step
1270 pivotLimit=pivotBuffer+step;
1271 }
1272
1273 for(;;) {
1274 // resetting the opposite conversion direction must not affect this one
1275 ucnv_resetFromUnicode(utf8Cnv);
1276 ucnv_resetToUnicode(cnv);
1277
1278 // convert
1279 ucnv_convertEx(cnv, utf8Cnv,
1280 &target, targetLimit,
1281 &source, sourceLimit,
1282 pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
1283 FALSE, flush, pErrorCode);
1284
1285 // check pointers and errors
1286 if(source>sourceLimit || target>targetLimit) {
1287 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1288 break;
1289 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1290 if(target!=targetLimit) {
1291 // buffer overflow must only be set when the target is filled
1292 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1293 break;
1294 } else if(targetLimit==resultLimit) {
1295 // not just a partial overflow
1296 break;
1297 }
1298
1299 // the partial target is filled, set a new limit, reset the error and continue
1300 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1301 *pErrorCode=U_ZERO_ERROR;
1302 } else if(U_FAILURE(*pErrorCode)) {
1303 if(pivotSource==pivotBuffer) {
1304 // toUnicode error, should not occur
1305 // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1306 break;
1307 } else {
1308 // fromUnicode error
1309 // some other error occurred, done
1310 break;
1311 }
1312 } else {
1313 if(source!=sourceLimit) {
1314 // when no error occurs, then the input must be consumed
1315 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1316 break;
1317 }
1318
1319 if(sourceLimit==utf8Limit) {
1320 // we are done
1321 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
1322 // ucnv_convertEx() warns about not terminating the output
1323 // but ucnv_fromUnicode() does not and so
1324 // checkFromUnicode() does not expect it
1325 *pErrorCode=U_ZERO_ERROR;
1326 }
1327 break;
1328 }
1329
1330 // the partial conversion succeeded, set a new limit and continue
1331 sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
1332 flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
1333 }
1334 }
1335
1336 return (int32_t)(target-result);
1337}
1338
1339static int32_t
1340stepFromUnicode(ConversionCase &cc, UConverter *cnv,
1341 char *result, int32_t resultCapacity,
1342 int32_t *resultOffsets, /* also resultCapacity */
1343 int32_t step,
1344 UErrorCode *pErrorCode) {
1345 const UChar *source, *sourceLimit, *unicodeLimit;
1346 char *target, *targetLimit, *resultLimit;
1347 UBool flush;
1348
1349 source=cc.unicode;
1350 target=result;
1351 unicodeLimit=source+cc.unicodeLength;
1352 resultLimit=result+resultCapacity;
1353
1354 // call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time
1355 // move only one buffer (in vs. out) at a time to be extra mean
1356 // step==0 performs bulk conversion and generates offsets
1357
1358 // initialize the partial limits for the loop
1359 if(step==0) {
1360 // use the entire buffers
1361 sourceLimit=unicodeLimit;
1362 targetLimit=resultLimit;
1363 flush=cc.finalFlush;
1364 } else {
1365 // start with empty partial buffers
1366 sourceLimit=source;
1367 targetLimit=target;
1368 flush=FALSE;
1369
1370 // output offsets only for bulk conversion
1371 resultOffsets=NULL;
1372 }
1373
1374 for(;;) {
1375 // resetting the opposite conversion direction must not affect this one
1376 ucnv_resetToUnicode(cnv);
1377
1378 // convert
1379 ucnv_fromUnicode(cnv,
1380 &target, targetLimit,
1381 &source, sourceLimit,
1382 resultOffsets,
1383 flush, pErrorCode);
1384
1385 // check pointers and errors
1386 if(source>sourceLimit || target>targetLimit) {
1387 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1388 break;
1389 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1390 if(target!=targetLimit) {
1391 // buffer overflow must only be set when the target is filled
1392 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1393 break;
1394 } else if(targetLimit==resultLimit) {
1395 // not just a partial overflow
1396 break;
1397 }
1398
1399 // the partial target is filled, set a new limit, reset the error and continue
1400 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1401 *pErrorCode=U_ZERO_ERROR;
1402 } else if(U_FAILURE(*pErrorCode)) {
1403 // some other error occurred, done
1404 break;
1405 } else {
1406 if(source!=sourceLimit) {
1407 // when no error occurs, then the input must be consumed
1408 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1409 break;
1410 }
1411
1412 if(sourceLimit==unicodeLimit) {
1413 // we are done
1414 break;
1415 }
1416
1417 // the partial conversion succeeded, set a new limit and continue
1418 sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit;
1419 flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit);
1420 }
1421 }
1422
1423 return (int32_t)(target-result);
1424}
1425
1426UBool
1427ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) {
1428 UConverter *cnv;
1429 UErrorCode errorCode;
1430
1431 // open the converter
1432 errorCode=U_ZERO_ERROR;
1433 cnv=cnv_open(cc.charset, errorCode);
1434 if(U_FAILURE(errorCode)) {
1435 errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1436 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1437 return FALSE;
1438 }
1439 ucnv_resetToUnicode(utf8Cnv);
1440
1441 // set the callback
1442 if(callback!=NULL) {
1443 ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
1444 if(U_FAILURE(errorCode)) {
1445 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s",
1446 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1447 ucnv_close(cnv);
1448 return FALSE;
1449 }
1450 }
1451
1452 // set the fallbacks flag
1453 // TODO change with Jitterbug 2401, then add a similar call for toUnicode too
1454 ucnv_setFallback(cnv, cc.fallbacks);
1455
1456 // set the subchar
1457 int32_t length;
1458
1459 if(cc.setSub>0) {
1460 length=(int32_t)strlen(cc.subchar);
1461 ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
1462 if(U_FAILURE(errorCode)) {
1463 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
1464 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1465 ucnv_close(cnv);
1466 return FALSE;
1467 }
1468 } else if(cc.setSub<0) {
1469 ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
1470 if(U_FAILURE(errorCode)) {
1471 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
1472 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1473 ucnv_close(cnv);
1474 return FALSE;
1475 }
1476 }
1477
1478 // convert unicode to utf8
1479 char utf8[256];
1480 cc.utf8=utf8;
1481 u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length,
1482 cc.unicode, cc.unicodeLength,
1483 &errorCode);
1484 if(U_FAILURE(errorCode)) {
1485 // skip UTF-8 testing of a string with an unpaired surrogate,
1486 // or of one that's too long
1487 // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1488 cc.utf8Length=-1;
1489 }
1490
1491 int32_t resultOffsets[256];
1492 char result[256];
1493 int32_t resultLength;
1494 UBool ok;
1495
1496 static const struct {
1497 int32_t step;
1498 const char *name, *utf8Name;
1499 } steps[]={
1500 { 0, "bulk", "utf8" }, // must be first for offsets to be checked
1501 { 1, "step=1", "utf8 step=1" },
1502 { 3, "step=3", "utf8 step=3" },
1503 { 7, "step=7", "utf8 step=7" }
1504 };
1505 int32_t i, step;
1506
1507 ok=TRUE;
1508 for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1509 step=steps[i].step;
1510 memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1511 memset(result, -1, UPRV_LENGTHOF(result));
1512 errorCode=U_ZERO_ERROR;
1513 resultLength=stepFromUnicode(cc, cnv,
1514 result, UPRV_LENGTHOF(result),
1515 step==0 ? resultOffsets : NULL,
1516 step, &errorCode);
1517 ok=checkFromUnicode(
1518 cc, cnv, steps[i].name,
1519 (uint8_t *)result, resultLength,
1520 cc.offsets!=NULL ? resultOffsets : NULL,
1521 errorCode);
1522 if(U_FAILURE(errorCode) || !cc.finalFlush) {
1523 // reset if an error occurred or we did not flush
1524 // otherwise do nothing to make sure that flushing resets
1525 ucnv_resetFromUnicode(cnv);
1526 }
1527 if (resultOffsets[resultLength] != -1) {
1528 errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1529 cc.caseNr, cc.charset, resultLength);
1530 }
1531 if (result[resultLength] != (char)-1) {
1532 errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d",
1533 cc.caseNr, cc.charset, resultLength);
1534 }
1535
1536 // bulk test is first, then offsets are not checked any more
1537 cc.offsets=NULL;
1538
1539 // test direct conversion from UTF-8
1540 if(cc.utf8Length>=0) {
1541 errorCode=U_ZERO_ERROR;
1542 resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
1543 result, UPRV_LENGTHOF(result),
1544 step, &errorCode);
1545 ok=checkFromUnicode(
1546 cc, cnv, steps[i].utf8Name,
1547 (uint8_t *)result, resultLength,
1548 NULL,
1549 errorCode);
1550 if(U_FAILURE(errorCode) || !cc.finalFlush) {
1551 // reset if an error occurred or we did not flush
1552 // otherwise do nothing to make sure that flushing resets
1553 ucnv_resetToUnicode(utf8Cnv);
1554 ucnv_resetFromUnicode(cnv);
1555 }
1556 }
1557 }
1558
1559 // not a real loop, just a convenience for breaking out of the block
1560 while(ok && cc.finalFlush) {
1561 // test ucnv_fromUChars()
1562 memset(result, 0, sizeof(result));
1563
1564 errorCode=U_ZERO_ERROR;
1565 resultLength=ucnv_fromUChars(cnv,
1566 result, UPRV_LENGTHOF(result),
1567 cc.unicode, cc.unicodeLength,
1568 &errorCode);
1569 ok=checkFromUnicode(
1570 cc, cnv, "fromUChars",
1571 (uint8_t *)result, resultLength,
1572 NULL,
1573 errorCode);
1574 if(!ok) {
1575 break;
1576 }
1577
1578 // test preflighting
1579 // keep the correct result for simple checking
1580 errorCode=U_ZERO_ERROR;
1581 resultLength=ucnv_fromUChars(cnv,
1582 NULL, 0,
1583 cc.unicode, cc.unicodeLength,
1584 &errorCode);
1585 if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) {
1586 errorCode=U_ZERO_ERROR;
1587 }
1588 ok=checkFromUnicode(
1589 cc, cnv, "preflight fromUChars",
1590 (uint8_t *)result, resultLength,
1591 NULL,
1592 errorCode);
1593 break;
1594 }
1595
1596 ucnv_close(cnv);
1597 return ok;
1598}
1599
1600UBool
1601ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1602 const uint8_t *result, int32_t resultLength,
1603 const int32_t *resultOffsets,
1604 UErrorCode resultErrorCode) {
1605 UChar resultInvalidUChars[8];
1606 int8_t resultInvalidLength;
1607 UErrorCode errorCode;
1608
1609 const char *msg;
1610
1611 // reset the message; NULL will mean "ok"
1612 msg=NULL;
1613
1614 errorCode=U_ZERO_ERROR;
1615 resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars);
1616 ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
1617 if(U_FAILURE(errorCode)) {
1618 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",
1619 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1620 return FALSE;
1621 }
1622
1623 // check everything that might have gone wrong
1624 if(cc.bytesLength!=resultLength) {
1625 msg="wrong result length";
1626 } else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) {
1627 msg="wrong result string";
1628 } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLength*sizeof(*cc.offsets))) {
1629 msg="wrong offsets";
1630 } else if(cc.outErrorCode!=resultErrorCode) {
1631 msg="wrong error code";
1632 } else if(cc.invalidLength!=resultInvalidLength) {
1633 msg="wrong length of last invalid input";
1634 } else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) {
1635 msg="wrong last invalid input";
1636 }
1637
1638 if(msg==NULL) {
1639 return TRUE;
1640 } else {
1641 char buffer[2000]; // one buffer for all strings
1642 char *s, *unicodeString, *bytesString, *resultString,
1643 *offsetsString, *resultOffsetsString,
1644 *invalidCharsString, *resultInvalidUCharsString;
1645
1646 unicodeString=s=buffer;
1647 s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString);
1648 s=printBytes(cc.bytes, cc.bytesLength, bytesString=s);
1649 s=printBytes(result, resultLength, resultString=s);
1650 s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s);
1651 s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1652 s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s);
1653 s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s);
1654
1655 if((s-buffer)>(int32_t)sizeof(buffer)) {
1656 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n",
1657 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1658 exit(1);
1659 }
1660
1661 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1662 " unicode <%s>[%d]\n"
1663 " expected <%s>[%d]\n"
1664 " result <%s>[%d]\n"
1665 " offsets <%s>\n"
1666 " result offsets <%s>\n"
1667 " error code expected %s got %s\n"
1668 " invalidChars expected <%s> got <%s>\n",
1669 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1670 unicodeString, cc.unicodeLength,
1671 bytesString, cc.bytesLength,
1672 resultString, resultLength,
1673 offsetsString,
1674 resultOffsetsString,
1675 u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1676 invalidCharsString, resultInvalidUCharsString);
1677
1678 return FALSE;
1679 }
1680}
1681
1682#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */