]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/cucdtst.c
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28
29 #include "cintltst.h"
30 #include "putilimp.h"
31 #include "uparse.h"
32 #include "ucase.h"
33 #include "ubidi_props.h"
34 #include "uprops.h"
35 #include "uset_imp.h"
36 #include "usc_impl.h"
37 #include "udatamem.h"
38 #include "cucdapi.h"
39 #include "cmemory.h"
40
41 /* prototypes --------------------------------------------------------------- */
42
43 static void TestUpperLower(void);
44 static void TestLetterNumber(void);
45 static void TestMisc(void);
46 static void TestPOSIX(void);
47 static void TestControlPrint(void);
48 static void TestIdentifier(void);
49 static void TestUnicodeData(void);
50 static void TestCodeUnit(void);
51 static void TestCodePoint(void);
52 static void TestCharLength(void);
53 static void TestCharNames(void);
54 static void TestUCharFromNameUnderflow(void);
55 static void TestMirroring(void);
56 static void TestUScriptRunAPI(void);
57 static void TestAdditionalProperties(void);
58 static void TestNumericProperties(void);
59 static void TestPropertyNames(void);
60 static void TestPropertyValues(void);
61 static void TestConsistency(void);
62 static void TestUBiDiProps(void);
63 static void TestCaseFolding(void);
64
65 /* internal methods used */
66 static int32_t MakeProp(char* str);
67 static int32_t MakeDir(char* str);
68
69 /* helpers ------------------------------------------------------------------ */
70
71 static void
72 parseUCDFile(const char *filename,
73 char *fields[][2], int32_t fieldCount,
74 UParseLineFn *lineFn, void *context,
75 UErrorCode *pErrorCode) {
76 char path[256];
77 char backupPath[256];
78
79 if(U_FAILURE(*pErrorCode)) {
80 return;
81 }
82
83 /* Look inside ICU_DATA first */
84 strcpy(path, u_getDataDirectory());
85 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
86 strcat(path, filename);
87
88 /* As a fallback, try to guess where the source data was located
89 * at the time ICU was built, and look there.
90 */
91 strcpy(backupPath, ctest_dataSrcDir());
92 strcat(backupPath, U_FILE_SEP_STRING);
93 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
94 strcat(backupPath, filename);
95
96 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
97 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
98 *pErrorCode=U_ZERO_ERROR;
99 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
100 }
101 if(U_FAILURE(*pErrorCode)) {
102 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
103 }
104 }
105
106 /* test data ---------------------------------------------------------------- */
107
108 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
109 static const int32_t tagValues[] =
110 {
111 /* Mn */ U_NON_SPACING_MARK,
112 /* Mc */ U_COMBINING_SPACING_MARK,
113 /* Me */ U_ENCLOSING_MARK,
114 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
115 /* Nl */ U_LETTER_NUMBER,
116 /* No */ U_OTHER_NUMBER,
117 /* Zs */ U_SPACE_SEPARATOR,
118 /* Zl */ U_LINE_SEPARATOR,
119 /* Zp */ U_PARAGRAPH_SEPARATOR,
120 /* Cc */ U_CONTROL_CHAR,
121 /* Cf */ U_FORMAT_CHAR,
122 /* Cs */ U_SURROGATE,
123 /* Co */ U_PRIVATE_USE_CHAR,
124 /* Cn */ U_UNASSIGNED,
125 /* Lu */ U_UPPERCASE_LETTER,
126 /* Ll */ U_LOWERCASE_LETTER,
127 /* Lt */ U_TITLECASE_LETTER,
128 /* Lm */ U_MODIFIER_LETTER,
129 /* Lo */ U_OTHER_LETTER,
130 /* Pc */ U_CONNECTOR_PUNCTUATION,
131 /* Pd */ U_DASH_PUNCTUATION,
132 /* Ps */ U_START_PUNCTUATION,
133 /* Pe */ U_END_PUNCTUATION,
134 /* Po */ U_OTHER_PUNCTUATION,
135 /* Sm */ U_MATH_SYMBOL,
136 /* Sc */ U_CURRENCY_SYMBOL,
137 /* Sk */ U_MODIFIER_SYMBOL,
138 /* So */ U_OTHER_SYMBOL,
139 /* Pi */ U_INITIAL_PUNCTUATION,
140 /* Pf */ U_FINAL_PUNCTUATION
141 };
142
/*
 * Bidi class abbreviations as they appear in UnicodeData.txt.
 * NOTE(review): the position of each name presumably equals its
 * UCharDirection value (the lookup helper is not visible here) - do not reorder.
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
169
170 void addUnicodeTest(TestNode** root);
171
172 void addUnicodeTest(TestNode** root)
173 {
174 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
175 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
176 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
177 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
178 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
179 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
180 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
181 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
182 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
183 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
184 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
185 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
186 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
187 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
188 addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
189 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
190 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
191 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
192 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
193 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
194 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
195 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
196 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
197 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
198 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps");
199 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
200 }
201
202 /*==================================================== */
203 /* test u_toupper() and u_tolower() */
204 /*==================================================== */
205 static void TestUpperLower()
206 {
207 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
208 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
209 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
210 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
211 int32_t i;
212
213 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
214 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
215
216 /*
217 Checks LetterLike Symbols which were previously a source of confusion
218 [Bertrand A. D. 02/04/98]
219 */
220 for (i=0x2100;i<0x2138;i++)
221 {
222 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
223 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
224 {
225 if (i != (int)u_tolower(i)) /* itself */
226 log_err("Failed case conversion with itself: U+%04x\n", i);
227 if (i != (int)u_toupper(i))
228 log_err("Failed case conversion with itself: U+%04x\n", i);
229 }
230 }
231
232 for(i=0; i < u_strlen(upper); i++){
233 if(u_tolower(upper[i]) != lower[i]){
234 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
235 }
236 }
237
238 log_verbose("testing upper lower\n");
239 for (i = 0; i < 21; i++) {
240
241 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
242 {
243 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
244 }
245 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
246 {
247 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
248 }
249 else if (upperTest[i] != u_tolower(lowerTest[i]))
250 {
251 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
252 }
253 else if (lowerTest[i] != u_toupper(upperTest[i]))
254 {
255 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
256 }
257 else if (upperTest[i] != u_tolower(upperTest[i]))
258 {
259 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
260 }
261 else if (lowerTest[i] != u_toupper(lowerTest[i]))
262 {
263 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
264 }
265 }
266 log_verbose("done testing upper lower\n");
267
268 log_verbose("testing u_istitle\n");
269 {
270 static const UChar expected[] = {
271 0x1F88,
272 0x1F89,
273 0x1F8A,
274 0x1F8B,
275 0x1F8C,
276 0x1F8D,
277 0x1F8E,
278 0x1F8F,
279 0x1F88,
280 0x1F89,
281 0x1F8A,
282 0x1F8B,
283 0x1F8C,
284 0x1F8D,
285 0x1F8E,
286 0x1F8F,
287 0x1F98,
288 0x1F99,
289 0x1F9A,
290 0x1F9B,
291 0x1F9C,
292 0x1F9D,
293 0x1F9E,
294 0x1F9F,
295 0x1F98,
296 0x1F99,
297 0x1F9A,
298 0x1F9B,
299 0x1F9C,
300 0x1F9D,
301 0x1F9E,
302 0x1F9F,
303 0x1FA8,
304 0x1FA9,
305 0x1FAA,
306 0x1FAB,
307 0x1FAC,
308 0x1FAD,
309 0x1FAE,
310 0x1FAF,
311 0x1FA8,
312 0x1FA9,
313 0x1FAA,
314 0x1FAB,
315 0x1FAC,
316 0x1FAD,
317 0x1FAE,
318 0x1FAF,
319 0x1FBC,
320 0x1FBC,
321 0x1FCC,
322 0x1FCC,
323 0x1FFC,
324 0x1FFC,
325 };
326 int32_t num = UPRV_LENGTHOF(expected);
327 for(i=0; i<num; i++){
328 if(!u_istitle(expected[i])){
329 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
330 }
331 }
332
333 }
334 }
335
336 /* compare two sets and verify that their difference or intersection is empty */
337 static UBool
338 showADiffB(const USet *a, const USet *b,
339 const char *a_name, const char *b_name,
340 UBool expect, UBool diffIsError) {
341 USet *aa;
342 int32_t i, start, end, length;
343 UErrorCode errorCode;
344
345 /*
346 * expect:
347 * TRUE -> a-b should be empty, that is, b should contain all of a
348 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
349 */
350 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
351 return TRUE;
352 }
353
354 /* clone a to aa because a is const */
355 aa=uset_open(1, 0);
356 if(aa==NULL) {
357 /* unusual problem - out of memory? */
358 return FALSE;
359 }
360 uset_addAll(aa, a);
361
362 /* compute the set in question */
363 if(expect) {
364 /* a-b */
365 uset_removeAll(aa, b);
366 } else {
367 /* a&b */
368 uset_retainAll(aa, b);
369 }
370
371 /* aa is not empty because of the initial tests above; show its contents */
372 errorCode=U_ZERO_ERROR;
373 i=0;
374 for(;;) {
375 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
376 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
377 break; /* done */
378 }
379 if(U_FAILURE(errorCode)) {
380 log_err("error comparing %s with %s at difference item %d: %s\n",
381 a_name, b_name, i, u_errorName(errorCode));
382 break;
383 }
384 if(length!=0) {
385 break; /* done with code points, got a string or -1 */
386 }
387
388 if(diffIsError) {
389 if(expect) {
390 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
391 } else {
392 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
393 }
394 } else {
395 if(expect) {
396 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
397 } else {
398 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
399 }
400 }
401
402 ++i;
403 }
404
405 uset_close(aa);
406 return FALSE;
407 }
408
409 static UBool
410 showAMinusB(const USet *a, const USet *b,
411 const char *a_name, const char *b_name,
412 UBool diffIsError) {
413 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
414 }
415
416 static UBool
417 showAIntersectB(const USet *a, const USet *b,
418 const char *a_name, const char *b_name,
419 UBool diffIsError) {
420 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
421 }
422
423 static UBool
424 compareUSets(const USet *a, const USet *b,
425 const char *a_name, const char *b_name,
426 UBool diffIsError) {
427 /*
428 * Use an arithmetic & not a logical && so that both branches
429 * are always taken and all differences are shown.
430 */
431 return
432 showAMinusB(a, b, a_name, b_name, diffIsError) &
433 showAMinusB(b, a, b_name, a_name, diffIsError);
434 }
435
436 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
437 static void TestLetterNumber()
438 {
439 UChar i = 0x0000;
440
441 log_verbose("Testing for isalpha\n");
442 for (i = 0x0041; i < 0x005B; i++) {
443 if (!u_isalpha(i))
444 {
445 log_err("Failed isLetter test at %.4X\n", i);
446 }
447 }
448 for (i = 0x0660; i < 0x066A; i++) {
449 if (u_isalpha(i))
450 {
451 log_err("Failed isLetter test with numbers at %.4X\n", i);
452 }
453 }
454
455 log_verbose("Testing for isdigit\n");
456 for (i = 0x0660; i < 0x066A; i++) {
457 if (!u_isdigit(i))
458 {
459 log_verbose("Failed isNumber test at %.4X\n", i);
460 }
461 }
462
463 log_verbose("Testing for isalnum\n");
464 for (i = 0x0041; i < 0x005B; i++) {
465 if (!u_isalnum(i))
466 {
467 log_err("Failed isAlNum test at %.4X\n", i);
468 }
469 }
470 for (i = 0x0660; i < 0x066A; i++) {
471 if (!u_isalnum(i))
472 {
473 log_err("Failed isAlNum test at %.4X\n", i);
474 }
475 }
476
477 {
478 /*
479 * The following checks work only starting from Unicode 4.0.
480 * Check the version number here.
481 */
482 static UVersionInfo u401={ 4, 0, 1, 0 };
483 UVersionInfo version;
484 u_getUnicodeVersion(version);
485 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
486 return;
487 }
488 }
489
490 {
491 /*
492 * Sanity check:
493 * Verify that exactly the digit characters have decimal digit values.
494 * This assumption is used in the implementation of u_digit()
495 * (which checks nt=de)
496 * compared with the parallel java.lang.Character.digit()
497 * (which checks Nd).
498 *
499 * This was not true in Unicode 3.2 and earlier.
500 * Unicode 4.0 fixed discrepancies.
501 * Unicode 4.0.1 re-introduced problems in this area due to an
502 * unintentionally incomplete last-minute change.
503 */
504 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
505 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
506
507 USet *digits, *decimalValues;
508 UErrorCode errorCode;
509
510 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
511 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
512 errorCode=U_ZERO_ERROR;
513 digits=uset_openPattern(digitsPattern, 6, &errorCode);
514 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
515
516 if(U_SUCCESS(errorCode)) {
517 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
518 }
519
520 uset_close(digits);
521 uset_close(decimalValues);
522 }
523 }
524
525 static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
526 const UChar32 *sampleChars, int32_t sampleCharsLength,
527 UBool expected) {
528 int32_t i;
529 for (i = 0; i < sampleCharsLength; ++i) {
530 UBool result = propFn(sampleChars[i]);
531 if (result != expected) {
532 log_err("error: character property function %s(U+%04x)=%d is wrong\n",
533 propName, sampleChars[i], result);
534 }
535 }
536 }
537
538 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
539 static void TestMisc()
540 {
541 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
542 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
543 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
544 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
545 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
546 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
547 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
548 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
549 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
550 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
551 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
552
553 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
554
555 uint32_t mask;
556
557 int32_t i;
558 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
559 UVersionInfo realVersion;
560
561 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
562
563 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
564 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
565
566 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
567 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
568 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
569 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
570
571 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
572 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
573 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
574 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
575
576 testSampleCharProps(u_isdefined, "u_isdefined",
577 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
578 testSampleCharProps(u_isdefined, "u_isdefined",
579 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
580
581 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
582 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
583
584 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
585 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
586
587 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
588 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
589 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
590 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
591 }
592 }
593
594 /* Tests the ICU version #*/
595 u_getVersion(realVersion);
596 u_versionToString(realVersion, icuVersion);
597 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
598 {
599 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
600 }
601 #if defined(ICU_VERSION)
602 /* test only happens where we have configure.in with VERSION - sanity check. */
603 if(strcmp(U_ICU_VERSION, ICU_VERSION))
604 {
605 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
606 }
607 #endif
608
609 /* test U_GC_... */
610 if(
611 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
612 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
613 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
614 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
615 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
616 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
617 ) {
618 log_err("error: U_GET_GC_MASK does not work properly\n");
619 }
620
621 mask=0;
622 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
623
624 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
625 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
626 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
627 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
628 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
629
630 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
631 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
632 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
633
634 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
635 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
636 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
637
638 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
639 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
640 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
641
642 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
643 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
644 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
645 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
646
647 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
648 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
649 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
650 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
651 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
652
653 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
654 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
655 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
656 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
657
658 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
659 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
660
661 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
662 log_err("error: problems with U_GC_XX_MASK constants\n");
663 }
664
665 mask=0;
666 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
667 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
668 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
669 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
670 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
671 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
672 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
673
674 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
675 log_err("error: problems with U_GC_Y_MASK constants\n");
676 }
677 {
678 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
679 for(i=0; i<10; i++){
680 if(digit[i]!=u_forDigit(i,10)){
681 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
682 }
683 }
684 }
685
686 /* test u_digit() */
687 {
688 static const struct {
689 UChar32 c;
690 int8_t radix, value;
691 } data[]={
692 /* base 16 */
693 { 0x0031, 16, 1 },
694 { 0x0038, 16, 8 },
695 { 0x0043, 16, 12 },
696 { 0x0066, 16, 15 },
697 { 0x00e4, 16, -1 },
698 { 0x0662, 16, 2 },
699 { 0x06f5, 16, 5 },
700 { 0xff13, 16, 3 },
701 { 0xff41, 16, 10 },
702
703 /* base 8 */
704 { 0x0031, 8, 1 },
705 { 0x0038, 8, -1 },
706 { 0x0043, 8, -1 },
707 { 0x0066, 8, -1 },
708 { 0x00e4, 8, -1 },
709 { 0x0662, 8, 2 },
710 { 0x06f5, 8, 5 },
711 { 0xff13, 8, 3 },
712 { 0xff41, 8, -1 },
713
714 /* base 36 */
715 { 0x5a, 36, 35 },
716 { 0x7a, 36, 35 },
717 { 0xff3a, 36, 35 },
718 { 0xff5a, 36, 35 },
719
720 /* wrong radix values */
721 { 0x0031, 1, -1 },
722 { 0xff3a, 37, -1 }
723 };
724
725 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
726 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
727 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
728 data[i].c,
729 data[i].radix,
730 u_digit(data[i].c, data[i].radix),
731 data[i].value);
732 }
733 }
734 }
735 }
736
737 /* test C/POSIX-style functions --------------------------------------------- */
738
739 /* bit flags */
740 #define ISAL 1
741 #define ISLO 2
742 #define ISUP 4
743
744 #define ISDI 8
745 #define ISXD 0x10
746
747 #define ISAN 0x20
748
749 #define ISPU 0x40
750 #define ISGR 0x80
751 #define ISPR 0x100
752
753 #define ISSP 0x200
754 #define ISBL 0x400
755 #define ISCN 0x800
756
757 /* C/POSIX-style functions, in the same order as the bit flags */
758 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
759
760 static const struct {
761 IsPOSIXClass *fn;
762 const char *name;
763 } posixClasses[]={
764 { u_isalpha, "isalpha" },
765 { u_islower, "islower" },
766 { u_isupper, "isupper" },
767 { u_isdigit, "isdigit" },
768 { u_isxdigit, "isxdigit" },
769 { u_isalnum, "isalnum" },
770 { u_ispunct, "ispunct" },
771 { u_isgraph, "isgraph" },
772 { u_isprint, "isprint" },
773 { u_isspace, "isspace" },
774 { u_isblank, "isblank" },
775 { u_iscntrl, "iscntrl" }
776 };
777
778 static const struct {
779 UChar32 c;
780 uint32_t posixResults;
781 } posixData[]={
782 { 0x0008, ISCN }, /* backspace */
783 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
784 { 0x000a, ISSP| ISCN }, /* LF */
785 { 0x000c, ISSP| ISCN }, /* FF */
786 { 0x000d, ISSP| ISCN }, /* CR */
787 { 0x0020, ISPR|ISSP|ISBL }, /* space */
788 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
789 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
790 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
791 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
792 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
793 { 0x007b, ISPU|ISGR|ISPR }, /* { */
794 { 0x0085, ISSP| ISCN }, /* NEL */
795 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
796 { 0x00a4, ISGR|ISPR }, /* currency sign */
797 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
798 { 0x0300, ISGR|ISPR }, /* combining grave */
799 { 0x0600, ISCN }, /* arabic number sign */
800 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
801 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
802 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
803 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
804 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
805 { 0x200b, ISCN }, /* ZWSP */
806 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
807 { 0x200e, ISCN }, /* LRM */
808 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
809 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
810 { 0x20ac, ISGR|ISPR }, /* Euro */
811 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
812 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
813 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
814 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
815 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
816 };
817
818 static void
819 TestPOSIX() {
820 uint32_t mask;
821 int32_t cl, i;
822 UBool expect;
823
824 mask=1;
825 for(cl=0; cl<12; ++cl) {
826 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
827 expect=(UBool)((posixData[i].posixResults&mask)!=0);
828 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
829 log_err("u_%s(U+%04x)=%s is wrong\n",
830 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
831 }
832 }
833 mask<<=1;
834 }
835 }
836
837 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
838 static void TestControlPrint()
839 {
840 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
841 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
842 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
843 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
844 UChar32 c;
845
846 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
847 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
848
849 testSampleCharProps(u_isprint, "u_isprint",
850 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
851 testSampleCharProps(u_isprint, "u_isprint",
852 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
853
854 /* test all ISO 8 controls */
855 for(c=0; c<=0x9f; ++c) {
856 if(c==0x20) {
857 /* skip ASCII graphic characters and continue with DEL */
858 c=0x7f;
859 }
860 if(!u_iscntrl(c)) {
861 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
862 }
863 if(!u_isISOControl(c)) {
864 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
865 }
866 if(u_isprint(c)) {
867 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
868 }
869 }
870
871 /* test all Latin-1 graphic characters */
872 for(c=0x20; c<=0xff; ++c) {
873 if(c==0x7f) {
874 c=0xa0;
875 } else if(c==0xad) {
876 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
877 ++c;
878 }
879 if(!u_isprint(c)) {
880 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
881 }
882 }
883 }
884
885 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
886 static void TestIdentifier()
887 {
888 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
889 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
890 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
891 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
892 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
893 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
894 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
895 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
896 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
897 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
898
899 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
900 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
901 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
902 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
903
904 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
905 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
906 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
907 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
908
909 /* IDPart should imply IDStart */
910 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
911 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
912
913 testSampleCharProps(u_isIDStart, "u_isIDStart",
914 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
915 testSampleCharProps(u_isIDStart, "u_isIDStart",
916 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
917
918 testSampleCharProps(u_isIDPart, "u_isIDPart",
919 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
920 testSampleCharProps(u_isIDPart, "u_isIDPart",
921 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
922
923 /* IDPart should imply IDStart */
924 testSampleCharProps(u_isIDPart, "u_isIDPart",
925 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
926
927 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
928 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
929 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
930 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
931 }
932
933 /* for each line of UnicodeData.txt, check some of the properties */
934 typedef struct UnicodeDataContext {
935 #if UCONFIG_NO_NORMALIZATION
936 const void *dummy;
937 #else
938 const UNormalizer2 *nfc;
939 const UNormalizer2 *nfkc;
940 #endif
941 } UnicodeDataContext;
942
943 /*
944 * ### TODO
945 * This test fails incorrectly if the First or Last code point of a repetitive area
946 * is overridden, which is allowed and is encouraged for the PUAs.
947 * Currently, this means that both area First/Last and override lines are
948 * tested against the properties from the API,
949 * and the area boundary will not match and cause an error.
950 *
951 * This function should detect area boundaries and skip them for the test of individual
952 * code points' properties.
953 * Then it should check that the areas contain all the same properties except where overridden.
954 * For this, it would have had to set a flag for which code points were listed explicitly.
955 */
956 static void U_CALLCONV
957 unicodeDataLineFn(void *context,
958 char *fields[][2], int32_t fieldCount,
959 UErrorCode *pErrorCode)
960 {
961 char buffer[100];
962 const char *d;
963 char *end;
964 uint32_t value;
965 UChar32 c;
966 int32_t i;
967 int8_t type;
968 int32_t dt;
969 UChar dm[32], s[32];
970 int32_t dmLength, length;
971
972 #if !UCONFIG_NO_NORMALIZATION
973 const UNormalizer2 *nfc, *nfkc;
974 #endif
975
976 /* get the character code, field 0 */
977 c=strtoul(fields[0][0], &end, 16);
978 if(end<=fields[0][0] || end!=fields[0][1]) {
979 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
980 return;
981 }
982 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
983 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
984 return;
985 }
986
987 /* get general category, field 2 */
988 *fields[2][1]=0;
989 type = (int8_t)tagValues[MakeProp(fields[2][0])];
990 if(u_charType(c)!=type) {
991 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
992 }
993 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
994 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
995 }
996
997 /* get canonical combining class, field 3 */
998 value=strtoul(fields[3][0], &end, 10);
999 if(end<=fields[3][0] || end!=fields[3][1]) {
1000 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1001 return;
1002 }
1003 if(value>255) {
1004 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1005 return;
1006 }
1007 #if !UCONFIG_NO_NORMALIZATION
1008 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1009 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1010 }
1011 nfkc=((UnicodeDataContext *)context)->nfkc;
1012 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1013 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1014 }
1015 #endif
1016
1017 /* get BiDi category, field 4 */
1018 *fields[4][1]=0;
1019 i=MakeDir(fields[4][0]);
1020 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1021 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1022 }
1023
1024 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1025 d=NULL;
1026 if(fields[5][0]==fields[5][1]) {
1027 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1028 if(c==0xac00 || c==0xd7a3) {
1029 dt=U_DT_CANONICAL;
1030 } else {
1031 dt=U_DT_NONE;
1032 }
1033 } else {
1034 d=fields[5][0];
1035 *fields[5][1]=0;
1036 dt=UCHAR_INVALID_CODE;
1037 if(*d=='<') {
1038 end=strchr(++d, '>');
1039 if(end!=NULL) {
1040 *end=0;
1041 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1042 d=u_skipWhitespace(end+1);
1043 }
1044 } else {
1045 dt=U_DT_CANONICAL;
1046 }
1047 }
1048 if(dt>U_DT_NONE) {
1049 if(c==0xac00) {
1050 dm[0]=0x1100;
1051 dm[1]=0x1161;
1052 dm[2]=0;
1053 dmLength=2;
1054 } else if(c==0xd7a3) {
1055 dm[0]=0xd788;
1056 dm[1]=0x11c2;
1057 dm[2]=0;
1058 dmLength=2;
1059 } else {
1060 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1061 }
1062 } else {
1063 dmLength=-1;
1064 }
1065 if(dt<0 || U_FAILURE(*pErrorCode)) {
1066 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1067 return;
1068 }
1069 #if !UCONFIG_NO_NORMALIZATION
1070 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1071 if(i!=dt) {
1072 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1073 }
1074 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1075 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1076 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1077 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1078 "or the Decomposition_Mapping is different (%s)\n",
1079 c, length, dmLength, u_errorName(*pErrorCode));
1080 return;
1081 }
1082 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1083 if(dt!=U_DT_CANONICAL) {
1084 dmLength=-1;
1085 }
1086 nfc=((UnicodeDataContext *)context)->nfc;
1087 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1088 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1089 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1090 "or the Decomposition_Mapping is different (%s)\n",
1091 c, length, dmLength, u_errorName(*pErrorCode));
1092 return;
1093 }
1094 /* recompose */
1095 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1096 UChar32 a, b, composite;
1097 i=0;
1098 U16_NEXT(dm, i, dmLength, a);
1099 U16_NEXT(dm, i, dmLength, b);
1100 /* i==dmLength */
1101 composite=unorm2_composePair(nfc, a, b);
1102 if(composite!=c) {
1103 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1104 (long)c, (long)a, (long)b, (long)composite);
1105 }
1106 /*
1107 * Note: NFKC has fewer round-trip mappings than NFC,
1108 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1109 */
1110 }
1111 #endif
1112
1113 /* get ISO Comment, field 11 */
1114 *fields[11][1]=0;
1115 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1116 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1117 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1118 c, u_errorName(*pErrorCode),
1119 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1120 fields[11][0]);
1121 }
1122
1123 /* get uppercase mapping, field 12 */
1124 if(fields[12][0]!=fields[12][1]) {
1125 value=strtoul(fields[12][0], &end, 16);
1126 if(end!=fields[12][1]) {
1127 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1128 return;
1129 }
1130 if((UChar32)value!=u_toupper(c)) {
1131 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1132 }
1133 } else {
1134 /* no case mapping: the API must map the code point to itself */
1135 if(c!=u_toupper(c)) {
1136 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1137 }
1138 }
1139
1140 /* get lowercase mapping, field 13 */
1141 if(fields[13][0]!=fields[13][1]) {
1142 value=strtoul(fields[13][0], &end, 16);
1143 if(end!=fields[13][1]) {
1144 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1145 return;
1146 }
1147 if((UChar32)value!=u_tolower(c)) {
1148 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1149 }
1150 } else {
1151 /* no case mapping: the API must map the code point to itself */
1152 if(c!=u_tolower(c)) {
1153 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1154 }
1155 }
1156
1157 /* get titlecase mapping, field 14 */
1158 if(fields[14][0]!=fields[14][1]) {
1159 value=strtoul(fields[14][0], &end, 16);
1160 if(end!=fields[14][1]) {
1161 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1162 return;
1163 }
1164 if((UChar32)value!=u_totitle(c)) {
1165 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1166 }
1167 } else {
1168 /* no case mapping: the API must map the code point to itself */
1169 if(c!=u_totitle(c)) {
1170 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1171 }
1172 }
1173 }
1174
1175 static UBool U_CALLCONV
1176 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1177 static const UChar32 test[][2]={
1178 {0x41, U_UPPERCASE_LETTER},
1179 {0x308, U_NON_SPACING_MARK},
1180 {0xfffe, U_GENERAL_OTHER_TYPES},
1181 {0xe0041, U_FORMAT_CHAR},
1182 {0xeffff, U_UNASSIGNED}
1183 };
1184
1185 int32_t i, count;
1186
1187 if(0!=strcmp((const char *)context, "a1")) {
1188 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1189 return FALSE;
1190 }
1191
1192 count=UPRV_LENGTHOF(test);
1193 for(i=0; i<count; ++i) {
1194 if(start<=test[i][0] && test[i][0]<limit) {
1195 if(type!=(UCharCategory)test[i][1]) {
1196 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1197 start, limit, (long)type, test[i][0], test[i][1]);
1198 }
1199 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1200 return i==(count-1) ? FALSE : TRUE;
1201 }
1202 }
1203
1204 if(start>test[count-1][0]) {
1205 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1206 start, limit, (long)type);
1207 return FALSE;
1208 }
1209
1210 return TRUE;
1211 }
1212
1213 static UBool U_CALLCONV
1214 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1215 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1216 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1217 { 0x0590, U_LEFT_TO_RIGHT },
1218 { 0x0600, U_RIGHT_TO_LEFT },
1219 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1220 { 0x0860, U_RIGHT_TO_LEFT },
1221 { 0x0870, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 10 changes U+0860..U+086F from R to AL.
1222 { 0x08A0, U_RIGHT_TO_LEFT },
1223 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1224 { 0x20A0, U_LEFT_TO_RIGHT },
1225 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1226 { 0xFB1D, U_LEFT_TO_RIGHT },
1227 { 0xFB50, U_RIGHT_TO_LEFT },
1228 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1229 { 0xFE70, U_LEFT_TO_RIGHT },
1230 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1231 { 0x10800, U_LEFT_TO_RIGHT },
1232 { 0x11000, U_RIGHT_TO_LEFT },
1233 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1234 { 0x1EE00, U_RIGHT_TO_LEFT },
1235 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1236 { 0x1F000, U_RIGHT_TO_LEFT },
1237 { 0x110000, U_LEFT_TO_RIGHT }
1238 };
1239
1240 UChar32 c;
1241 int32_t i;
1242 UCharDirection shouldBeDir;
1243
1244 /*
1245 * LineBreak.txt specifies:
1246 * # - Assigned characters that are not listed explicitly are given the value
1247 * # "AL".
1248 * # - Unassigned characters are given the value "XX".
1249 *
1250 * PUA characters are listed explicitly with "XX".
1251 * Verify that no assigned character has "XX".
1252 */
1253 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1254 c=start;
1255 while(c<limit) {
1256 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1257 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1258 }
1259 ++c;
1260 }
1261 }
1262
1263 /*
1264 * Verify default Bidi classes.
1265 * See DerivedBidiClass.txt, especially for unassigned code points.
1266 */
1267 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1268 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1269 c=start;
1270 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1271 if((int32_t)c<defaultBidi[i][0]) {
1272 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1273 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1274 shouldBeDir=U_BOUNDARY_NEUTRAL;
1275 } else {
1276 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1277 }
1278
1279 if( u_charDirection(c)!=shouldBeDir ||
1280 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1281 ) {
1282 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1283 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1284 }
1285 ++c;
1286 }
1287 }
1288 }
1289 }
1290
1291 return TRUE;
1292 }
1293
1294 /* tests for several properties */
1295 static void TestUnicodeData()
1296 {
1297 UVersionInfo expectVersionArray;
1298 UVersionInfo versionArray;
1299 char *fields[15][2];
1300 UErrorCode errorCode;
1301 UChar32 c;
1302 int8_t type;
1303
1304 UnicodeDataContext context;
1305
1306 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1307 u_getUnicodeVersion(versionArray);
1308 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1309 {
1310 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1311 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1312 }
1313
1314 #if defined(ICU_UNICODE_VERSION)
1315 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1316 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1317 {
1318 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1319 }
1320 #endif
1321
1322 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1323 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1324 }
1325
1326 errorCode=U_ZERO_ERROR;
1327 #if !UCONFIG_NO_NORMALIZATION
1328 context.nfc=unorm2_getNFCInstance(&errorCode);
1329 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1330 if(U_FAILURE(errorCode)) {
1331 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1332 return;
1333 }
1334 #endif
1335 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1336 if(U_FAILURE(errorCode)) {
1337 return; /* if we couldn't parse UnicodeData.txt, we should return */
1338 }
1339
1340 /* sanity check on repeated properties */
1341 for(c=0xfffe; c<=0x10ffff;) {
1342 type=u_charType(c);
1343 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1344 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1345 }
1346 if(type!=U_UNASSIGNED) {
1347 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1348 }
1349 if((c&0xffff)==0xfffe) {
1350 ++c;
1351 } else {
1352 c+=0xffff;
1353 }
1354 }
1355
1356 /* test that PUA is not "unassigned" */
1357 for(c=0xe000; c<=0x10fffd;) {
1358 type=u_charType(c);
1359 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1360 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1361 }
1362 if(type==U_UNASSIGNED) {
1363 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1364 } else if(type!=U_PRIVATE_USE_CHAR) {
1365 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1366 }
1367 if(c==0xf8ff) {
1368 c=0xf0000;
1369 } else if(c==0xffffd) {
1370 c=0x100000;
1371 } else {
1372 ++c;
1373 }
1374 }
1375
1376 /* test u_enumCharTypes() */
1377 u_enumCharTypes(enumTypeRange, "a1");
1378
1379 /* check default properties */
1380 u_enumCharTypes(enumDefaultsRange, NULL);
1381 }
1382
1383 static void TestCodeUnit(){
1384 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1385
1386 int32_t i;
1387
1388 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1389 UChar c=codeunit[i];
1390 if(i<4){
1391 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1392 log_err("ERROR: U+%04x is a single", c);
1393 }
1394
1395 }
1396 if(i >= 4 && i< 8){
1397 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1398 log_err("ERROR: U+%04x is a first surrogate", c);
1399 }
1400 }
1401 if(i >= 8 && i< 12){
1402 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1403 log_err("ERROR: U+%04x is a second surrogate", c);
1404 }
1405 }
1406 }
1407
1408 }
1409
1410 static void TestCodePoint(){
1411 const UChar32 codePoint[]={
1412 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1413 0xd800,
1414 0xdbff,
1415 0xdc00,
1416 0xdfff,
1417 0xdc04,
1418 0xd821,
1419 /*not a surrogate, valid, isUnicodeChar , not Error*/
1420 0x20ac,
1421 0xd7ff,
1422 0xe000,
1423 0xe123,
1424 0x0061,
1425 0xe065,
1426 0x20402,
1427 0x24506,
1428 0x23456,
1429 0x20402,
1430 0x10402,
1431 0x23456,
1432 /*not a surrogate, not valid, isUnicodeChar, isError */
1433 0x0015,
1434 0x009f,
1435 /*not a surrogate, not valid, not isUnicodeChar, isError */
1436 0xffff,
1437 0xfffe,
1438 };
1439 int32_t i;
1440 for(i=0; i<UPRV_LENGTHOF(codePoint); i++){
1441 UChar32 c=codePoint[i];
1442 if(i<6){
1443 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1444 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1445 }
1446 if(UTF_IS_VALID(c)){
1447 log_err("ERROR: isValid() failed for U+%04x\n", c);
1448 }
1449 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1450 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1451 }
1452 if(UTF_IS_ERROR(c)){
1453 log_err("ERROR: isError() failed for U+%04x\n", c);
1454 }
1455 }else if(i >=6 && i<18){
1456 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1457 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1458 }
1459 if(!UTF_IS_VALID(c)){
1460 log_err("ERROR: isValid() failed for U+%04x\n", c);
1461 }
1462 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1463 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1464 }
1465 if(UTF_IS_ERROR(c)){
1466 log_err("ERROR: isError() failed for U+%04x\n", c);
1467 }
1468 }else if(i >=18 && i<20){
1469 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1470 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1471 }
1472 if(UTF_IS_VALID(c)){
1473 log_err("ERROR: isValid() failed for U+%04x\n", c);
1474 }
1475 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1476 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1477 }
1478 if(!UTF_IS_ERROR(c)){
1479 log_err("ERROR: isError() failed for U+%04x\n", c);
1480 }
1481 }
1482 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1483 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1484 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1485 }
1486 if(UTF_IS_VALID(c)){
1487 log_err("ERROR: isValid() failed for U+%04x\n", c);
1488 }
1489 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1490 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1491 }
1492 if(!UTF_IS_ERROR(c)){
1493 log_err("ERROR: isError() failed for U+%04x\n", c);
1494 }
1495 }
1496 }
1497
1498 if(
1499 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1500 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1501 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1502 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1503 ) {
1504 log_err("error with U_IS_BMP()\n");
1505 }
1506
1507 if(
1508 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1509 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1510 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1511 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1512 ) {
1513 log_err("error with U_IS_SUPPLEMENTARY()\n");
1514 }
1515 }
1516
1517 static void TestCharLength()
1518 {
1519 const int32_t codepoint[]={
1520 1, 0x0061,
1521 1, 0xe065,
1522 1, 0x20ac,
1523 2, 0x20402,
1524 2, 0x23456,
1525 2, 0x24506,
1526 2, 0x20402,
1527 2, 0x10402,
1528 1, 0xd7ff,
1529 1, 0xe000
1530 };
1531
1532 int32_t i;
1533 UBool multiple;
1534 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1535 UChar32 c=codepoint[i+1];
1536 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1537 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1538 }
1539 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1540 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1541 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1542 }
1543 }
1544 }
1545
1546 /*internal functions ----*/
1547 static int32_t MakeProp(char* str)
1548 {
1549 int32_t result = 0;
1550 char* matchPosition =0;
1551
1552 matchPosition = strstr(tagStrings, str);
1553 if (matchPosition == 0)
1554 {
1555 log_err("unrecognized type letter ");
1556 log_err(str);
1557 }
1558 else
1559 result = (int32_t)((matchPosition - tagStrings) / 2);
1560 return result;
1561 }
1562
1563 static int32_t MakeDir(char* str)
1564 {
1565 int32_t pos = 0;
1566 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1567 if (strcmp(str, dirStrings[pos]) == 0) {
1568 return pos;
1569 }
1570 }
1571 return -1;
1572 }
1573
1574 /* test u_charName() -------------------------------------------------------- */
1575
/*
 * Sample code points with their expected names for each name choice:
 * modern name, Unicode 1.0 name, extended name, and name alias.
 * A NULL alias means that no alias is expected (users treat it like "").
 */
static const struct {
    uint32_t code;
    const char *name, *oldName, *extName, *alias;
} names[]={
    {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A", NULL},
    {0x01a2, "LATIN CAPITAL LETTER OI", "",
             "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER GHA"},
    {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
             "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", NULL},
    {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
             "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
             "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401", NULL},
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED", NULL},
    {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA", NULL},
    {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH", NULL},
    {0xd800, "", "", "<lead surrogate-D800>", NULL},
    {0xdc00, "", "", "<trail surrogate-DC00>", NULL},
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS", NULL},
    {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN", NULL},
    {0xffff, "", "", "<noncharacter-FFFF>", NULL},
    {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
              "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
              "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456", NULL}
};
1603
1604 static UBool
1605 enumCharNamesFn(void *context,
1606 UChar32 code, UCharNameChoice nameChoice,
1607 const char *name, int32_t length) {
1608 int32_t *pCount=(int32_t *)context;
1609 const char *expected;
1610 int i;
1611
1612 if(length<=0 || length!=(int32_t)strlen(name)) {
1613 /* should not be called with an empty string or invalid length */
1614 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1615 return TRUE;
1616 }
1617
1618 ++*pCount;
1619 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1620 if(code==(UChar32)names[i].code) {
1621 switch (nameChoice) {
1622 case U_EXTENDED_CHAR_NAME:
1623 if(0!=strcmp(name, names[i].extName)) {
1624 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1625 }
1626 break;
1627 case U_UNICODE_CHAR_NAME:
1628 if(0!=strcmp(name, names[i].name)) {
1629 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1630 }
1631 break;
1632 case U_UNICODE_10_CHAR_NAME:
1633 expected=names[i].oldName;
1634 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1635 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1636 }
1637 break;
1638 case U_CHAR_NAME_ALIAS:
1639 expected=names[i].alias;
1640 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1641 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1642 }
1643 break;
1644 case U_CHAR_NAME_CHOICE_COUNT:
1645 break;
1646 }
1647 break;
1648 }
1649 }
1650 return TRUE;
1651 }
1652
1653 struct enumExtCharNamesContext {
1654 uint32_t length;
1655 int32_t last;
1656 };
1657
1658 static UBool
1659 enumExtCharNamesFn(void *context,
1660 UChar32 code, UCharNameChoice nameChoice,
1661 const char *name, int32_t length) {
1662 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1663
1664 if (ecncp->last != (int32_t) code - 1) {
1665 if (ecncp->last < 0) {
1666 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1667 } else {
1668 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1669 }
1670 }
1671 ecncp->last = (int32_t) code;
1672
1673 if (!*name) {
1674 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1675 }
1676
1677 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1678 }
1679
1680 /**
1681 * This can be made more efficient by moving it into putil.c and having
1682 * it directly access the ebcdic translation tables.
1683 * TODO: If we get this method in putil.c, then delete it from here.
1684 */
1685 static UChar
1686 u_charToUChar(char c) {
1687 UChar uc;
1688 u_charsToUChars(&c, &uc, 1);
1689 return uc;
1690 }
1691
1692 static void
1693 TestCharNames() {
1694 static char name[80];
1695 UErrorCode errorCode=U_ZERO_ERROR;
1696 struct enumExtCharNamesContext extContext;
1697 const char *expected;
1698 int32_t length;
1699 UChar32 c;
1700 int32_t i;
1701
1702 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1703 length=uprv_getMaxCharNameLength();
1704 if(length==0) {
1705 /* no names data available */
1706 return;
1707 }
1708 if(length<83) { /* Unicode 3.2 max char name length */
1709 log_err("uprv_getMaxCharNameLength()=%d is too short");
1710 }
1711 /* ### TODO same tests for max ISO comment length as for max name length */
1712
1713 log_verbose("Testing u_charName()\n");
1714 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1715 /* modern Unicode character name */
1716 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1717 if(U_FAILURE(errorCode)) {
1718 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1719 return;
1720 }
1721 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1722 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1723 }
1724
1725 /* find the modern name */
1726 if (*names[i].name) {
1727 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1728 if(U_FAILURE(errorCode)) {
1729 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1730 return;
1731 }
1732 if(c!=(UChar32)names[i].code) {
1733 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1734 }
1735 }
1736
1737 /* Unicode 1.0 character name */
1738 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1739 if(U_FAILURE(errorCode)) {
1740 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1741 return;
1742 }
1743 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1744 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1745 }
1746
1747 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1748 if(names[i].oldName[0]!=0 /* && length>0 */) {
1749 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1750 if(U_FAILURE(errorCode)) {
1751 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1752 return;
1753 }
1754 if(c!=(UChar32)names[i].code) {
1755 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1756 }
1757 }
1758
1759 /* Unicode character name alias */
1760 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1761 if(U_FAILURE(errorCode)) {
1762 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1763 return;
1764 }
1765 expected=names[i].alias;
1766 if(expected==NULL) {
1767 expected="";
1768 }
1769 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1770 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1771 names[i].code, name, length, expected);
1772 }
1773
1774 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1775 if(expected[0]!=0 /* && length>0 */) {
1776 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1777 if(U_FAILURE(errorCode)) {
1778 log_err("u_charFromName(%s - alias) error %s\n",
1779 expected, u_errorName(errorCode));
1780 return;
1781 }
1782 if(c!=(UChar32)names[i].code) {
1783 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1784 expected, c, names[i].code);
1785 }
1786 }
1787 }
1788
1789 /* test u_enumCharNames() */
1790 length=0;
1791 errorCode=U_ZERO_ERROR;
1792 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1793 if(U_FAILURE(errorCode) || length<94140) {
1794 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1795 }
1796
1797 extContext.length = 0;
1798 extContext.last = -1;
1799 errorCode=U_ZERO_ERROR;
1800 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1801 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1802 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1803 }
1804
1805 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1806 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1807 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1808 }
1809
1810 /* Test getCharNameCharacters */
1811 if(!getTestOption(QUICK_OPTION)) {
1812 enum { BUFSIZE = 256 };
1813 UErrorCode ec = U_ZERO_ERROR;
1814 char buf[BUFSIZE];
1815 int32_t maxLength;
1816 UChar32 cp;
1817 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1818 int32_t l1, l2;
1819 UBool map[256];
1820 UBool ok;
1821
1822 USet* set = uset_open(1, 0); /* empty set */
1823 USet* dumb = uset_open(1, 0); /* empty set */
1824
1825 /*
1826 * uprv_getCharNameCharacters() will likely return more lowercase
1827 * letters than actual character names contain because
1828 * it includes all the characters in lowercased names of
1829 * general categories, for the full possible set of extended names.
1830 */
1831 {
1832 USetAdder sa={
1833 NULL,
1834 uset_add,
1835 uset_addRange,
1836 uset_addString,
1837 NULL /* don't need remove() */
1838 };
1839 sa.set=set;
1840 uprv_getCharNameCharacters(&sa);
1841 }
1842
1843 /* build set the dumb (but sure-fire) way */
1844 for (i=0; i<256; ++i) {
1845 map[i] = FALSE;
1846 }
1847
1848 maxLength=0;
1849 for (cp=0; cp<0x110000; ++cp) {
1850 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1851 buf, BUFSIZE, &ec);
1852 if (U_FAILURE(ec)) {
1853 log_err("FAIL: u_charName failed when it shouldn't\n");
1854 uset_close(set);
1855 uset_close(dumb);
1856 return;
1857 }
1858 if(len>maxLength) {
1859 maxLength=len;
1860 }
1861
1862 for (i=0; i<len; ++i) {
1863 if (!map[(uint8_t) buf[i]]) {
1864 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1865 map[(uint8_t) buf[i]] = TRUE;
1866 }
1867 }
1868
1869 /* test for leading/trailing whitespace */
1870 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1871 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1872 }
1873 }
1874
1875 if(map[(uint8_t)'\t']) {
1876 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1877 }
1878
1879 length=uprv_getMaxCharNameLength();
1880 if(length!=maxLength) {
1881 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1882 length, maxLength);
1883 }
1884
1885 /* compare the sets. Where is my uset_equals?!! */
1886 ok=TRUE;
1887 for(i=0; i<256; ++i) {
1888 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1889 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1890 /* ignore lowercase a-z that are in set but not in dumb */
1891 ok=TRUE;
1892 } else {
1893 ok=FALSE;
1894 break;
1895 }
1896 }
1897 }
1898
1899 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1900 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1901 if (U_FAILURE(ec)) {
1902 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1903 uset_close(set);
1904 uset_close(dumb);
1905 return;
1906 }
1907
1908 if (l1 >= BUFSIZE) {
1909 l1 = BUFSIZE-1;
1910 pat[l1] = 0;
1911 }
1912 if (l2 >= BUFSIZE) {
1913 l2 = BUFSIZE-1;
1914 dumbPat[l2] = 0;
1915 }
1916
1917 if (!ok) {
1918 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1919 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1920 } else if(getTestOption(VERBOSITY_OPTION)) {
1921 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1922 }
1923
1924 uset_close(set);
1925 uset_close(dumb);
1926 }
1927
1928 /* ### TODO: test error cases and other interesting things */
1929 }
1930
1931 static void
1932 TestUCharFromNameUnderflow() {
1933 // Ticket #10889: Underflow crash when there is no dash.
1934 UErrorCode errorCode=U_ZERO_ERROR;
1935 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<NO BREAK SPACE>", &errorCode);
1936 if(U_SUCCESS(errorCode)) {
1937 log_err("u_charFromName(<NO BREAK SPACE>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1938 }
1939
1940 // Test related edge cases.
1941 errorCode=U_ZERO_ERROR;
1942 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<-00a0>", &errorCode);
1943 if(U_SUCCESS(errorCode)) {
1944 log_err("u_charFromName(<-00a0>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1945 }
1946
1947 errorCode=U_ZERO_ERROR;
1948 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control->", &errorCode);
1949 if(U_SUCCESS(errorCode)) {
1950 log_err("u_charFromName(<control->) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1951 }
1952
1953 errorCode=U_ZERO_ERROR;
1954 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control-111111>", &errorCode);
1955 if(U_SUCCESS(errorCode)) {
1956 log_err("u_charFromName(<control-111111>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1957 }
1958 }
1959
1960 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
1961
1962 static void
1963 TestMirroring() {
1964 USet *set;
1965 UErrorCode errorCode;
1966
1967 UChar32 start, end, c2, c3;
1968 int32_t i;
1969
1970 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1971
1972 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1973
1974 log_verbose("Testing u_isMirrored()\n");
1975 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1976 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1977 )
1978 ) {
1979 log_err("u_isMirrored() does not work correctly\n");
1980 }
1981
1982 log_verbose("Testing u_charMirror()\n");
1983 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
1984 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
1985 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
1986 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
1987 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
1988 )
1989 ) {
1990 log_err("u_charMirror() does not work correctly\n");
1991 }
1992
1993 /* verify that Bidi_Mirroring_Glyph roundtrips */
1994 errorCode=U_ZERO_ERROR;
1995 set=uset_openPattern(mirroredPattern, 17, &errorCode);
1996
1997 if (U_FAILURE(errorCode)) {
1998 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
1999 } else {
2000 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2001 do {
2002 c2=u_charMirror(start);
2003 c3=u_charMirror(c2);
2004 if(c3!=start) {
2005 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2006 }
2007 c3=u_getBidiPairedBracket(start);
2008 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2009 if(c3!=start) {
2010 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2011 (long)start);
2012 }
2013 } else {
2014 if(c3!=c2) {
2015 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2016 (long)start, (long)c2);
2017 }
2018 }
2019 } while(++start<=end);
2020 }
2021 }
2022
2023 uset_close(set);
2024 }
2025
2026
2027 struct RunTestData
2028 {
2029 const char *runText;
2030 UScriptCode runCode;
2031 };
2032
2033 typedef struct RunTestData RunTestData;
2034
2035 static void
2036 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2037 const char *prefix)
2038 {
2039 int32_t run, runStart, runLimit;
2040 UScriptCode runCode;
2041
2042 /* iterate over all the runs */
2043 run = 0;
2044 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2045 if (runStart != runStarts[run]) {
2046 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2047 prefix, run, runStarts[run], runStart);
2048 }
2049
2050 if (runLimit != runStarts[run + 1]) {
2051 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2052 prefix, run, runStarts[run + 1], runLimit);
2053 }
2054
2055 if (runCode != testData[run].runCode) {
2056 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2057 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2058 }
2059
2060 run += 1;
2061
2062 /* stop when we've seen all the runs we expect to see */
2063 if (run >= nRuns) {
2064 break;
2065 }
2066 }
2067
2068 /* Complain if we didn't see then number of runs we expected */
2069 if (run != nRuns) {
2070 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2071 }
2072 }
2073
2074 static void
2075 TestUScriptRunAPI()
2076 {
2077 static const RunTestData testData1[] = {
2078 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2079 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2080 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2081 {"English (", USCRIPT_LATIN},
2082 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2083 {") ", USCRIPT_LATIN},
2084 {"\\u6F22\\u5B75", USCRIPT_HAN},
2085 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2086 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2087 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2088 };
2089
2090 static const RunTestData testData2[] = {
2091 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2092 };
2093
2094 static const struct {
2095 const RunTestData *testData;
2096 int32_t nRuns;
2097 } testDataEntries[] = {
2098 {testData1, UPRV_LENGTHOF(testData1)},
2099 {testData2, UPRV_LENGTHOF(testData2)}
2100 };
2101
2102 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2103 int32_t testEntry;
2104
2105 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2106 UChar testString[1024];
2107 int32_t runStarts[256];
2108 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2109 const RunTestData *testData = testDataEntries[testEntry].testData;
2110
2111 int32_t run, stringLimit;
2112 UScriptRun *scriptRun = NULL;
2113 UErrorCode err;
2114
2115 /*
2116 * Fill in the test string and the runStarts array.
2117 */
2118 stringLimit = 0;
2119 for (run = 0; run < nTestRuns; run += 1) {
2120 runStarts[run] = stringLimit;
2121 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2122 /*stringLimit -= 1;*/
2123 }
2124
2125 /* The limit of the last run */
2126 runStarts[nTestRuns] = stringLimit;
2127
2128 /*
2129 * Make sure that calling uscript_OpenRun with a NULL text pointer
2130 * and a non-zero text length returns the correct error.
2131 */
2132 err = U_ZERO_ERROR;
2133 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2134
2135 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2136 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2137 }
2138
2139 if (scriptRun != NULL) {
2140 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2141 uscript_closeRun(scriptRun);
2142 }
2143
2144 /*
2145 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2146 * and a zero text length returns the correct error.
2147 */
2148 err = U_ZERO_ERROR;
2149 scriptRun = uscript_openRun(testString, 0, &err);
2150
2151 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2152 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2153 }
2154
2155 if (scriptRun != NULL) {
2156 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2157 uscript_closeRun(scriptRun);
2158 }
2159
2160 /*
2161 * Make sure that calling uscript_openRun with a NULL text pointer
2162 * and a zero text length doesn't return an error.
2163 */
2164 err = U_ZERO_ERROR;
2165 scriptRun = uscript_openRun(NULL, 0, &err);
2166
2167 if (U_FAILURE(err)) {
2168 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2169 }
2170
2171 /* Make sure that the empty iterator doesn't find any runs */
2172 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2173 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2174 }
2175
2176 /*
2177 * Make sure that calling uscript_setRunText with a NULL text pointer
2178 * and a non-zero text length returns the correct error.
2179 */
2180 err = U_ZERO_ERROR;
2181 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2182
2183 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2184 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2185 }
2186
2187 /*
2188 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2189 * and a zero text length returns the correct error.
2190 */
2191 err = U_ZERO_ERROR;
2192 uscript_setRunText(scriptRun, testString, 0, &err);
2193
2194 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2195 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2196 }
2197
2198 /*
2199 * Now call uscript_setRunText on the empty iterator
2200 * and make sure that it works.
2201 */
2202 err = U_ZERO_ERROR;
2203 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2204
2205 if (U_FAILURE(err)) {
2206 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2207 } else {
2208 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2209 }
2210
2211 uscript_closeRun(scriptRun);
2212
2213 /*
2214 * Now open an interator over the testString
2215 * using uscript_openRun and make sure that it works
2216 */
2217 scriptRun = uscript_openRun(testString, stringLimit, &err);
2218
2219 if (U_FAILURE(err)) {
2220 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2221 } else {
2222 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2223 }
2224
2225 /* Now reset the iterator, and make sure
2226 * that it still works.
2227 */
2228 uscript_resetRun(scriptRun);
2229
2230 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2231
2232 /* Close the iterator */
2233 uscript_closeRun(scriptRun);
2234 }
2235 }
2236
2237 /* test additional, non-core properties */
2238 static void
2239 TestAdditionalProperties() {
2240 /* test data for u_charAge() */
2241 static const struct {
2242 UChar32 c;
2243 UVersionInfo version;
2244 } charAges[]={
2245 {0x41, { 1, 1, 0, 0 }},
2246 {0xffff, { 1, 1, 0, 0 }},
2247 {0x20ab, { 2, 0, 0, 0 }},
2248 {0x2fffe, { 2, 0, 0, 0 }},
2249 {0x20ac, { 2, 1, 0, 0 }},
2250 {0xfb1d, { 3, 0, 0, 0 }},
2251 {0x3f4, { 3, 1, 0, 0 }},
2252 {0x10300, { 3, 1, 0, 0 }},
2253 {0x220, { 3, 2, 0, 0 }},
2254 {0xff60, { 3, 2, 0, 0 }}
2255 };
2256
2257 /* test data for u_hasBinaryProperty() */
2258 static const int32_t
2259 props[][3]={ /* code point, property, value */
2260 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2261 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2262 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2263
2264 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2265 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2266
2267 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2268 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2269
2270 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2271 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2272
2273 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2274 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2275 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2276 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2277 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2278
2279 { 0x058a, UCHAR_DASH, TRUE },
2280 { 0x007e, UCHAR_DASH, FALSE },
2281
2282 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2283 { 0x3000, UCHAR_DIACRITIC, FALSE },
2284
2285 { 0x0e46, UCHAR_EXTENDER, TRUE },
2286 { 0x0020, UCHAR_EXTENDER, FALSE },
2287
2288 #if !UCONFIG_NO_NORMALIZATION
2289 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2290 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2291 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2292
2293 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2294 { 0x0308, UCHAR_NFD_INERT, FALSE },
2295
2296 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2297 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2298
2299 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2300 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2301 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2302 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2303 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2304 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2305
2306 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2307 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2308
2309 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2310 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2311 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2312 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2313 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2314 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2315 #endif
2316
2317 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2318 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2319 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2320
2321 { 0x30fb, UCHAR_HYPHEN, TRUE },
2322 { 0xfe58, UCHAR_HYPHEN, FALSE },
2323
2324 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2325 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2326 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2327
2328 { 0x2172, UCHAR_ID_START, TRUE },
2329 { 0x007a, UCHAR_ID_START, TRUE },
2330 { 0x0039, UCHAR_ID_START, FALSE },
2331
2332 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2333 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2334 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2335
2336 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2337 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2338
2339 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2340 { 0x0345, UCHAR_LOWERCASE, TRUE },
2341 { 0x0030, UCHAR_LOWERCASE, FALSE },
2342
2343 { 0x1d7a9, UCHAR_MATH, TRUE },
2344 { 0x2135, UCHAR_MATH, TRUE },
2345 { 0x0062, UCHAR_MATH, FALSE },
2346
2347 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2348 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2349 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2350
2351 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2352 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2353 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2354
2355 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2356 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2357
2358 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2359 { 0x2162, UCHAR_UPPERCASE, TRUE },
2360 { 0x0345, UCHAR_UPPERCASE, FALSE },
2361
2362 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2363 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2364 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2365
2366 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2367 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2368 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2369
2370 { 0x16ee, UCHAR_XID_START, TRUE },
2371 { 0x23456, UCHAR_XID_START, TRUE },
2372 { 0x1d1aa, UCHAR_XID_START, FALSE },
2373
2374 /*
2375 * Version break:
2376 * The following properties are only supported starting with the
2377 * Unicode version indicated in the second field.
2378 */
2379 { -1, 0x320, 0 },
2380
2381 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2382 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2383 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2384
2385 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2386 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2387 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2388 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2389
2390 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2391 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2392 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2393 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2394
2395 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2396 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2397 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2398 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2399
2400 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2401 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2402
2403 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2404 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2405
2406 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2407 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2408
2409 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2410 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2411
2412 { 0x2e9b, UCHAR_RADICAL, TRUE },
2413 { 0x4e00, UCHAR_RADICAL, FALSE },
2414
2415 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2416 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2417
2418 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2419 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2420
2421 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2422
2423 { 0x002e, UCHAR_S_TERM, TRUE },
2424 { 0x0061, UCHAR_S_TERM, FALSE },
2425
2426 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2427 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2428 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2429 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2430
2431 /* enum/integer type properties */
2432
2433 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2434 /* test default Bidi classes for unassigned code points */
2435 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2436 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2437 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2438 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2439 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2440 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2441 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2442 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2443 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2444 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2445 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2446
2447 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2448 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2449 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2450 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2451 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2452 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2453 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2454
2455 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2456 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2457 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2458 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2459 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2460 { 0x1CBF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2461 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2462 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2463 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2464 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2465 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2466
2467 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2468 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2469
2470 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2471 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2472 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2473 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2474 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2475 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2476 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2477 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2478 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2479
2480 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2481 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2482 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2483 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2484 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2485 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2486 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2487 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2488 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2489 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2490 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2491 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2492 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2493 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2494 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2495 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2496 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2497
2498 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2499 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2500 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2501
2502 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2503 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2504 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2505 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2506 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2507
2508 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2509 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2510 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2511 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2512 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2513 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2514 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2515 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2516
2517 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2518 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2519 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2520 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2521 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2522 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2523 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2524 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2525 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2526 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2527 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2528 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2529 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2530 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2531 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2532 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2533
2534 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2535
2536 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2537
2538 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2539 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2540 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2541 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2542 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2543 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2544 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2545
2546 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2547 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2548 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2549 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2550
2551 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2552 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2553 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2554 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2555 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2556 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2557
2558 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2559 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2560 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2561 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2562
2563 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2564 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2565 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2566 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2567 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2568 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2569 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2570
2571 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2572 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2573 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2574 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2575
2576 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2577 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2578 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2579 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2580
2581 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2582 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2583 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2584 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2585 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2586
2587 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2588
2589 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2590
2591 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2592 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2593 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2594
2595 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2596 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2597 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2598 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2599 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2600
2601 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2602 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2603 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2604
2605 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2606 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2607 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2608 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2609
2610 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2611 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2612 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2613 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2614 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2615 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2616
2617 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2618 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2619 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2620 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2621
2622 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2623 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2624 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2625 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2626
2627 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2628 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2629 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2630 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2631
2632 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2633
2634 /* unassigned code points in new default Bidi R blocks */
2635 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2636 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2637
2638 /* test some script codes >127 */
2639 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2640 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2641 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2642
2643 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2644
2645 /* value changed in Unicode 6.0 */
2646 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2647
2648 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2649
2650 /* unassigned code points in new/changed default Bidi AL blocks */
2651 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2652 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2653
2654 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2655
2656 /* unassigned code points in the currency symbols block now default to ET */
2657 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2658 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2659
2660 /* new property in Unicode 6.3 */
2661 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2662 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2663 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2664 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2665 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2666 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2667
2668 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2669
2670 /* new character range with Joining_Group values */
2671 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2672 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2673 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2674 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2675 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2676
2677 { -1, 0xa00, 0 }, // version break for Unicode 10
2678
2679 { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2680 { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2681 { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2682 { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2683
2684 { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2685 { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2686 { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2687
2688 /* undefined UProperty values */
2689 { 0x61, 0x4a7, 0 },
2690 { 0x234bc, 0x15ed, 0 }
2691 };
2692
2693 UVersionInfo version;
2694 UChar32 c;
2695 int32_t i, result, uVersion;
2696 UProperty which;
2697
2698 /* what is our Unicode version? */
2699 u_getUnicodeVersion(version);
2700 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2701
2702 u_charAge(0x20, version);
2703 if(version[0]==0) {
2704 /* no additional properties available */
2705 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2706 return;
2707 }
2708
2709 /* test u_charAge() */
2710 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2711 u_charAge(charAges[i].c, version);
2712 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2713 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2714 charAges[i].c,
2715 version[0], version[1], version[2], version[3],
2716 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2717 }
2718 }
2719
2720 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2721 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2722 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2723 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2724 u_getIntPropertyMinValue(0x2345)!=0
2725 ) {
2726 log_err("error: u_getIntPropertyMinValue() wrong\n");
2727 }
2728 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2729 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2730 }
2731 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2732 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2733 }
2734 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2735 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2736 }
2737 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2738 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2739 }
2740 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2741 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2742 }
2743 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2744 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2745 }
2746 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2747 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2748 }
2749 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2750 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2751 }
2752 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2753 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2754 }
2755 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2756 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2757 }
2758 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2759 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2760 }
2761 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2762 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2763 }
2764 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2765 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2766 }
2767 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2768 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2769 }
2770 /*JB#2410*/
2771 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2772 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2773 }
2774 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2775 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2776 }
2777 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2778 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2779 }
2780 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2781 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2782 }
2783 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2784 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2785 }
2786
2787 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2788 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2789 const char *whichName;
2790
2791 if(props[i][0]<0) {
2792 /* Unicode version break */
2793 if(uVersion<props[i][1]) {
2794 break; /* do not test properties that are not yet supported */
2795 } else {
2796 continue; /* skip this row */
2797 }
2798 }
2799
2800 c=(UChar32)props[i][0];
2801 which=(UProperty)props[i][1];
2802 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2803
2804 if(which<UCHAR_INT_START) {
2805 result=u_hasBinaryProperty(c, which);
2806 if(result!=props[i][2]) {
2807 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2808 c, whichName, result, i);
2809 }
2810 }
2811
2812 result=u_getIntPropertyValue(c, which);
2813 if(result!=props[i][2]) {
2814 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2815 c, whichName, result, props[i][2], i);
2816 }
2817
2818 /* test separate functions, too */
2819 switch((UProperty)props[i][1]) {
2820 case UCHAR_ALPHABETIC:
2821 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2822 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2823 props[i][0], result, i);
2824 }
2825 break;
2826 case UCHAR_LOWERCASE:
2827 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2828 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2829 props[i][0], result, i);
2830 }
2831 break;
2832 case UCHAR_UPPERCASE:
2833 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2834 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2835 props[i][0], result, i);
2836 }
2837 break;
2838 case UCHAR_WHITE_SPACE:
2839 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2840 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2841 props[i][0], result, i);
2842 }
2843 break;
2844 default:
2845 break;
2846 }
2847 }
2848 }
2849
2850 static void
2851 TestNumericProperties(void) {
2852 /* see UnicodeData.txt, DerivedNumericValues.txt */
2853 static const struct {
2854 UChar32 c;
2855 int32_t type;
2856 double numValue;
2857 } values[]={
2858 { 0x0F33, U_NT_NUMERIC, -1./2. },
2859 { 0x0C66, U_NT_DECIMAL, 0 },
2860 { 0x96f6, U_NT_NUMERIC, 0 },
2861 { 0xa833, U_NT_NUMERIC, 1./16. },
2862 { 0x2152, U_NT_NUMERIC, 1./10. },
2863 { 0x2151, U_NT_NUMERIC, 1./9. },
2864 { 0x1245f, U_NT_NUMERIC, 1./8. },
2865 { 0x2150, U_NT_NUMERIC, 1./7. },
2866 { 0x2159, U_NT_NUMERIC, 1./6. },
2867 { 0x09f6, U_NT_NUMERIC, 3./16. },
2868 { 0x2155, U_NT_NUMERIC, 1./5. },
2869 { 0x00BD, U_NT_NUMERIC, 1./2. },
2870 { 0x0031, U_NT_DECIMAL, 1. },
2871 { 0x4e00, U_NT_NUMERIC, 1. },
2872 { 0x58f1, U_NT_NUMERIC, 1. },
2873 { 0x10320, U_NT_NUMERIC, 1. },
2874 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2875 { 0x00B2, U_NT_DIGIT, 2. },
2876 { 0x5f10, U_NT_NUMERIC, 2. },
2877 { 0x1813, U_NT_DECIMAL, 3. },
2878 { 0x5f0e, U_NT_NUMERIC, 3. },
2879 { 0x2173, U_NT_NUMERIC, 4. },
2880 { 0x8086, U_NT_NUMERIC, 4. },
2881 { 0x278E, U_NT_DIGIT, 5. },
2882 { 0x1D7F2, U_NT_DECIMAL, 6. },
2883 { 0x247A, U_NT_DIGIT, 7. },
2884 { 0x7396, U_NT_NUMERIC, 9. },
2885 { 0x1372, U_NT_NUMERIC, 10. },
2886 { 0x216B, U_NT_NUMERIC, 12. },
2887 { 0x16EE, U_NT_NUMERIC, 17. },
2888 { 0x249A, U_NT_NUMERIC, 19. },
2889 { 0x303A, U_NT_NUMERIC, 30. },
2890 { 0x5345, U_NT_NUMERIC, 30. },
2891 { 0x32B2, U_NT_NUMERIC, 37. },
2892 { 0x1375, U_NT_NUMERIC, 40. },
2893 { 0x10323, U_NT_NUMERIC, 50. },
2894 { 0x0BF1, U_NT_NUMERIC, 100. },
2895 { 0x964c, U_NT_NUMERIC, 100. },
2896 { 0x217E, U_NT_NUMERIC, 500. },
2897 { 0x2180, U_NT_NUMERIC, 1000. },
2898 { 0x4edf, U_NT_NUMERIC, 1000. },
2899 { 0x2181, U_NT_NUMERIC, 5000. },
2900 { 0x137C, U_NT_NUMERIC, 10000. },
2901 { 0x4e07, U_NT_NUMERIC, 10000. },
2902 { 0x12432, U_NT_NUMERIC, 216000. },
2903 { 0x12433, U_NT_NUMERIC, 432000. },
2904 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2905 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2906 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2907 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2908 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2909 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2910 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2911 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2912 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2913 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2914 };
2915
2916 double nv;
2917 UChar32 c;
2918 int32_t i, type;
2919
2920 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
2921 c=values[i].c;
2922 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2923 nv=u_getNumericValue(c);
2924
2925 if(type!=values[i].type) {
2926 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2927 }
2928 if(0.000001 <= fabs(nv - values[i].numValue)) {
2929 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2930 }
2931 }
2932 }
2933
2934 /**
2935 * Test the property names and property value names API.
2936 */
2937 static void
2938 TestPropertyNames(void) {
2939 int32_t p, v, choice=0, rev;
2940 UBool atLeastSomething = FALSE;
2941
2942 for (p=0; ; ++p) {
2943 UProperty propEnum = (UProperty)p;
2944 UBool sawProp = FALSE;
2945 if(p > 10 && !atLeastSomething) {
2946 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2947 return;
2948 }
2949
2950 for (choice=0; ; ++choice) {
2951 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
2952 if (name) {
2953 if (!sawProp)
2954 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
2955 log_verbose("%d=\"%s\"", choice, name);
2956 sawProp = TRUE;
2957 atLeastSomething = TRUE;
2958
2959 /* test reverse mapping */
2960 rev = u_getPropertyEnum(name);
2961 if (rev != p) {
2962 log_err("Property round-trip failure: %d -> %s -> %d\n",
2963 p, name, rev);
2964 }
2965 }
2966 if (!name && choice>0) break;
2967 }
2968 if (sawProp) {
2969 /* looks like a valid property; check the values */
2970 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2971 int32_t max = 0;
2972 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2973 max = 255;
2974 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2975 /* it's far too slow to iterate all the way up to
2976 the real max, U_GC_P_MASK */
2977 max = U_GC_NL_MASK;
2978 } else if (p == UCHAR_BLOCK) {
2979 /* UBlockCodes, unlike other values, start at 1 */
2980 max = 1;
2981 }
2982 log_verbose("\n");
2983 for (v=-1; ; ++v) {
2984 UBool sawValue = FALSE;
2985 for (choice=0; ; ++choice) {
2986 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
2987 if (vname) {
2988 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2989 log_verbose("%d=\"%s\"", choice, vname);
2990 sawValue = TRUE;
2991
2992 /* test reverse mapping */
2993 rev = u_getPropertyValueEnum(propEnum, vname);
2994 if (rev != v) {
2995 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2996 pname, v, vname, rev);
2997 }
2998 }
2999 if (!vname && choice>0) break;
3000 }
3001 if (sawValue) {
3002 log_verbose("\n");
3003 }
3004 if (!sawValue && v>=max) break;
3005 }
3006 }
3007 if (!sawProp) {
3008 if (p>=UCHAR_STRING_LIMIT) {
3009 break;
3010 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3011 p = UCHAR_STRING_START - 1;
3012 } else if (p>=UCHAR_MASK_LIMIT) {
3013 p = UCHAR_DOUBLE_START - 1;
3014 } else if (p>=UCHAR_INT_LIMIT) {
3015 p = UCHAR_MASK_START - 1;
3016 } else if (p>=UCHAR_BINARY_LIMIT) {
3017 p = UCHAR_INT_START - 1;
3018 }
3019 }
3020 }
3021 }
3022
3023 /**
3024 * Test the property values API. See JB#2410.
3025 */
3026 static void
3027 TestPropertyValues(void) {
3028 int32_t i, p, min, max;
3029 UErrorCode ec;
3030
3031 /* Min should be 0 for everything. */
3032 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3033 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3034 UProperty propEnum = (UProperty)p;
3035 min = u_getIntPropertyMinValue(propEnum);
3036 if (min != 0) {
3037 if (p == UCHAR_BLOCK) {
3038 /* This is okay...for now. See JB#2487.
3039 TODO Update this for JB#2487. */
3040 } else {
3041 const char* name;
3042 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3043 if (name == NULL)
3044 name = "<ERROR>";
3045 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3046 name, min);
3047 }
3048 }
3049 }
3050
3051 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3052 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3053 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3054 }
3055
3056 /* Max should be -1 for invalid properties. */
3057 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3058 if (max != -1) {
3059 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3060 max);
3061 }
3062
3063 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3064 for (i=0; i<2; ++i) {
3065 int32_t script;
3066 const char* desc;
3067 ec = U_ZERO_ERROR;
3068 switch (i) {
3069 case 0:
3070 script = uscript_getScript(-1, &ec);
3071 desc = "uscript_getScript(-1)";
3072 break;
3073 case 1:
3074 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3075 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3076 break;
3077 default:
3078 log_err("Internal test error. Too many scripts\n");
3079 return;
3080 }
3081 /* We don't explicitly test ec. It should be U_FAILURE but it
3082 isn't documented as such. */
3083 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3084 log_err("FAIL: %s = %d, exp. 0\n",
3085 desc, script);
3086 }
3087 }
3088 }
3089
3090 /* various tests for consistency of UCD data and API behavior */
3091 static void
3092 TestConsistency() {
3093 char buffer[300];
3094 USet *set1, *set2, *set3, *set4;
3095 UErrorCode errorCode;
3096
3097 UChar32 start, end;
3098 int32_t i, length;
3099
3100 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3101 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3102 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3103 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3104 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3105
3106 U_STRING_DECL(mathBlocksPattern,
3107 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3108 214);
3109 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3110 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3111 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3112 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3113
3114 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3115 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3116 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3117 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3118 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3119
3120 U_STRING_INIT(mathBlocksPattern,
3121 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3122 214);
3123 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3124 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3125 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3126 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3127
3128 /*
3129 * It used to be that UCD.html and its precursors said
3130 * "Those dashes used to mark connections between pieces of words,
3131 * plus the Katakana middle dot."
3132 *
3133 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3134 * but not from Hyphen.
3135 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3136 * Therefore, do not show errors when testing the Hyphen property.
3137 */
3138 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3139 "known to the UTC and not considered errors.\n");
3140
3141 errorCode=U_ZERO_ERROR;
3142 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3143 set2=uset_openPattern(dashPattern, 8, &errorCode);
3144 if(U_SUCCESS(errorCode)) {
3145 /* remove the Katakana middle dot(s) from set1 */
3146 uset_remove(set1, 0x30fb);
3147 uset_remove(set1, 0xff65); /* halfwidth variant */
3148 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3149 } else {
3150 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3151 }
3152
3153 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3154 set3=uset_openPattern(formatPattern, 6, &errorCode);
3155 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3156 if(U_SUCCESS(errorCode)) {
3157 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3158 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3159 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3160 } else {
3161 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3162 }
3163
3164 uset_close(set1);
3165 uset_close(set2);
3166 uset_close(set3);
3167 uset_close(set4);
3168
3169 /*
3170 * Check that each lowercase character has "small" in its name
3171 * and not "capital".
3172 * There are some such characters, some of which seem odd.
3173 * Use the verbose flag to see these notices.
3174 */
3175 errorCode=U_ZERO_ERROR;
3176 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3177 if(U_SUCCESS(errorCode)) {
3178 for(i=0;; ++i) {
3179 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3180 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3181 break; /* done */
3182 }
3183 if(U_FAILURE(errorCode)) {
3184 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3185 i, u_errorName(errorCode));
3186 break;
3187 }
3188 if(length!=0) {
3189 break; /* done with code points, got a string or -1 */
3190 }
3191
3192 while(start<=end) {
3193 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3194 if(U_FAILURE(errorCode)) {
3195 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3196 errorCode=U_ZERO_ERROR;
3197 }
3198 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3199 strstr(buffer, "SMALL CAPITAL")==NULL
3200 ) {
3201 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3202 }
3203 ++start;
3204 }
3205 }
3206 } else {
3207 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3208 }
3209 uset_close(set1);
3210
3211 /* verify that all assigned characters in Math blocks are exactly Math characters */
3212 errorCode=U_ZERO_ERROR;
3213 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3214 set2=uset_openPattern(mathPattern, 8, &errorCode);
3215 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3216 if(U_SUCCESS(errorCode)) {
3217 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3218 uset_complement(set3); /* assigned characters */
3219 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3220 compareUSets(set1, set2,
3221 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3222 TRUE);
3223 } else {
3224 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3225 }
3226 uset_close(set1);
3227 uset_close(set2);
3228 uset_close(set3);
3229
3230 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3231 errorCode=U_ZERO_ERROR;
3232 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3233 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3234 if(U_SUCCESS(errorCode)) {
3235 compareUSets(set1, set2,
3236 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3237 TRUE);
3238 } else {
3239 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3240 }
3241 uset_close(set1);
3242 uset_close(set2);
3243 }
3244
/*
 * Starting with ICU4C 3.4, the core Unicode properties files
 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
 * are hardcoded in the common DLL and therefore not included
 * in the data package any more.
 * Tests requiring these files are disabled so that
 * we need not jump through hoops (like adding snapshots of these files
 * to testdata).
 * See Jitterbug 4497.
 */
#define HARDCODED_DATA_4497 1

/*
 * API coverage for ubidi_props.c.
 * The whole body is compiled out while HARDCODED_DATA_4497 is non-zero.
 */
static void TestUBiDiProps(void) {
#if !HARDCODED_DATA_4497
    UDataMemory *pData;
    UBiDiProps *bdp;
    const UBiDiProps *cbdp;
    UErrorCode errorCode;

    /* coverage for ubidi_openBinary() */
    errorCode=U_ZERO_ERROR;
    pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
                     u_errorName(errorCode));
        return;
    }

    bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
                u_errorName(errorCode));
        udata_close(pData);
        return;
    }

    if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
        log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
    }

    ubidi_closeProps(bdp);
    udata_close(pData);

    /* coverage for ubidi_getDummy() */
    errorCode=U_ZERO_ERROR;
    cbdp=ubidi_getDummy(&errorCode);
    if(U_FAILURE(errorCode)) {
        /* do not hand a possibly unusable result to ubidi_getClass() */
        log_err("ubidi_getDummy() failed: %s\n", u_errorName(errorCode));
        return;
    }
    if(ubidi_getClass(cbdp, 0x20)!=0) {
        log_err("ubidi_getClass(dummy, space)!=0\n");
    }
#endif
}
3297
3298 /* test case folding, compare return values with CaseFolding.txt ------------ */
3299
/*
 * Bit flags recording which kinds of case folding have already been
 * tested for a character; CF_ALL is the union of the three kinds.
 */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3307
3308 static void
3309 testFold(UChar32 c, int which,
3310 UChar32 simple, UChar32 turkic,
3311 const UChar *full, int32_t fullLength,
3312 const UChar *turkicFull, int32_t turkicFullLength) {
3313 UChar s[2], t[32];
3314 UChar32 c2;
3315 int32_t length, length2;
3316
3317 UErrorCode errorCode=U_ZERO_ERROR;
3318
3319 length=0;
3320 U16_APPEND_UNSAFE(s, length, c);
3321
3322 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3323 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3324 }
3325 if((which&CF_FULL)!=0) {
3326 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3327 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3328 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3329 }
3330 }
3331 if((which&CF_TURKIC)!=0) {
3332 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3333 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3334 }
3335
3336 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3337 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3338 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3339 }
3340 }
3341 }
3342
3343 /* test that c case-folds to itself */
3344 static void
3345 testFoldToSelf(UChar32 c, int which) {
3346 UChar s[2];
3347 int32_t length;
3348
3349 length=0;
3350 U16_APPEND_UNSAFE(s, length, c);
3351 testFold(c, which, c, c, s, length, s, length);
3352 }
3353
3354 struct CaseFoldingData {
3355 USet *notSeen;
3356 UChar32 prev, prevSimple;
3357 UChar prevFull[32];
3358 int32_t prevFullLength;
3359 int which;
3360 };
3361 typedef struct CaseFoldingData CaseFoldingData;
3362
3363 static void U_CALLCONV
3364 caseFoldingLineFn(void *context,
3365 char *fields[][2], int32_t fieldCount,
3366 UErrorCode *pErrorCode) {
3367 CaseFoldingData *pData=(CaseFoldingData *)context;
3368 char *end;
3369 UChar full[32];
3370 UChar32 c, prev, simple;
3371 int32_t count;
3372 int which;
3373 char status;
3374
3375 /* get code point */
3376 const char *s=u_skipWhitespace(fields[0][0]);
3377 if(0==strncmp(s, "0000..10FFFF", 12)) {
3378 /*
3379 * Ignore the line
3380 * # @missing: 0000..10FFFF; C; <code point>
3381 * because maps-to-self is already our default, and this line breaks this parser.
3382 */
3383 return;
3384 }
3385 c=(UChar32)strtoul(s, &end, 16);
3386 end=(char *)u_skipWhitespace(end);
3387 if(end<=fields[0][0] || end!=fields[0][1]) {
3388 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3389 *pErrorCode=U_PARSE_ERROR;
3390 return;
3391 }
3392
3393 /* get the status of this mapping */
3394 status=*u_skipWhitespace(fields[1][0]);
3395 if(status!='C' && status!='S' && status!='F' && status!='T') {
3396 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3397 *pErrorCode=U_PARSE_ERROR;
3398 return;
3399 }
3400
3401 /* get the mapping */
3402 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3403 if(U_FAILURE(*pErrorCode)) {
3404 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3405 return;
3406 }
3407
3408 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3409 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3410 simple=c;
3411 }
3412
3413 if(c!=(prev=pData->prev)) {
3414 /*
3415 * Test remaining mappings for the previous code point.
3416 * If a turkic folding was not mentioned, then it should fold the same
3417 * as the regular simple case folding.
3418 */
3419 UChar prevString[2];
3420 int32_t length;
3421
3422 length=0;
3423 U16_APPEND_UNSAFE(prevString, length, prev);
3424 testFold(prev, (~pData->which)&CF_ALL,
3425 prev, pData->prevSimple,
3426 prevString, length,
3427 pData->prevFull, pData->prevFullLength);
3428 pData->prev=pData->prevSimple=c;
3429 length=0;
3430 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3431 pData->prevFullLength=length;
3432 pData->which=0;
3433 }
3434
3435 /*
3436 * Turn the status into a bit set of case foldings to test.
3437 * Remember non-Turkic case foldings as defaults for Turkic mode.
3438 */
3439 switch(status) {
3440 case 'C':
3441 which=CF_SIMPLE|CF_FULL;
3442 pData->prevSimple=simple;
3443 u_memcpy(pData->prevFull, full, count);
3444 pData->prevFullLength=count;
3445 break;
3446 case 'S':
3447 which=CF_SIMPLE;
3448 pData->prevSimple=simple;
3449 break;
3450 case 'F':
3451 which=CF_FULL;
3452 u_memcpy(pData->prevFull, full, count);
3453 pData->prevFullLength=count;
3454 break;
3455 case 'T':
3456 which=CF_TURKIC;
3457 break;
3458 default:
3459 which=0;
3460 break; /* won't happen because of test above */
3461 }
3462
3463 testFold(c, which, simple, simple, full, count, full, count);
3464
3465 /* remember which case foldings of c have been tested */
3466 pData->which|=which;
3467
3468 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3469 uset_remove(pData->notSeen, c);
3470 }
3471
3472 static void
3473 TestCaseFolding() {
3474 CaseFoldingData data={ NULL };
3475 char *fields[3][2];
3476 UErrorCode errorCode;
3477
3478 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3479
3480 errorCode=U_ZERO_ERROR;
3481 /* test BMP & plane 1 - nothing interesting above */
3482 data.notSeen=uset_open(0, 0x1ffff);
3483 data.prevFullLength=1; /* length of full case folding of U+0000 */
3484
3485 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3486 if(U_SUCCESS(errorCode)) {
3487 int32_t i, start, end;
3488
3489 /* add a pseudo-last line to finish testing of the actual last one */
3490 fields[0][0]=lastLine;
3491 fields[0][1]=lastLine+6;
3492 fields[1][0]=lastLine+7;
3493 fields[1][1]=lastLine+9;
3494 fields[2][0]=lastLine+10;
3495 fields[2][1]=lastLine+17;
3496 caseFoldingLineFn(&data, fields, 3, &errorCode);
3497
3498 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3499 for(i=0;
3500 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3501 U_SUCCESS(errorCode);
3502 ++i
3503 ) {
3504 do {
3505 testFoldToSelf(start, CF_ALL);
3506 } while(++start<=end);
3507 }
3508 }
3509
3510 uset_close(data.notSeen);
3511 }