]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/cucdtst.c
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28 #include "unicode/utf16.h"
29 #include "unicode/utf_old.h"
30 #include "cintltst.h"
31 #include "putilimp.h"
32 #include "uparse.h"
33 #include "ucase.h"
34 #include "ubidi_props.h"
35 #include "uprops.h"
36 #include "uset_imp.h"
37 #include "usc_impl.h"
38 #include "udatamem.h"
39 #include "cucdapi.h"
40 #include "cmemory.h"
41
42 /* prototypes --------------------------------------------------------------- */
43
44 static void TestUpperLower(void);
45 static void TestLetterNumber(void);
46 static void TestMisc(void);
47 static void TestPOSIX(void);
48 static void TestControlPrint(void);
49 static void TestIdentifier(void);
50 static void TestUnicodeData(void);
51 static void TestCodeUnit(void);
52 static void TestCodePoint(void);
53 static void TestCharLength(void);
54 static void TestCharNames(void);
55 static void TestUCharFromNameUnderflow(void);
56 static void TestMirroring(void);
57 static void TestUScriptRunAPI(void);
58 static void TestAdditionalProperties(void);
59 static void TestNumericProperties(void);
60 static void TestPropertyNames(void);
61 static void TestPropertyValues(void);
62 static void TestConsistency(void);
63 static void TestCaseFolding(void);
64 static void TestBinaryCharacterPropertiesAPI(void);
65 static void TestIntCharacterPropertiesAPI(void);
66
67 /* internal methods used */
68 static int32_t MakeProp(char* str);
69 static int32_t MakeDir(char* str);
70
71 /* helpers ------------------------------------------------------------------ */
72
73 static void
74 parseUCDFile(const char *filename,
75 char *fields[][2], int32_t fieldCount,
76 UParseLineFn *lineFn, void *context,
77 UErrorCode *pErrorCode) {
78 char path[256];
79 char backupPath[256];
80
81 if(U_FAILURE(*pErrorCode)) {
82 return;
83 }
84
85 /* Look inside ICU_DATA first */
86 strcpy(path, u_getDataDirectory());
87 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
88 strcat(path, filename);
89
90 /* As a fallback, try to guess where the source data was located
91 * at the time ICU was built, and look there.
92 */
93 strcpy(backupPath, ctest_dataSrcDir());
94 strcat(backupPath, U_FILE_SEP_STRING);
95 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
96 strcat(backupPath, filename);
97
98 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
99 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
100 *pErrorCode=U_ZERO_ERROR;
101 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
102 }
103 if(U_FAILURE(*pErrorCode)) {
104 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
105 }
106 }
107
108 /* test data ---------------------------------------------------------------- */
109
/*
 * Two-letter general category abbreviations, concatenated without separators.
 * The n-th abbreviation (characters 2n and 2n+1) corresponds to tagValues[n].
 * NOTE(review): presumably searched by MakeProp(), whose definition is not in
 * view here; its result is used as an index into tagValues[] - confirm.
 */
static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
/*
 * General category constants, in the same order as the abbreviations
 * in tagStrings[] (see the per-entry comments).
 */
static const int32_t tagValues[] =
    {
    /* Mn */ U_NON_SPACING_MARK,
    /* Mc */ U_COMBINING_SPACING_MARK,
    /* Me */ U_ENCLOSING_MARK,
    /* Nd */ U_DECIMAL_DIGIT_NUMBER,
    /* Nl */ U_LETTER_NUMBER,
    /* No */ U_OTHER_NUMBER,
    /* Zs */ U_SPACE_SEPARATOR,
    /* Zl */ U_LINE_SEPARATOR,
    /* Zp */ U_PARAGRAPH_SEPARATOR,
    /* Cc */ U_CONTROL_CHAR,
    /* Cf */ U_FORMAT_CHAR,
    /* Cs */ U_SURROGATE,
    /* Co */ U_PRIVATE_USE_CHAR,
    /* Cn */ U_UNASSIGNED,
    /* Lu */ U_UPPERCASE_LETTER,
    /* Ll */ U_LOWERCASE_LETTER,
    /* Lt */ U_TITLECASE_LETTER,
    /* Lm */ U_MODIFIER_LETTER,
    /* Lo */ U_OTHER_LETTER,
    /* Pc */ U_CONNECTOR_PUNCTUATION,
    /* Pd */ U_DASH_PUNCTUATION,
    /* Ps */ U_START_PUNCTUATION,
    /* Pe */ U_END_PUNCTUATION,
    /* Po */ U_OTHER_PUNCTUATION,
    /* Sm */ U_MATH_SYMBOL,
    /* Sc */ U_CURRENCY_SYMBOL,
    /* Sk */ U_MODIFIER_SYMBOL,
    /* So */ U_OTHER_SYMBOL,
    /* Pi */ U_INITIAL_PUNCTUATION,
    /* Pf */ U_FINAL_PUNCTUATION
    };
144
/*
 * Bidi class abbreviations. Each entry is stored in a fixed 5-byte slot,
 * which is enough for the longest (3-letter) abbreviation plus its NUL.
 * NOTE(review): presumably the array index equals the UCharDirection value
 * and the table is searched by MakeDir() (definition not in view) - confirm.
 */
static const char dirStrings[][5] = {
    "L",
    "R",
    "EN",
    "ES",
    "ET",
    "AN",
    "CS",
    "B",
    "S",
    "WS",
    "ON",
    "LRE",
    "LRO",
    "AL",
    "RLE",
    "RLO",
    "PDF",
    "NSM",
    "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI",
    "LRI",
    "RLI",
    "PDI"
};
171
172 void addUnicodeTest(TestNode** root);
173
174 void addUnicodeTest(TestNode** root)
175 {
176 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
177 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
178 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
179 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
180 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
181 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
182 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
183 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
184 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
185 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
186 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
187 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
188 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
189 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
190 addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
191 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
192 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
193 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
194 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
195 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
196 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
197 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
198 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
199 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
200 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
201 addTest(root, &TestBinaryCharacterPropertiesAPI,
202 "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
203 addTest(root, &TestIntCharacterPropertiesAPI,
204 "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
205 }
206
207 /*==================================================== */
208 /* test u_toupper() and u_tolower() */
209 /*==================================================== */
210 static void TestUpperLower()
211 {
212 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
213 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
214 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
215 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
216 int32_t i;
217
218 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
219 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
220
221 /*
222 Checks LetterLike Symbols which were previously a source of confusion
223 [Bertrand A. D. 02/04/98]
224 */
225 for (i=0x2100;i<0x2138;i++)
226 {
227 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
228 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
229 {
230 if (i != (int)u_tolower(i)) /* itself */
231 log_err("Failed case conversion with itself: U+%04x\n", i);
232 if (i != (int)u_toupper(i))
233 log_err("Failed case conversion with itself: U+%04x\n", i);
234 }
235 }
236
237 for(i=0; i < u_strlen(upper); i++){
238 if(u_tolower(upper[i]) != lower[i]){
239 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
240 }
241 }
242
243 log_verbose("testing upper lower\n");
244 for (i = 0; i < 21; i++) {
245
246 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
247 {
248 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
249 }
250 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
251 {
252 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
253 }
254 else if (upperTest[i] != u_tolower(lowerTest[i]))
255 {
256 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
257 }
258 else if (lowerTest[i] != u_toupper(upperTest[i]))
259 {
260 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
261 }
262 else if (upperTest[i] != u_tolower(upperTest[i]))
263 {
264 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
265 }
266 else if (lowerTest[i] != u_toupper(lowerTest[i]))
267 {
268 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
269 }
270 }
271 log_verbose("done testing upper lower\n");
272
273 log_verbose("testing u_istitle\n");
274 {
275 static const UChar expected[] = {
276 0x1F88,
277 0x1F89,
278 0x1F8A,
279 0x1F8B,
280 0x1F8C,
281 0x1F8D,
282 0x1F8E,
283 0x1F8F,
284 0x1F88,
285 0x1F89,
286 0x1F8A,
287 0x1F8B,
288 0x1F8C,
289 0x1F8D,
290 0x1F8E,
291 0x1F8F,
292 0x1F98,
293 0x1F99,
294 0x1F9A,
295 0x1F9B,
296 0x1F9C,
297 0x1F9D,
298 0x1F9E,
299 0x1F9F,
300 0x1F98,
301 0x1F99,
302 0x1F9A,
303 0x1F9B,
304 0x1F9C,
305 0x1F9D,
306 0x1F9E,
307 0x1F9F,
308 0x1FA8,
309 0x1FA9,
310 0x1FAA,
311 0x1FAB,
312 0x1FAC,
313 0x1FAD,
314 0x1FAE,
315 0x1FAF,
316 0x1FA8,
317 0x1FA9,
318 0x1FAA,
319 0x1FAB,
320 0x1FAC,
321 0x1FAD,
322 0x1FAE,
323 0x1FAF,
324 0x1FBC,
325 0x1FBC,
326 0x1FCC,
327 0x1FCC,
328 0x1FFC,
329 0x1FFC,
330 };
331 int32_t num = UPRV_LENGTHOF(expected);
332 for(i=0; i<num; i++){
333 if(!u_istitle(expected[i])){
334 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
335 }
336 }
337
338 }
339 }
340
341 /* compare two sets and verify that their difference or intersection is empty */
342 static UBool
343 showADiffB(const USet *a, const USet *b,
344 const char *a_name, const char *b_name,
345 UBool expect, UBool diffIsError) {
346 USet *aa;
347 int32_t i, start, end, length;
348 UErrorCode errorCode;
349
350 /*
351 * expect:
352 * TRUE -> a-b should be empty, that is, b should contain all of a
353 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
354 */
355 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
356 return TRUE;
357 }
358
359 /* clone a to aa because a is const */
360 aa=uset_open(1, 0);
361 if(aa==NULL) {
362 /* unusual problem - out of memory? */
363 return FALSE;
364 }
365 uset_addAll(aa, a);
366
367 /* compute the set in question */
368 if(expect) {
369 /* a-b */
370 uset_removeAll(aa, b);
371 } else {
372 /* a&b */
373 uset_retainAll(aa, b);
374 }
375
376 /* aa is not empty because of the initial tests above; show its contents */
377 errorCode=U_ZERO_ERROR;
378 i=0;
379 for(;;) {
380 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
381 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
382 break; /* done */
383 }
384 if(U_FAILURE(errorCode)) {
385 log_err("error comparing %s with %s at difference item %d: %s\n",
386 a_name, b_name, i, u_errorName(errorCode));
387 break;
388 }
389 if(length!=0) {
390 break; /* done with code points, got a string or -1 */
391 }
392
393 if(diffIsError) {
394 if(expect) {
395 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
396 } else {
397 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
398 }
399 } else {
400 if(expect) {
401 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
402 } else {
403 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
404 }
405 }
406
407 ++i;
408 }
409
410 uset_close(aa);
411 return FALSE;
412 }
413
414 static UBool
415 showAMinusB(const USet *a, const USet *b,
416 const char *a_name, const char *b_name,
417 UBool diffIsError) {
418 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
419 }
420
421 static UBool
422 showAIntersectB(const USet *a, const USet *b,
423 const char *a_name, const char *b_name,
424 UBool diffIsError) {
425 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
426 }
427
428 static UBool
429 compareUSets(const USet *a, const USet *b,
430 const char *a_name, const char *b_name,
431 UBool diffIsError) {
432 /*
433 * Use an arithmetic & not a logical && so that both branches
434 * are always taken and all differences are shown.
435 */
436 return
437 showAMinusB(a, b, a_name, b_name, diffIsError) &
438 showAMinusB(b, a, b_name, a_name, diffIsError);
439 }
440
441 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
442 static void TestLetterNumber()
443 {
444 UChar i = 0x0000;
445
446 log_verbose("Testing for isalpha\n");
447 for (i = 0x0041; i < 0x005B; i++) {
448 if (!u_isalpha(i))
449 {
450 log_err("Failed isLetter test at %.4X\n", i);
451 }
452 }
453 for (i = 0x0660; i < 0x066A; i++) {
454 if (u_isalpha(i))
455 {
456 log_err("Failed isLetter test with numbers at %.4X\n", i);
457 }
458 }
459
460 log_verbose("Testing for isdigit\n");
461 for (i = 0x0660; i < 0x066A; i++) {
462 if (!u_isdigit(i))
463 {
464 log_verbose("Failed isNumber test at %.4X\n", i);
465 }
466 }
467
468 log_verbose("Testing for isalnum\n");
469 for (i = 0x0041; i < 0x005B; i++) {
470 if (!u_isalnum(i))
471 {
472 log_err("Failed isAlNum test at %.4X\n", i);
473 }
474 }
475 for (i = 0x0660; i < 0x066A; i++) {
476 if (!u_isalnum(i))
477 {
478 log_err("Failed isAlNum test at %.4X\n", i);
479 }
480 }
481
482 {
483 /*
484 * The following checks work only starting from Unicode 4.0.
485 * Check the version number here.
486 */
487 static UVersionInfo u401={ 4, 0, 1, 0 };
488 UVersionInfo version;
489 u_getUnicodeVersion(version);
490 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
491 return;
492 }
493 }
494
495 {
496 /*
497 * Sanity check:
498 * Verify that exactly the digit characters have decimal digit values.
499 * This assumption is used in the implementation of u_digit()
500 * (which checks nt=de)
501 * compared with the parallel java.lang.Character.digit()
502 * (which checks Nd).
503 *
504 * This was not true in Unicode 3.2 and earlier.
505 * Unicode 4.0 fixed discrepancies.
506 * Unicode 4.0.1 re-introduced problems in this area due to an
507 * unintentionally incomplete last-minute change.
508 */
509 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
510 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
511
512 USet *digits, *decimalValues;
513 UErrorCode errorCode;
514
515 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
516 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
517 errorCode=U_ZERO_ERROR;
518 digits=uset_openPattern(digitsPattern, 6, &errorCode);
519 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
520
521 if(U_SUCCESS(errorCode)) {
522 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
523 }
524
525 uset_close(digits);
526 uset_close(decimalValues);
527 }
528 }
529
530 static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
531 const UChar32 *sampleChars, int32_t sampleCharsLength,
532 UBool expected) {
533 int32_t i;
534 for (i = 0; i < sampleCharsLength; ++i) {
535 UBool result = propFn(sampleChars[i]);
536 if (result != expected) {
537 log_err("error: character property function %s(U+%04x)=%d is wrong\n",
538 propName, sampleChars[i], result);
539 }
540 }
541 }
542
543 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
544 static void TestMisc()
545 {
546 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
547 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
548 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
549 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
550 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
551 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
552 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
553 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
554 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
555 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
556 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
557
558 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
559
560 uint32_t mask;
561
562 int32_t i;
563 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
564 UVersionInfo realVersion;
565
566 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
567
568 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
569 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
570
571 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
572 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
573 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
574 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
575
576 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
577 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
578 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
579 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
580
581 testSampleCharProps(u_isdefined, "u_isdefined",
582 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
583 testSampleCharProps(u_isdefined, "u_isdefined",
584 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
585
586 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
587 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
588
589 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
590 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
591
592 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
593 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
594 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
595 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
596 }
597 }
598
599 /* Tests the ICU version #*/
600 u_getVersion(realVersion);
601 u_versionToString(realVersion, icuVersion);
602 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
603 {
604 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
605 }
606 #if defined(ICU_VERSION)
607 /* test only happens where we have configure.in with VERSION - sanity check. */
608 if(strcmp(U_ICU_VERSION, ICU_VERSION))
609 {
610 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
611 }
612 #endif
613
614 /* test U_GC_... */
615 if(
616 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
617 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
618 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
619 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
620 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
621 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
622 ) {
623 log_err("error: U_GET_GC_MASK does not work properly\n");
624 }
625
626 mask=0;
627 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
628
629 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
630 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
631 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
632 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
633 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
634
635 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
636 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
637 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
638
639 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
640 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
641 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
642
643 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
644 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
645 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
646
647 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
648 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
649 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
650 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
651
652 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
653 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
654 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
655 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
656 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
657
658 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
659 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
660 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
661 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
662
663 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
664 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
665
666 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
667 log_err("error: problems with U_GC_XX_MASK constants\n");
668 }
669
670 mask=0;
671 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
672 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
673 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
674 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
675 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
676 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
677 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
678
679 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
680 log_err("error: problems with U_GC_Y_MASK constants\n");
681 }
682 {
683 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
684 for(i=0; i<10; i++){
685 if(digit[i]!=u_forDigit(i,10)){
686 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
687 }
688 }
689 }
690
691 /* test u_digit() */
692 {
693 static const struct {
694 UChar32 c;
695 int8_t radix, value;
696 } data[]={
697 /* base 16 */
698 { 0x0031, 16, 1 },
699 { 0x0038, 16, 8 },
700 { 0x0043, 16, 12 },
701 { 0x0066, 16, 15 },
702 { 0x00e4, 16, -1 },
703 { 0x0662, 16, 2 },
704 { 0x06f5, 16, 5 },
705 { 0xff13, 16, 3 },
706 { 0xff41, 16, 10 },
707
708 /* base 8 */
709 { 0x0031, 8, 1 },
710 { 0x0038, 8, -1 },
711 { 0x0043, 8, -1 },
712 { 0x0066, 8, -1 },
713 { 0x00e4, 8, -1 },
714 { 0x0662, 8, 2 },
715 { 0x06f5, 8, 5 },
716 { 0xff13, 8, 3 },
717 { 0xff41, 8, -1 },
718
719 /* base 36 */
720 { 0x5a, 36, 35 },
721 { 0x7a, 36, 35 },
722 { 0xff3a, 36, 35 },
723 { 0xff5a, 36, 35 },
724
725 /* wrong radix values */
726 { 0x0031, 1, -1 },
727 { 0xff3a, 37, -1 }
728 };
729
730 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
731 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
732 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
733 data[i].c,
734 data[i].radix,
735 u_digit(data[i].c, data[i].radix),
736 data[i].value);
737 }
738 }
739 }
740 }
741
742 /* test C/POSIX-style functions --------------------------------------------- */
743
/*
 * bit flags
 *
 * One bit per C/POSIX-style classification function.
 * The flag with value 1<<k belongs to posixClasses[k] below: TestPOSIX() walks
 * that table while shifting a one-bit mask left, so flags and table entries
 * must stay in the same order.
 */
#define ISAL     1
#define ISLO     2
#define ISUP     4

#define ISDI     8
#define ISXD  0x10

#define ISAN  0x20

#define ISPU  0x40
#define ISGR  0x80
#define ISPR 0x100

#define ISSP 0x200
#define ISBL 0x400
#define ISCN 0x800

/* C/POSIX-style functions, in the same order as the bit flags */
typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);

/* each classification function together with its name for error messages */
static const struct {
    IsPOSIXClass *fn;
    const char *name;
} posixClasses[]={
    { u_isalpha, "isalpha" },
    { u_islower, "islower" },
    { u_isupper, "isupper" },
    { u_isdigit, "isdigit" },
    { u_isxdigit, "isxdigit" },
    { u_isalnum, "isalnum" },
    { u_ispunct, "ispunct" },
    { u_isgraph, "isgraph" },
    { u_isprint, "isprint" },
    { u_isspace, "isspace" },
    { u_isblank, "isblank" },
    { u_iscntrl, "iscntrl" }
};
782
/*
 * Sample code points for TestPOSIX().
 * posixResults is the set of IS.. bit flags for which the corresponding
 * classification function is expected to return TRUE for c;
 * for every flag that is not set the function is expected to return FALSE.
 */
static const struct {
    UChar32 c;
    uint32_t posixResults;
} posixData[]={
    { 0x0008, ISCN },    /* backspace */
    { 0x0009, ISSP|ISBL|ISCN },    /* TAB */
    { 0x000a, ISSP| ISCN },    /* LF */
    { 0x000c, ISSP| ISCN },    /* FF */
    { 0x000d, ISSP| ISCN },    /* CR */
    { 0x0020, ISPR|ISSP|ISBL },    /* space */
    { 0x0021, ISPU|ISGR|ISPR },    /* ! */
    { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR },    /* 3 */
    { 0x0040, ISPU|ISGR|ISPR },    /* @ */
    { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR },    /* A */
    { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR },    /* z */
    { 0x007b, ISPU|ISGR|ISPR },    /* { */
    { 0x0085, ISSP| ISCN },    /* NEL */
    { 0x00a0, ISPR|ISSP|ISBL },    /* NBSP */
    { 0x00a4, ISGR|ISPR },    /* currency sign */
    { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR },    /* a-umlaut */
    { 0x0300, ISGR|ISPR },    /* combining grave */
    { 0x0600, ISCN },    /* arabic number sign */
    { 0x0627, ISAL| ISAN| ISGR|ISPR },    /* alef */
    { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR },    /* arabic 3 */
    { 0x2002, ISPR|ISSP|ISBL },    /* en space */
    { 0x2007, ISPR|ISSP|ISBL },    /* figure space */
    { 0x2009, ISPR|ISSP|ISBL },    /* thin space */
    { 0x200b, ISCN },    /* ZWSP */
    /*{ 0x200b, ISPR|ISSP },*/    /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
    { 0x200e, ISCN },    /* LRM */
    { 0x2028, ISPR|ISSP| ISCN },    /* LS */
    { 0x2029, ISPR|ISSP| ISCN },    /* PS */
    { 0x20ac, ISGR|ISPR },    /* Euro */
    { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR },    /* fullwidth 5 */
    { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR },    /* fullwidth E */
    { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR },    /* fullwidth U */
    { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR },    /* fullwidth e */
    { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR }     /* fullwidth u */
};
822
823 static void
824 TestPOSIX() {
825 uint32_t mask;
826 int32_t cl, i;
827 UBool expect;
828
829 mask=1;
830 for(cl=0; cl<12; ++cl) {
831 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
832 expect=(UBool)((posixData[i].posixResults&mask)!=0);
833 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
834 log_err("u_%s(U+%04x)=%s is wrong\n",
835 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
836 }
837 }
838 mask<<=1;
839 }
840 }
841
842 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
843 static void TestControlPrint()
844 {
845 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
846 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
847 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
848 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
849 UChar32 c;
850
851 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
852 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
853
854 testSampleCharProps(u_isprint, "u_isprint",
855 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
856 testSampleCharProps(u_isprint, "u_isprint",
857 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
858
859 /* test all ISO 8 controls */
860 for(c=0; c<=0x9f; ++c) {
861 if(c==0x20) {
862 /* skip ASCII graphic characters and continue with DEL */
863 c=0x7f;
864 }
865 if(!u_iscntrl(c)) {
866 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
867 }
868 if(!u_isISOControl(c)) {
869 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
870 }
871 if(u_isprint(c)) {
872 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
873 }
874 }
875
876 /* test all Latin-1 graphic characters */
877 for(c=0x20; c<=0xff; ++c) {
878 if(c==0x7f) {
879 c=0xa0;
880 } else if(c==0xad) {
881 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
882 ++c;
883 }
884 if(!u_isprint(c)) {
885 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
886 }
887 }
888 }
889
890 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
891 static void TestIdentifier()
892 {
893 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
894 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
895 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
896 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
897 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
898 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
899 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
900 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
901 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
902 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
903
904 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
905 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
906 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
907 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
908
909 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
910 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
911 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
912 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
913
914 /* IDPart should imply IDStart */
915 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
916 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
917
918 testSampleCharProps(u_isIDStart, "u_isIDStart",
919 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
920 testSampleCharProps(u_isIDStart, "u_isIDStart",
921 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
922
923 testSampleCharProps(u_isIDPart, "u_isIDPart",
924 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
925 testSampleCharProps(u_isIDPart, "u_isIDPart",
926 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
927
928 /* IDPart should imply IDStart */
929 testSampleCharProps(u_isIDPart, "u_isIDPart",
930 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
931
932 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
933 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
934 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
935 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
936 }
937
938 /* for each line of UnicodeData.txt, check some of the properties */
/*
 * Context passed (as void *) to unicodeDataLineFn() for each parsed line.
 * With normalization compiled in, it carries the normalizer instances that the
 * line function uses for its checks; otherwise it only holds a placeholder member
 * so that the struct is not empty.
 */
typedef struct UnicodeDataContext {
#if UCONFIG_NO_NORMALIZATION
    const void *dummy;
#else
    const UNormalizer2 *nfc;
    const UNormalizer2 *nfkc;
#endif
} UnicodeDataContext;
947
948 /*
949 * ### TODO
950 * This test fails incorrectly if the First or Last code point of a repetitive area
951 * is overridden, which is allowed and is encouraged for the PUAs.
952 * Currently, this means that both area First/Last and override lines are
953 * tested against the properties from the API,
954 * and the area boundary will not match and cause an error.
955 *
956 * This function should detect area boundaries and skip them for the test of individual
957 * code points' properties.
958 * Then it should check that the areas contain all the same properties except where overridden.
959 * For this, it would have had to set a flag for which code points were listed explicitly.
960 */
961 static void U_CALLCONV
962 unicodeDataLineFn(void *context,
963 char *fields[][2], int32_t fieldCount,
964 UErrorCode *pErrorCode)
965 {
966 char buffer[100];
967 const char *d;
968 char *end;
969 uint32_t value;
970 UChar32 c;
971 int32_t i;
972 int8_t type;
973 int32_t dt;
974 UChar dm[32], s[32];
975 int32_t dmLength, length;
976
977 #if !UCONFIG_NO_NORMALIZATION
978 const UNormalizer2 *nfc, *nfkc;
979 #endif
980
981 /* get the character code, field 0 */
982 c=strtoul(fields[0][0], &end, 16);
983 if(end<=fields[0][0] || end!=fields[0][1]) {
984 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
985 return;
986 }
987 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
988 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
989 return;
990 }
991
992 /* get general category, field 2 */
993 *fields[2][1]=0;
994 type = (int8_t)tagValues[MakeProp(fields[2][0])];
995 if(u_charType(c)!=type) {
996 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
997 }
998 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
999 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1000 }
1001
1002 /* get canonical combining class, field 3 */
1003 value=strtoul(fields[3][0], &end, 10);
1004 if(end<=fields[3][0] || end!=fields[3][1]) {
1005 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1006 return;
1007 }
1008 if(value>255) {
1009 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1010 return;
1011 }
1012 #if !UCONFIG_NO_NORMALIZATION
1013 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1014 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1015 }
1016 nfkc=((UnicodeDataContext *)context)->nfkc;
1017 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1018 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1019 }
1020 #endif
1021
1022 /* get BiDi category, field 4 */
1023 *fields[4][1]=0;
1024 i=MakeDir(fields[4][0]);
1025 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1026 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1027 }
1028
1029 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1030 d=NULL;
1031 if(fields[5][0]==fields[5][1]) {
1032 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1033 if(c==0xac00 || c==0xd7a3) {
1034 dt=U_DT_CANONICAL;
1035 } else {
1036 dt=U_DT_NONE;
1037 }
1038 } else {
1039 d=fields[5][0];
1040 *fields[5][1]=0;
1041 dt=UCHAR_INVALID_CODE;
1042 if(*d=='<') {
1043 end=strchr(++d, '>');
1044 if(end!=NULL) {
1045 *end=0;
1046 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1047 d=u_skipWhitespace(end+1);
1048 }
1049 } else {
1050 dt=U_DT_CANONICAL;
1051 }
1052 }
1053 if(dt>U_DT_NONE) {
1054 if(c==0xac00) {
1055 dm[0]=0x1100;
1056 dm[1]=0x1161;
1057 dm[2]=0;
1058 dmLength=2;
1059 } else if(c==0xd7a3) {
1060 dm[0]=0xd788;
1061 dm[1]=0x11c2;
1062 dm[2]=0;
1063 dmLength=2;
1064 } else {
1065 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1066 }
1067 } else {
1068 dmLength=-1;
1069 }
1070 if(dt<0 || U_FAILURE(*pErrorCode)) {
1071 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1072 return;
1073 }
1074 #if !UCONFIG_NO_NORMALIZATION
1075 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1076 if(i!=dt) {
1077 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1078 }
1079 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1080 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1081 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1082 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1083 "or the Decomposition_Mapping is different (%s)\n",
1084 c, length, dmLength, u_errorName(*pErrorCode));
1085 return;
1086 }
1087 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1088 if(dt!=U_DT_CANONICAL) {
1089 dmLength=-1;
1090 }
1091 nfc=((UnicodeDataContext *)context)->nfc;
1092 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1093 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1094 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1095 "or the Decomposition_Mapping is different (%s)\n",
1096 c, length, dmLength, u_errorName(*pErrorCode));
1097 return;
1098 }
1099 /* recompose */
1100 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1101 UChar32 a, b, composite;
1102 i=0;
1103 U16_NEXT(dm, i, dmLength, a);
1104 U16_NEXT(dm, i, dmLength, b);
1105 /* i==dmLength */
1106 composite=unorm2_composePair(nfc, a, b);
1107 if(composite!=c) {
1108 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1109 (long)c, (long)a, (long)b, (long)composite);
1110 }
1111 /*
1112 * Note: NFKC has fewer round-trip mappings than NFC,
1113 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1114 */
1115 }
1116 #endif
1117
1118 /* get ISO Comment, field 11 */
1119 *fields[11][1]=0;
1120 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1121 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1122 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1123 c, u_errorName(*pErrorCode),
1124 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1125 fields[11][0]);
1126 }
1127
1128 /* get uppercase mapping, field 12 */
1129 if(fields[12][0]!=fields[12][1]) {
1130 value=strtoul(fields[12][0], &end, 16);
1131 if(end!=fields[12][1]) {
1132 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1133 return;
1134 }
1135 if((UChar32)value!=u_toupper(c)) {
1136 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1137 }
1138 } else {
1139 /* no case mapping: the API must map the code point to itself */
1140 if(c!=u_toupper(c)) {
1141 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1142 }
1143 }
1144
1145 /* get lowercase mapping, field 13 */
1146 if(fields[13][0]!=fields[13][1]) {
1147 value=strtoul(fields[13][0], &end, 16);
1148 if(end!=fields[13][1]) {
1149 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1150 return;
1151 }
1152 if((UChar32)value!=u_tolower(c)) {
1153 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1154 }
1155 } else {
1156 /* no case mapping: the API must map the code point to itself */
1157 if(c!=u_tolower(c)) {
1158 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1159 }
1160 }
1161
1162 /* get titlecase mapping, field 14 */
1163 if(fields[14][0]!=fields[14][1]) {
1164 value=strtoul(fields[14][0], &end, 16);
1165 if(end!=fields[14][1]) {
1166 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1167 return;
1168 }
1169 if((UChar32)value!=u_totitle(c)) {
1170 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1171 }
1172 } else {
1173 /* no case mapping: the API must map the code point to itself */
1174 if(c!=u_totitle(c)) {
1175 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1176 }
1177 }
1178 }
1179
1180 static UBool U_CALLCONV
1181 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1182 static const UChar32 test[][2]={
1183 {0x41, U_UPPERCASE_LETTER},
1184 {0x308, U_NON_SPACING_MARK},
1185 {0xfffe, U_GENERAL_OTHER_TYPES},
1186 {0xe0041, U_FORMAT_CHAR},
1187 {0xeffff, U_UNASSIGNED}
1188 };
1189
1190 int32_t i, count;
1191
1192 if(0!=strcmp((const char *)context, "a1")) {
1193 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1194 return FALSE;
1195 }
1196
1197 count=UPRV_LENGTHOF(test);
1198 for(i=0; i<count; ++i) {
1199 if(start<=test[i][0] && test[i][0]<limit) {
1200 if(type!=(UCharCategory)test[i][1]) {
1201 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1202 start, limit, (long)type, test[i][0], test[i][1]);
1203 }
1204 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1205 return i==(count-1) ? FALSE : TRUE;
1206 }
1207 }
1208
1209 if(start>test[count-1][0]) {
1210 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1211 start, limit, (long)type);
1212 return FALSE;
1213 }
1214
1215 return TRUE;
1216 }
1217
1218 static UBool U_CALLCONV
1219 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1220 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1221 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1222 { 0x0590, U_LEFT_TO_RIGHT },
1223 { 0x0600, U_RIGHT_TO_LEFT },
1224 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1225 { 0x0860, U_RIGHT_TO_LEFT },
1226 { 0x0870, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 10 changes U+0860..U+086F from R to AL.
1227 { 0x08A0, U_RIGHT_TO_LEFT },
1228 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1229 { 0x20A0, U_LEFT_TO_RIGHT },
1230 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1231 { 0xFB1D, U_LEFT_TO_RIGHT },
1232 { 0xFB50, U_RIGHT_TO_LEFT },
1233 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1234 { 0xFE70, U_LEFT_TO_RIGHT },
1235 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1236
1237 { 0x10800, U_LEFT_TO_RIGHT },
1238 { 0x10D00, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1239 { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1240 { 0x10F30, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1241 { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
1242 { 0x11000, U_RIGHT_TO_LEFT },
1243
1244 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1245 { 0x1EC70, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1246 { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
1247 { 0x1ED00, U_RIGHT_TO_LEFT }, // Unicode 12 changes U+1ED00..U+1ED4F from R to AL.
1248 { 0x1ED50, U_RIGHT_TO_LEFT_ARABIC },
1249 { 0x1EE00, U_RIGHT_TO_LEFT },
1250 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1251 { 0x1F000, U_RIGHT_TO_LEFT },
1252 { 0x110000, U_LEFT_TO_RIGHT }
1253 };
1254
1255 UChar32 c;
1256 int32_t i;
1257 UCharDirection shouldBeDir;
1258
1259 /*
1260 * LineBreak.txt specifies:
1261 * # - Assigned characters that are not listed explicitly are given the value
1262 * # "AL".
1263 * # - Unassigned characters are given the value "XX".
1264 *
1265 * PUA characters are listed explicitly with "XX".
1266 * Verify that no assigned character has "XX".
1267 */
1268 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1269 c=start;
1270 while(c<limit) {
1271 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1272 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1273 }
1274 ++c;
1275 }
1276 }
1277
1278 /*
1279 * Verify default Bidi classes.
1280 * See DerivedBidiClass.txt, especially for unassigned code points.
1281 */
1282 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1283 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1284 c=start;
1285 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1286 if((int32_t)c<defaultBidi[i][0]) {
1287 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1288 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1289 shouldBeDir=U_BOUNDARY_NEUTRAL;
1290 } else {
1291 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1292 }
1293
1294 if( u_charDirection(c)!=shouldBeDir ||
1295 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1296 ) {
1297 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1298 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1299 }
1300 ++c;
1301 }
1302 }
1303 }
1304 }
1305
1306 return TRUE;
1307 }
1308
1309 /* tests for several properties */
1310 static void TestUnicodeData()
1311 {
1312 UVersionInfo expectVersionArray;
1313 UVersionInfo versionArray;
1314 char *fields[15][2];
1315 UErrorCode errorCode;
1316 UChar32 c;
1317 int8_t type;
1318
1319 UnicodeDataContext context;
1320
1321 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1322 u_getUnicodeVersion(versionArray);
1323 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1324 {
1325 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1326 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1327 }
1328
1329 #if defined(ICU_UNICODE_VERSION)
1330 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1331 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1332 {
1333 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1334 }
1335 #endif
1336
1337 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1338 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1339 }
1340
1341 errorCode=U_ZERO_ERROR;
1342 #if !UCONFIG_NO_NORMALIZATION
1343 context.nfc=unorm2_getNFCInstance(&errorCode);
1344 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1345 if(U_FAILURE(errorCode)) {
1346 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1347 return;
1348 }
1349 #endif
1350 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1351 if(U_FAILURE(errorCode)) {
1352 return; /* if we couldn't parse UnicodeData.txt, we should return */
1353 }
1354
1355 /* sanity check on repeated properties */
1356 for(c=0xfffe; c<=0x10ffff;) {
1357 type=u_charType(c);
1358 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1359 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1360 }
1361 if(type!=U_UNASSIGNED) {
1362 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1363 }
1364 if((c&0xffff)==0xfffe) {
1365 ++c;
1366 } else {
1367 c+=0xffff;
1368 }
1369 }
1370
1371 /* test that PUA is not "unassigned" */
1372 for(c=0xe000; c<=0x10fffd;) {
1373 type=u_charType(c);
1374 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1375 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1376 }
1377 if(type==U_UNASSIGNED) {
1378 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1379 } else if(type!=U_PRIVATE_USE_CHAR) {
1380 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1381 }
1382 if(c==0xf8ff) {
1383 c=0xf0000;
1384 } else if(c==0xffffd) {
1385 c=0x100000;
1386 } else {
1387 ++c;
1388 }
1389 }
1390
1391 /* test u_enumCharTypes() */
1392 u_enumCharTypes(enumTypeRange, "a1");
1393
1394 /* check default properties */
1395 u_enumCharTypes(enumDefaultsRange, NULL);
1396 }
1397
1398 static void TestCodeUnit(){
1399 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1400
1401 int32_t i;
1402
1403 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1404 UChar c=codeunit[i];
1405 if(i<4){
1406 if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1407 U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1408 log_err("ERROR: U+%04x is a single", c);
1409 }
1410
1411 }
1412 if(i >= 4 && i< 8){
1413 if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1414 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1415 log_err("ERROR: U+%04x is a first surrogate", c);
1416 }
1417 }
1418 if(i >= 8 && i< 12){
1419 if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1420 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1421 log_err("ERROR: U+%04x is a second surrogate", c);
1422 }
1423 }
1424 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1425 if(i<4){
1426 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1427 log_err("ERROR: U+%04x is a single", c);
1428 }
1429
1430 }
1431 if(i >= 4 && i< 8){
1432 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1433 log_err("ERROR: U+%04x is a first surrogate", c);
1434 }
1435 }
1436 if(i >= 8 && i< 12){
1437 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1438 log_err("ERROR: U+%04x is a second surrogate", c);
1439 }
1440 }
1441 #endif
1442 }
1443 }
1444
1445 static void TestCodePoint(){
1446 const UChar32 codePoint[]={
1447 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1448 0xd800,
1449 0xdbff,
1450 0xdc00,
1451 0xdfff,
1452 0xdc04,
1453 0xd821,
1454 /*not a surrogate, valid, isUnicodeChar , not Error*/
1455 0x20ac,
1456 0xd7ff,
1457 0xe000,
1458 0xe123,
1459 0x0061,
1460 0xe065,
1461 0x20402,
1462 0x24506,
1463 0x23456,
1464 0x20402,
1465 0x10402,
1466 0x23456,
1467 /*not a surrogate, not valid, isUnicodeChar, isError */
1468 0x0015,
1469 0x009f,
1470 /*not a surrogate, not valid, not isUnicodeChar, isError */
1471 0xffff,
1472 0xfffe,
1473 };
1474 int32_t i;
1475 for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1476 UChar32 c=codePoint[i];
1477 if(i<6) {
1478 if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1479 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1480 }
1481 if(U_IS_UNICODE_CHAR(c)) {
1482 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1483 }
1484 } else if(i >=6 && i<18) {
1485 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1486 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1487 }
1488 if(!U_IS_UNICODE_CHAR(c)) {
1489 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1490 }
1491 } else if(i >=18 && i<20) {
1492 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1493 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1494 }
1495 if(!U_IS_UNICODE_CHAR(c)) {
1496 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1497 }
1498 } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1499 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1500 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1501 }
1502 if(U_IS_UNICODE_CHAR(c)) {
1503 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1504 }
1505 }
1506 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1507 if(i<6){
1508 if(!UTF_IS_SURROGATE(c)){
1509 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1510 }
1511 if(UTF_IS_VALID(c)){
1512 log_err("ERROR: isValid() failed for U+%04x\n", c);
1513 }
1514 if(UTF_IS_UNICODE_CHAR(c)){
1515 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1516 }
1517 if(UTF_IS_ERROR(c)){
1518 log_err("ERROR: isError() failed for U+%04x\n", c);
1519 }
1520 }else if(i >=6 && i<18){
1521 if(UTF_IS_SURROGATE(c)){
1522 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1523 }
1524 if(!UTF_IS_VALID(c)){
1525 log_err("ERROR: isValid() failed for U+%04x\n", c);
1526 }
1527 if(!UTF_IS_UNICODE_CHAR(c)){
1528 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1529 }
1530 if(UTF_IS_ERROR(c)){
1531 log_err("ERROR: isError() failed for U+%04x\n", c);
1532 }
1533 }else if(i >=18 && i<20){
1534 if(UTF_IS_SURROGATE(c)){
1535 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1536 }
1537 if(UTF_IS_VALID(c)){
1538 log_err("ERROR: isValid() failed for U+%04x\n", c);
1539 }
1540 if(!UTF_IS_UNICODE_CHAR(c)){
1541 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1542 }
1543 if(!UTF_IS_ERROR(c)){
1544 log_err("ERROR: isError() failed for U+%04x\n", c);
1545 }
1546 }
1547 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1548 if(UTF_IS_SURROGATE(c)){
1549 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1550 }
1551 if(UTF_IS_VALID(c)){
1552 log_err("ERROR: isValid() failed for U+%04x\n", c);
1553 }
1554 if(UTF_IS_UNICODE_CHAR(c)){
1555 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1556 }
1557 if(!UTF_IS_ERROR(c)){
1558 log_err("ERROR: isError() failed for U+%04x\n", c);
1559 }
1560 }
1561 #endif
1562 }
1563
1564 if(
1565 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1566 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1567 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1568 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1569 ) {
1570 log_err("error with U_IS_BMP()\n");
1571 }
1572
1573 if(
1574 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1575 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1576 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1577 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1578 ) {
1579 log_err("error with U_IS_SUPPLEMENTARY()\n");
1580 }
1581 }
1582
1583 static void TestCharLength()
1584 {
1585 const int32_t codepoint[]={
1586 1, 0x0061,
1587 1, 0xe065,
1588 1, 0x20ac,
1589 2, 0x20402,
1590 2, 0x23456,
1591 2, 0x24506,
1592 2, 0x20402,
1593 2, 0x10402,
1594 1, 0xd7ff,
1595 1, 0xe000
1596 };
1597
1598 int32_t i;
1599 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1600 UBool multiple;
1601 #endif
1602 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1603 UChar32 c=codepoint[i+1];
1604 if(
1605 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1606 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1607 #endif
1608 U16_LENGTH(c) != codepoint[i]) {
1609 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1610 }
1611 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1612 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1613 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1614 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1615 }
1616 #endif
1617 }
1618 }
1619
1620 /*internal functions ----*/
1621 static int32_t MakeProp(char* str)
1622 {
1623 int32_t result = 0;
1624 char* matchPosition =0;
1625
1626 matchPosition = strstr(tagStrings, str);
1627 if (matchPosition == 0)
1628 {
1629 log_err("unrecognized type letter ");
1630 log_err(str);
1631 }
1632 else
1633 result = (int32_t)((matchPosition - tagStrings) / 2);
1634 return result;
1635 }
1636
1637 static int32_t MakeDir(char* str)
1638 {
1639 int32_t pos = 0;
1640 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1641 if (strcmp(str, dirStrings[pos]) == 0) {
1642 return pos;
1643 }
1644 }
1645 return -1;
1646 }
1647
1648 /* test u_charName() -------------------------------------------------------- */
1649
1650 static const struct {
1651 uint32_t code;
1652 const char *name, *oldName, *extName, *alias;
1653 } names[]={
1654 {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"},
1655 {0x01a2, "LATIN CAPITAL LETTER OI", "",
1656 "LATIN CAPITAL LETTER OI",
1657 "LATIN CAPITAL LETTER GHA"},
1658 {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
1659 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" },
1660 {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
1661 "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
1662 "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
1663 {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" },
1664 {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" },
1665 {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" },
1666 {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" },
1667 {0xd800, "", "", "<lead surrogate-D800>" },
1668 {0xdc00, "", "", "<trail surrogate-DC00>" },
1669 {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS" },
1670 {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" },
1671 {0xffff, "", "", "<noncharacter-FFFF>" },
1672 {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
1673 "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
1674 "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
1675 {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" }
1676 };
1677
1678 static UBool
1679 enumCharNamesFn(void *context,
1680 UChar32 code, UCharNameChoice nameChoice,
1681 const char *name, int32_t length) {
1682 int32_t *pCount=(int32_t *)context;
1683 const char *expected;
1684 int i;
1685
1686 if(length<=0 || length!=(int32_t)strlen(name)) {
1687 /* should not be called with an empty string or invalid length */
1688 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1689 return TRUE;
1690 }
1691
1692 ++*pCount;
1693 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1694 if(code==(UChar32)names[i].code) {
1695 switch (nameChoice) {
1696 case U_EXTENDED_CHAR_NAME:
1697 if(0!=strcmp(name, names[i].extName)) {
1698 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1699 }
1700 break;
1701 case U_UNICODE_CHAR_NAME:
1702 if(0!=strcmp(name, names[i].name)) {
1703 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1704 }
1705 break;
1706 case U_UNICODE_10_CHAR_NAME:
1707 expected=names[i].oldName;
1708 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1709 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1710 }
1711 break;
1712 case U_CHAR_NAME_ALIAS:
1713 expected=names[i].alias;
1714 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1715 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1716 }
1717 break;
1718 case U_CHAR_NAME_CHOICE_COUNT:
1719 break;
1720 }
1721 break;
1722 }
1723 }
1724 return TRUE;
1725 }
1726
/* State that enumExtCharNamesFn() receives through its context pointer. */
struct enumExtCharNamesContext {
    uint32_t length; /* passed by address to enumCharNamesFn(), which increments it as its name counter */
    int32_t last;    /* code point of the previous callback; the next one is expected to be last+1 */
};
1731
1732 static UBool
1733 enumExtCharNamesFn(void *context,
1734 UChar32 code, UCharNameChoice nameChoice,
1735 const char *name, int32_t length) {
1736 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1737
1738 if (ecncp->last != (int32_t) code - 1) {
1739 if (ecncp->last < 0) {
1740 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1741 } else {
1742 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1743 }
1744 }
1745 ecncp->last = (int32_t) code;
1746
1747 if (!*name) {
1748 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1749 }
1750
1751 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1752 }
1753
1754 /**
1755 * This can be made more efficient by moving it into putil.c and having
1756 * it directly access the ebcdic translation tables.
1757 * TODO: If we get this method in putil.c, then delete it from here.
1758 */
1759 static UChar
1760 u_charToUChar(char c) {
1761 UChar uc;
1762 u_charsToUChars(&c, &uc, 1);
1763 return uc;
1764 }
1765
1766 static void
1767 TestCharNames() {
1768 static char name[80];
1769 UErrorCode errorCode=U_ZERO_ERROR;
1770 struct enumExtCharNamesContext extContext;
1771 const char *expected;
1772 int32_t length;
1773 UChar32 c;
1774 int32_t i;
1775
1776 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1777 length=uprv_getMaxCharNameLength();
1778 if(length==0) {
1779 /* no names data available */
1780 return;
1781 }
1782 if(length<83) { /* Unicode 3.2 max char name length */
1783 log_err("uprv_getMaxCharNameLength()=%d is too short");
1784 }
1785 /* ### TODO same tests for max ISO comment length as for max name length */
1786
1787 log_verbose("Testing u_charName()\n");
1788 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1789 /* modern Unicode character name */
1790 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1791 if(U_FAILURE(errorCode)) {
1792 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1793 return;
1794 }
1795 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1796 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1797 }
1798
1799 /* find the modern name */
1800 if (*names[i].name) {
1801 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1802 if(U_FAILURE(errorCode)) {
1803 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1804 return;
1805 }
1806 if(c!=(UChar32)names[i].code) {
1807 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1808 }
1809 }
1810
1811 /* Unicode 1.0 character name */
1812 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1813 if(U_FAILURE(errorCode)) {
1814 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1815 return;
1816 }
1817 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1818 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1819 }
1820
1821 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1822 if(names[i].oldName[0]!=0 /* && length>0 */) {
1823 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1824 if(U_FAILURE(errorCode)) {
1825 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1826 return;
1827 }
1828 if(c!=(UChar32)names[i].code) {
1829 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1830 }
1831 }
1832
1833 /* Unicode character name alias */
1834 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1835 if(U_FAILURE(errorCode)) {
1836 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1837 return;
1838 }
1839 expected=names[i].alias;
1840 if(expected==NULL) {
1841 expected="";
1842 }
1843 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1844 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1845 names[i].code, name, length, expected);
1846 }
1847
1848 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1849 if(expected[0]!=0 /* && length>0 */) {
1850 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1851 if(U_FAILURE(errorCode)) {
1852 log_err("u_charFromName(%s - alias) error %s\n",
1853 expected, u_errorName(errorCode));
1854 return;
1855 }
1856 if(c!=(UChar32)names[i].code) {
1857 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1858 expected, c, names[i].code);
1859 }
1860 }
1861 }
1862
1863 /* test u_enumCharNames() */
1864 length=0;
1865 errorCode=U_ZERO_ERROR;
1866 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1867 if(U_FAILURE(errorCode) || length<94140) {
1868 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1869 }
1870
1871 extContext.length = 0;
1872 extContext.last = -1;
1873 errorCode=U_ZERO_ERROR;
1874 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1875 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1876 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1877 }
1878
1879 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1880 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1881 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1882 }
1883
1884 /* Test getCharNameCharacters */
1885 if(!getTestOption(QUICK_OPTION)) {
1886 enum { BUFSIZE = 256 };
1887 UErrorCode ec = U_ZERO_ERROR;
1888 char buf[BUFSIZE];
1889 int32_t maxLength;
1890 UChar32 cp;
1891 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1892 int32_t l1, l2;
1893 UBool map[256];
1894 UBool ok;
1895
1896 USet* set = uset_open(1, 0); /* empty set */
1897 USet* dumb = uset_open(1, 0); /* empty set */
1898
1899 /*
1900 * uprv_getCharNameCharacters() will likely return more lowercase
1901 * letters than actual character names contain because
1902 * it includes all the characters in lowercased names of
1903 * general categories, for the full possible set of extended names.
1904 */
1905 {
1906 USetAdder sa={
1907 NULL,
1908 uset_add,
1909 uset_addRange,
1910 uset_addString,
1911 NULL /* don't need remove() */
1912 };
1913 sa.set=set;
1914 uprv_getCharNameCharacters(&sa);
1915 }
1916
1917 /* build set the dumb (but sure-fire) way */
1918 for (i=0; i<256; ++i) {
1919 map[i] = FALSE;
1920 }
1921
1922 maxLength=0;
1923 for (cp=0; cp<0x110000; ++cp) {
1924 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1925 buf, BUFSIZE, &ec);
1926 if (U_FAILURE(ec)) {
1927 log_err("FAIL: u_charName failed when it shouldn't\n");
1928 uset_close(set);
1929 uset_close(dumb);
1930 return;
1931 }
1932 if(len>maxLength) {
1933 maxLength=len;
1934 }
1935
1936 for (i=0; i<len; ++i) {
1937 if (!map[(uint8_t) buf[i]]) {
1938 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1939 map[(uint8_t) buf[i]] = TRUE;
1940 }
1941 }
1942
1943 /* test for leading/trailing whitespace */
1944 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1945 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1946 }
1947 }
1948
1949 if(map[(uint8_t)'\t']) {
1950 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1951 }
1952
1953 length=uprv_getMaxCharNameLength();
1954 if(length!=maxLength) {
1955 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1956 length, maxLength);
1957 }
1958
1959 /* compare the sets. Where is my uset_equals?!! */
1960 ok=TRUE;
1961 for(i=0; i<256; ++i) {
1962 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1963 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1964 /* ignore lowercase a-z that are in set but not in dumb */
1965 ok=TRUE;
1966 } else {
1967 ok=FALSE;
1968 break;
1969 }
1970 }
1971 }
1972
1973 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1974 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1975 if (U_FAILURE(ec)) {
1976 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1977 uset_close(set);
1978 uset_close(dumb);
1979 return;
1980 }
1981
1982 if (l1 >= BUFSIZE) {
1983 l1 = BUFSIZE-1;
1984 pat[l1] = 0;
1985 }
1986 if (l2 >= BUFSIZE) {
1987 l2 = BUFSIZE-1;
1988 dumbPat[l2] = 0;
1989 }
1990
1991 if (!ok) {
1992 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1993 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1994 } else if(getTestOption(VERBOSITY_OPTION)) {
1995 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1996 }
1997
1998 uset_close(set);
1999 uset_close(dumb);
2000 }
2001
2002 /* ### TODO: test error cases and other interesting things */
2003 }
2004
2005 static void
2006 TestUCharFromNameUnderflow() {
2007 // Ticket #10889: Underflow crash when there is no dash.
2008 const char *name="<NO BREAK SPACE>";
2009 UErrorCode errorCode=U_ZERO_ERROR;
2010 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2011 if(U_SUCCESS(errorCode)) {
2012 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2013 name, c, u_errorName(errorCode));
2014 }
2015
2016 // Test related edge cases.
2017 name="<-00a0>";
2018 errorCode=U_ZERO_ERROR;
2019 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2020 if(U_SUCCESS(errorCode)) {
2021 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2022 name, c, u_errorName(errorCode));
2023 }
2024
2025 errorCode=U_ZERO_ERROR;
2026 name="<control->";
2027 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2028 if(U_SUCCESS(errorCode)) {
2029 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2030 name, c, u_errorName(errorCode));
2031 }
2032
2033 errorCode=U_ZERO_ERROR;
2034 name="<control-111111>";
2035 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2036 if(U_SUCCESS(errorCode)) {
2037 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2038 name, c, u_errorName(errorCode));
2039 }
2040
2041 // ICU-20292: integer overflow
2042 errorCode=U_ZERO_ERROR;
2043 name="<noncharacter-10010FFFF>";
2044 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2045 if(U_SUCCESS(errorCode)) {
2046 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2047 name, c, u_errorName(errorCode));
2048 }
2049
2050 errorCode=U_ZERO_ERROR;
2051 name="<noncharacter-00010FFFF>"; // too many digits even if only leading 0s
2052 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2053 if(U_SUCCESS(errorCode)) {
2054 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2055 name, c, u_errorName(errorCode));
2056 }
2057
2058 errorCode=U_ZERO_ERROR;
2059 name="<noncharacter-fFFf>>";
2060 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2061 if(U_SUCCESS(errorCode)) {
2062 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2063 name, c, u_errorName(errorCode));
2064 }
2065 }
2066
2067 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2068
2069 static void
2070 TestMirroring() {
2071 USet *set;
2072 UErrorCode errorCode;
2073
2074 UChar32 start, end, c2, c3;
2075 int32_t i;
2076
2077 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2078
2079 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2080
2081 log_verbose("Testing u_isMirrored()\n");
2082 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2083 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2084 )
2085 ) {
2086 log_err("u_isMirrored() does not work correctly\n");
2087 }
2088
2089 log_verbose("Testing u_charMirror()\n");
2090 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2091 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2092 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2093 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2094 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2095 )
2096 ) {
2097 log_err("u_charMirror() does not work correctly\n");
2098 }
2099
2100 /* verify that Bidi_Mirroring_Glyph roundtrips */
2101 errorCode=U_ZERO_ERROR;
2102 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2103
2104 if (U_FAILURE(errorCode)) {
2105 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2106 } else {
2107 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2108 do {
2109 c2=u_charMirror(start);
2110 c3=u_charMirror(c2);
2111 if(c3!=start) {
2112 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2113 }
2114 c3=u_getBidiPairedBracket(start);
2115 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2116 if(c3!=start) {
2117 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2118 (long)start);
2119 }
2120 } else {
2121 if(c3!=c2) {
2122 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2123 (long)start, (long)c2);
2124 }
2125 }
2126 } while(++start<=end);
2127 }
2128 }
2129
2130 uset_close(set);
2131 }
2132
2133
2134 struct RunTestData
2135 {
2136 const char *runText;
2137 UScriptCode runCode;
2138 };
2139
2140 typedef struct RunTestData RunTestData;
2141
2142 static void
2143 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2144 const char *prefix)
2145 {
2146 int32_t run, runStart, runLimit;
2147 UScriptCode runCode;
2148
2149 /* iterate over all the runs */
2150 run = 0;
2151 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2152 if (runStart != runStarts[run]) {
2153 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2154 prefix, run, runStarts[run], runStart);
2155 }
2156
2157 if (runLimit != runStarts[run + 1]) {
2158 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2159 prefix, run, runStarts[run + 1], runLimit);
2160 }
2161
2162 if (runCode != testData[run].runCode) {
2163 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2164 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2165 }
2166
2167 run += 1;
2168
2169 /* stop when we've seen all the runs we expect to see */
2170 if (run >= nRuns) {
2171 break;
2172 }
2173 }
2174
2175 /* Complain if we didn't see then number of runs we expected */
2176 if (run != nRuns) {
2177 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2178 }
2179 }
2180
2181 static void
2182 TestUScriptRunAPI()
2183 {
2184 static const RunTestData testData1[] = {
2185 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2186 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2187 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2188 {"English (", USCRIPT_LATIN},
2189 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2190 {") ", USCRIPT_LATIN},
2191 {"\\u6F22\\u5B75", USCRIPT_HAN},
2192 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2193 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2194 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2195 };
2196
2197 static const RunTestData testData2[] = {
2198 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2199 };
2200
2201 static const struct {
2202 const RunTestData *testData;
2203 int32_t nRuns;
2204 } testDataEntries[] = {
2205 {testData1, UPRV_LENGTHOF(testData1)},
2206 {testData2, UPRV_LENGTHOF(testData2)}
2207 };
2208
2209 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2210 int32_t testEntry;
2211
2212 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2213 UChar testString[1024];
2214 int32_t runStarts[256];
2215 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2216 const RunTestData *testData = testDataEntries[testEntry].testData;
2217
2218 int32_t run, stringLimit;
2219 UScriptRun *scriptRun = NULL;
2220 UErrorCode err;
2221
2222 /*
2223 * Fill in the test string and the runStarts array.
2224 */
2225 stringLimit = 0;
2226 for (run = 0; run < nTestRuns; run += 1) {
2227 runStarts[run] = stringLimit;
2228 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2229 /*stringLimit -= 1;*/
2230 }
2231
2232 /* The limit of the last run */
2233 runStarts[nTestRuns] = stringLimit;
2234
2235 /*
2236 * Make sure that calling uscript_OpenRun with a NULL text pointer
2237 * and a non-zero text length returns the correct error.
2238 */
2239 err = U_ZERO_ERROR;
2240 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2241
2242 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2243 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2244 }
2245
2246 if (scriptRun != NULL) {
2247 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2248 uscript_closeRun(scriptRun);
2249 }
2250
2251 /*
2252 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2253 * and a zero text length returns the correct error.
2254 */
2255 err = U_ZERO_ERROR;
2256 scriptRun = uscript_openRun(testString, 0, &err);
2257
2258 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2259 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2260 }
2261
2262 if (scriptRun != NULL) {
2263 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2264 uscript_closeRun(scriptRun);
2265 }
2266
2267 /*
2268 * Make sure that calling uscript_openRun with a NULL text pointer
2269 * and a zero text length doesn't return an error.
2270 */
2271 err = U_ZERO_ERROR;
2272 scriptRun = uscript_openRun(NULL, 0, &err);
2273
2274 if (U_FAILURE(err)) {
2275 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2276 }
2277
2278 /* Make sure that the empty iterator doesn't find any runs */
2279 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2280 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2281 }
2282
2283 /*
2284 * Make sure that calling uscript_setRunText with a NULL text pointer
2285 * and a non-zero text length returns the correct error.
2286 */
2287 err = U_ZERO_ERROR;
2288 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2289
2290 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2291 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2292 }
2293
2294 /*
2295 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2296 * and a zero text length returns the correct error.
2297 */
2298 err = U_ZERO_ERROR;
2299 uscript_setRunText(scriptRun, testString, 0, &err);
2300
2301 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2302 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2303 }
2304
2305 /*
2306 * Now call uscript_setRunText on the empty iterator
2307 * and make sure that it works.
2308 */
2309 err = U_ZERO_ERROR;
2310 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2311
2312 if (U_FAILURE(err)) {
2313 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2314 } else {
2315 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2316 }
2317
2318 uscript_closeRun(scriptRun);
2319
2320 /*
2321 * Now open an interator over the testString
2322 * using uscript_openRun and make sure that it works
2323 */
2324 scriptRun = uscript_openRun(testString, stringLimit, &err);
2325
2326 if (U_FAILURE(err)) {
2327 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2328 } else {
2329 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2330 }
2331
2332 /* Now reset the iterator, and make sure
2333 * that it still works.
2334 */
2335 uscript_resetRun(scriptRun);
2336
2337 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2338
2339 /* Close the iterator */
2340 uscript_closeRun(scriptRun);
2341 }
2342 }
2343
2344 /* test additional, non-core properties */
2345 static void
2346 TestAdditionalProperties() {
2347 /* test data for u_charAge() */
2348 static const struct {
2349 UChar32 c;
2350 UVersionInfo version;
2351 } charAges[]={
2352 {0x41, { 1, 1, 0, 0 }},
2353 {0xffff, { 1, 1, 0, 0 }},
2354 {0x20ab, { 2, 0, 0, 0 }},
2355 {0x2fffe, { 2, 0, 0, 0 }},
2356 {0x20ac, { 2, 1, 0, 0 }},
2357 {0xfb1d, { 3, 0, 0, 0 }},
2358 {0x3f4, { 3, 1, 0, 0 }},
2359 {0x10300, { 3, 1, 0, 0 }},
2360 {0x220, { 3, 2, 0, 0 }},
2361 {0xff60, { 3, 2, 0, 0 }}
2362 };
2363
2364 /* test data for u_hasBinaryProperty() */
2365 static const int32_t
2366 props[][3]={ /* code point, property, value */
2367 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2368 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2369 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2370
2371 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2372 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2373
2374 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2375 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2376
2377 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2378 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2379
2380 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2381 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2382 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2383 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2384 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2385
2386 { 0x058a, UCHAR_DASH, TRUE },
2387 { 0x007e, UCHAR_DASH, FALSE },
2388
2389 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2390 { 0x3000, UCHAR_DIACRITIC, FALSE },
2391
2392 { 0x0e46, UCHAR_EXTENDER, TRUE },
2393 { 0x0020, UCHAR_EXTENDER, FALSE },
2394
2395 #if !UCONFIG_NO_NORMALIZATION
2396 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2397 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2398 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2399
2400 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2401 { 0x0308, UCHAR_NFD_INERT, FALSE },
2402
2403 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2404 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2405
2406 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2407 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2408 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2409 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2410 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2411 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2412
2413 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2414 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2415
2416 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2417 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2418 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2419 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2420 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2421 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2422 #endif
2423
2424 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2425 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2426 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2427
2428 { 0x30fb, UCHAR_HYPHEN, TRUE },
2429 { 0xfe58, UCHAR_HYPHEN, FALSE },
2430
2431 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2432 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2433 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2434
2435 { 0x2172, UCHAR_ID_START, TRUE },
2436 { 0x007a, UCHAR_ID_START, TRUE },
2437 { 0x0039, UCHAR_ID_START, FALSE },
2438
2439 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2440 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2441 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2442
2443 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2444 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2445
2446 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2447 { 0x0345, UCHAR_LOWERCASE, TRUE },
2448 { 0x0030, UCHAR_LOWERCASE, FALSE },
2449
2450 { 0x1d7a9, UCHAR_MATH, TRUE },
2451 { 0x2135, UCHAR_MATH, TRUE },
2452 { 0x0062, UCHAR_MATH, FALSE },
2453
2454 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2455 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2456 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2457
2458 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2459 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2460 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2461
2462 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2463 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2464
2465 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2466 { 0x2162, UCHAR_UPPERCASE, TRUE },
2467 { 0x0345, UCHAR_UPPERCASE, FALSE },
2468
2469 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2470 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2471 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2472
2473 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2474 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2475 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2476
2477 { 0x16ee, UCHAR_XID_START, TRUE },
2478 { 0x23456, UCHAR_XID_START, TRUE },
2479 { 0x1d1aa, UCHAR_XID_START, FALSE },
2480
2481 /*
2482 * Version break:
2483 * The following properties are only supported starting with the
2484 * Unicode version indicated in the second field.
2485 */
2486 { -1, 0x320, 0 },
2487
2488 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2489 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2490 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2491
2492 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2493 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2494 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2495 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2496
2497 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2498 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2499 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2500 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2501
2502 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2503 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2504 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2505 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2506
2507 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2508 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2509
2510 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2511 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2512
2513 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2514 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2515
2516 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2517 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2518
2519 { 0x2e9b, UCHAR_RADICAL, TRUE },
2520 { 0x4e00, UCHAR_RADICAL, FALSE },
2521
2522 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2523 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2524
2525 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2526 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2527
2528 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2529
2530 { 0x002e, UCHAR_S_TERM, TRUE },
2531 { 0x0061, UCHAR_S_TERM, FALSE },
2532
2533 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2534 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2535 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2536 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2537
2538 /* enum/integer type properties */
2539
2540 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2541 /* test default Bidi classes for unassigned code points */
2542 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2543 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2544 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2545 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2546 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2547 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2548 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2549 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2550 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2551 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2552 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2553
2554 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2555 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2556 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2557 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2558 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2559 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2560 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2561
2562 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2563 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2564 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2565 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2566 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2567 { 0x0870, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2568 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2569 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2570 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2571 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2572 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2573
2574 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2575 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2576
2577 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2578 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2579 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2580 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2581 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2582 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2583 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2584 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2585 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2586
2587 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2588 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2589 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2590 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2591 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2592 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2593 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2594 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2595 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2596 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2597 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2598 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2599 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2600 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2601 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2602 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2603 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2604
2605 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2606 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2607 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2608
2609 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2610 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2611 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2612 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2613 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2614
2615 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2616 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2617 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2618 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2619 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2620 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2621 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2622 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2623
2624 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2625 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2626 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2627 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2628 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2629 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2630 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2631 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2632 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2633 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2634 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2635 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2636 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2637 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2638 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2639 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2640
2641 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2642
2643 /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
2644
2645 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2646 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2647 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2648 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2649 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2650 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2651 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2652
2653 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2654 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2655 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2656 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2657
2658 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2659 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2660 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2661 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2662 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2663 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2664
2665 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2666 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2667 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2668 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2669
2670 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2671 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2672 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2673 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2674 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2675 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2676 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2677
2678 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2679 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2680 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2681 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2682
2683 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2684 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2685 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2686 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2687
2688 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2689 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2690 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2691 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2692 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2693
2694 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2695
2696 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2697
2698 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2699 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2700 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2701
2702 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2703 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2704 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2705 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2706 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2707
2708 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2709 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2710 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2711
2712 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2713 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2714 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2715 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2716
2717 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2718 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2719 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2720 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2721 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2722 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2723
2724 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2725 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2726 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2727 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2728
2729 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2730 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2731 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2732 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2733
2734 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2735 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2736 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2737 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2738
2739 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2740
2741 /* unassigned code points in new default Bidi R blocks */
2742 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2743 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2744
2745 /* test some script codes >127 */
2746 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2747 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2748 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2749
2750 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2751
2752 /* value changed in Unicode 6.0 */
2753 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2754
2755 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2756
2757 /* unassigned code points in new/changed default Bidi AL blocks */
2758 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2759 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2760
2761 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2762
2763 /* unassigned code points in the currency symbols block now default to ET */
2764 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2765 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2766
2767 /* new property in Unicode 6.3 */
2768 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2769 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2770 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2771 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2772 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2773 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2774
2775 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2776
2777 /* new character range with Joining_Group values */
2778 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2779 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2780 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2781 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2782 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2783
2784 { -1, 0xa00, 0 }, // version break for Unicode 10
2785
2786 { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2787 { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2788 { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2789 { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2790
2791 { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2792 { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2793 { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2794
2795 /* undefined UProperty values */
2796 { 0x61, 0x4a7, 0 },
2797 { 0x234bc, 0x15ed, 0 }
2798 };
2799
2800 UVersionInfo version;
2801 UChar32 c;
2802 int32_t i, result, uVersion;
2803 UProperty which;
2804
2805 /* what is our Unicode version? */
2806 u_getUnicodeVersion(version);
2807 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2808
2809 u_charAge(0x20, version);
2810 if(version[0]==0) {
2811 /* no additional properties available */
2812 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2813 return;
2814 }
2815
2816 /* test u_charAge() */
2817 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2818 u_charAge(charAges[i].c, version);
2819 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2820 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2821 charAges[i].c,
2822 version[0], version[1], version[2], version[3],
2823 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2824 }
2825 }
2826
2827 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2828 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2829 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2830 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2831 u_getIntPropertyMinValue(0x2345)!=0
2832 ) {
2833 log_err("error: u_getIntPropertyMinValue() wrong\n");
2834 }
2835 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2836 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2837 }
2838 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2839 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2840 }
2841 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2842 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2843 }
2844 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2845 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2846 }
2847 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2848 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2849 }
2850 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2851 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2852 }
2853 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2854 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2855 }
2856 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2857 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2858 }
2859 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2860 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2861 }
2862 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2863 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2864 }
2865 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2866 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2867 }
2868 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2869 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2870 }
2871 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2872 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2873 }
2874 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2875 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2876 }
2877 /*JB#2410*/
2878 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2879 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2880 }
2881 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2882 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2883 }
2884 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2885 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2886 }
2887 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2888 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2889 }
2890 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2891 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2892 }
2893
2894 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2895 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2896 const char *whichName;
2897
2898 if(props[i][0]<0) {
2899 /* Unicode version break */
2900 if(uVersion<props[i][1]) {
2901 break; /* do not test properties that are not yet supported */
2902 } else {
2903 continue; /* skip this row */
2904 }
2905 }
2906
2907 c=(UChar32)props[i][0];
2908 which=(UProperty)props[i][1];
2909 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2910
2911 if(which<UCHAR_INT_START) {
2912 result=u_hasBinaryProperty(c, which);
2913 if(result!=props[i][2]) {
2914 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2915 c, whichName, result, i);
2916 }
2917 }
2918
2919 result=u_getIntPropertyValue(c, which);
2920 if(result!=props[i][2]) {
2921 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2922 c, whichName, result, props[i][2], i);
2923 }
2924
2925 /* test separate functions, too */
2926 switch((UProperty)props[i][1]) {
2927 case UCHAR_ALPHABETIC:
2928 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2929 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2930 props[i][0], result, i);
2931 }
2932 break;
2933 case UCHAR_LOWERCASE:
2934 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2935 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2936 props[i][0], result, i);
2937 }
2938 break;
2939 case UCHAR_UPPERCASE:
2940 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2941 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2942 props[i][0], result, i);
2943 }
2944 break;
2945 case UCHAR_WHITE_SPACE:
2946 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2947 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2948 props[i][0], result, i);
2949 }
2950 break;
2951 default:
2952 break;
2953 }
2954 }
2955 }
2956
2957 static void
2958 TestNumericProperties(void) {
2959 /* see UnicodeData.txt, DerivedNumericValues.txt */
2960 static const struct {
2961 UChar32 c;
2962 int32_t type;
2963 double numValue;
2964 } values[]={
2965 { 0x0F33, U_NT_NUMERIC, -1./2. },
2966 { 0x0C66, U_NT_DECIMAL, 0 },
2967 { 0x96f6, U_NT_NUMERIC, 0 },
2968 { 0xa833, U_NT_NUMERIC, 1./16. },
2969 { 0x2152, U_NT_NUMERIC, 1./10. },
2970 { 0x2151, U_NT_NUMERIC, 1./9. },
2971 { 0x1245f, U_NT_NUMERIC, 1./8. },
2972 { 0x2150, U_NT_NUMERIC, 1./7. },
2973 { 0x2159, U_NT_NUMERIC, 1./6. },
2974 { 0x09f6, U_NT_NUMERIC, 3./16. },
2975 { 0x2155, U_NT_NUMERIC, 1./5. },
2976 { 0x00BD, U_NT_NUMERIC, 1./2. },
2977 { 0x0031, U_NT_DECIMAL, 1. },
2978 { 0x4e00, U_NT_NUMERIC, 1. },
2979 { 0x58f1, U_NT_NUMERIC, 1. },
2980 { 0x10320, U_NT_NUMERIC, 1. },
2981 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2982 { 0x00B2, U_NT_DIGIT, 2. },
2983 { 0x5f10, U_NT_NUMERIC, 2. },
2984 { 0x1813, U_NT_DECIMAL, 3. },
2985 { 0x5f0e, U_NT_NUMERIC, 3. },
2986 { 0x2173, U_NT_NUMERIC, 4. },
2987 { 0x8086, U_NT_NUMERIC, 4. },
2988 { 0x278E, U_NT_DIGIT, 5. },
2989 { 0x1D7F2, U_NT_DECIMAL, 6. },
2990 { 0x247A, U_NT_DIGIT, 7. },
2991 { 0x7396, U_NT_NUMERIC, 9. },
2992 { 0x1372, U_NT_NUMERIC, 10. },
2993 { 0x216B, U_NT_NUMERIC, 12. },
2994 { 0x16EE, U_NT_NUMERIC, 17. },
2995 { 0x249A, U_NT_NUMERIC, 19. },
2996 { 0x303A, U_NT_NUMERIC, 30. },
2997 { 0x5345, U_NT_NUMERIC, 30. },
2998 { 0x32B2, U_NT_NUMERIC, 37. },
2999 { 0x1375, U_NT_NUMERIC, 40. },
3000 { 0x10323, U_NT_NUMERIC, 50. },
3001 { 0x0BF1, U_NT_NUMERIC, 100. },
3002 { 0x964c, U_NT_NUMERIC, 100. },
3003 { 0x217E, U_NT_NUMERIC, 500. },
3004 { 0x2180, U_NT_NUMERIC, 1000. },
3005 { 0x4edf, U_NT_NUMERIC, 1000. },
3006 { 0x2181, U_NT_NUMERIC, 5000. },
3007 { 0x137C, U_NT_NUMERIC, 10000. },
3008 { 0x4e07, U_NT_NUMERIC, 10000. },
3009 { 0x12432, U_NT_NUMERIC, 216000. },
3010 { 0x12433, U_NT_NUMERIC, 432000. },
3011 { 0x4ebf, U_NT_NUMERIC, 100000000. },
3012 { 0x5146, U_NT_NUMERIC, 1000000000000. },
3013 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
3014 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
3015 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
3016 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
3017 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
3018 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
3019 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
3020 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
3021 };
3022
3023 double nv;
3024 UChar32 c;
3025 int32_t i, type;
3026
3027 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
3028 c=values[i].c;
3029 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
3030 nv=u_getNumericValue(c);
3031
3032 if(type!=values[i].type) {
3033 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
3034 }
3035 if(0.000001 <= fabs(nv - values[i].numValue)) {
3036 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
3037 }
3038 }
3039 }
3040
3041 /**
3042 * Test the property names and property value names API.
3043 */
3044 static void
3045 TestPropertyNames(void) {
3046 int32_t p, v, choice=0, rev;
3047 UBool atLeastSomething = FALSE;
3048
3049 for (p=0; ; ++p) {
3050 UProperty propEnum = (UProperty)p;
3051 UBool sawProp = FALSE;
3052 if(p > 10 && !atLeastSomething) {
3053 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3054 return;
3055 }
3056
3057 for (choice=0; ; ++choice) {
3058 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3059 if (name) {
3060 if (!sawProp)
3061 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3062 log_verbose("%d=\"%s\"", choice, name);
3063 sawProp = TRUE;
3064 atLeastSomething = TRUE;
3065
3066 /* test reverse mapping */
3067 rev = u_getPropertyEnum(name);
3068 if (rev != p) {
3069 log_err("Property round-trip failure: %d -> %s -> %d\n",
3070 p, name, rev);
3071 }
3072 }
3073 if (!name && choice>0) break;
3074 }
3075 if (sawProp) {
3076 /* looks like a valid property; check the values */
3077 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3078 int32_t max = 0;
3079 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3080 max = 255;
3081 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3082 /* it's far too slow to iterate all the way up to
3083 the real max, U_GC_P_MASK */
3084 max = U_GC_NL_MASK;
3085 } else if (p == UCHAR_BLOCK) {
3086 /* UBlockCodes, unlike other values, start at 1 */
3087 max = 1;
3088 }
3089 log_verbose("\n");
3090 for (v=-1; ; ++v) {
3091 UBool sawValue = FALSE;
3092 for (choice=0; ; ++choice) {
3093 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3094 if (vname) {
3095 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3096 log_verbose("%d=\"%s\"", choice, vname);
3097 sawValue = TRUE;
3098
3099 /* test reverse mapping */
3100 rev = u_getPropertyValueEnum(propEnum, vname);
3101 if (rev != v) {
3102 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3103 pname, v, vname, rev);
3104 }
3105 }
3106 if (!vname && choice>0) break;
3107 }
3108 if (sawValue) {
3109 log_verbose("\n");
3110 }
3111 if (!sawValue && v>=max) break;
3112 }
3113 }
3114 if (!sawProp) {
3115 if (p>=UCHAR_STRING_LIMIT) {
3116 break;
3117 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3118 p = UCHAR_STRING_START - 1;
3119 } else if (p>=UCHAR_MASK_LIMIT) {
3120 p = UCHAR_DOUBLE_START - 1;
3121 } else if (p>=UCHAR_INT_LIMIT) {
3122 p = UCHAR_MASK_START - 1;
3123 } else if (p>=UCHAR_BINARY_LIMIT) {
3124 p = UCHAR_INT_START - 1;
3125 }
3126 }
3127 }
3128 }
3129
3130 /**
3131 * Test the property values API. See JB#2410.
3132 */
3133 static void
3134 TestPropertyValues(void) {
3135 int32_t i, p, min, max;
3136 UErrorCode ec;
3137
3138 /* Min should be 0 for everything. */
3139 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3140 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3141 UProperty propEnum = (UProperty)p;
3142 min = u_getIntPropertyMinValue(propEnum);
3143 if (min != 0) {
3144 if (p == UCHAR_BLOCK) {
3145 /* This is okay...for now. See JB#2487.
3146 TODO Update this for JB#2487. */
3147 } else {
3148 const char* name;
3149 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3150 if (name == NULL)
3151 name = "<ERROR>";
3152 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3153 name, min);
3154 }
3155 }
3156 }
3157
3158 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3159 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3160 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3161 }
3162
3163 /* Max should be -1 for invalid properties. */
3164 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3165 if (max != -1) {
3166 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3167 max);
3168 }
3169
3170 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3171 for (i=0; i<2; ++i) {
3172 int32_t script;
3173 const char* desc;
3174 ec = U_ZERO_ERROR;
3175 switch (i) {
3176 case 0:
3177 script = uscript_getScript(-1, &ec);
3178 desc = "uscript_getScript(-1)";
3179 break;
3180 case 1:
3181 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3182 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3183 break;
3184 default:
3185 log_err("Internal test error. Too many scripts\n");
3186 return;
3187 }
3188 /* We don't explicitly test ec. It should be U_FAILURE but it
3189 isn't documented as such. */
3190 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3191 log_err("FAIL: %s = %d, exp. 0\n",
3192 desc, script);
3193 }
3194 }
3195 }
3196
3197 /* various tests for consistency of UCD data and API behavior */
3198 static void
3199 TestConsistency() {
3200 char buffer[300];
3201 USet *set1, *set2, *set3, *set4;
3202 UErrorCode errorCode;
3203
3204 UChar32 start, end;
3205 int32_t i, length;
3206
3207 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3208 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3209 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3210 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3211 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3212
3213 U_STRING_DECL(mathBlocksPattern,
3214 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3215 214);
3216 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3217 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3218 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3219 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3220
3221 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3222 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3223 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3224 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3225 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3226
3227 U_STRING_INIT(mathBlocksPattern,
3228 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3229 214);
3230 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3231 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3232 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3233 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3234
3235 /*
3236 * It used to be that UCD.html and its precursors said
3237 * "Those dashes used to mark connections between pieces of words,
3238 * plus the Katakana middle dot."
3239 *
3240 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3241 * but not from Hyphen.
3242 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3243 * Therefore, do not show errors when testing the Hyphen property.
3244 */
3245 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3246 "known to the UTC and not considered errors.\n");
3247
3248 errorCode=U_ZERO_ERROR;
3249 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3250 set2=uset_openPattern(dashPattern, 8, &errorCode);
3251 if(U_SUCCESS(errorCode)) {
3252 /* remove the Katakana middle dot(s) from set1 */
3253 uset_remove(set1, 0x30fb);
3254 uset_remove(set1, 0xff65); /* halfwidth variant */
3255 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3256 } else {
3257 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3258 }
3259
3260 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3261 set3=uset_openPattern(formatPattern, 6, &errorCode);
3262 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3263 if(U_SUCCESS(errorCode)) {
3264 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3265 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3266 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3267 } else {
3268 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3269 }
3270
3271 uset_close(set1);
3272 uset_close(set2);
3273 uset_close(set3);
3274 uset_close(set4);
3275
3276 /*
3277 * Check that each lowercase character has "small" in its name
3278 * and not "capital".
3279 * There are some such characters, some of which seem odd.
3280 * Use the verbose flag to see these notices.
3281 */
3282 errorCode=U_ZERO_ERROR;
3283 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3284 if(U_SUCCESS(errorCode)) {
3285 for(i=0;; ++i) {
3286 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3287 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3288 break; /* done */
3289 }
3290 if(U_FAILURE(errorCode)) {
3291 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3292 i, u_errorName(errorCode));
3293 break;
3294 }
3295 if(length!=0) {
3296 break; /* done with code points, got a string or -1 */
3297 }
3298
3299 while(start<=end) {
3300 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3301 if(U_FAILURE(errorCode)) {
3302 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3303 errorCode=U_ZERO_ERROR;
3304 }
3305 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3306 strstr(buffer, "SMALL CAPITAL")==NULL
3307 ) {
3308 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3309 }
3310 ++start;
3311 }
3312 }
3313 } else {
3314 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3315 }
3316 uset_close(set1);
3317
3318 /* verify that all assigned characters in Math blocks are exactly Math characters */
3319 errorCode=U_ZERO_ERROR;
3320 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3321 set2=uset_openPattern(mathPattern, 8, &errorCode);
3322 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3323 if(U_SUCCESS(errorCode)) {
3324 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3325 uset_complement(set3); /* assigned characters */
3326 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3327 compareUSets(set1, set2,
3328 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3329 TRUE);
3330 } else {
3331 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3332 }
3333 uset_close(set1);
3334 uset_close(set2);
3335 uset_close(set3);
3336
3337 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3338 errorCode=U_ZERO_ERROR;
3339 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3340 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3341 if(U_SUCCESS(errorCode)) {
3342 compareUSets(set1, set2,
3343 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3344 TRUE);
3345 } else {
3346 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3347 }
3348 uset_close(set1);
3349 uset_close(set2);
3350 }
3351
3352 /* test case folding, compare return values with CaseFolding.txt ------------ */
3353
/*
 * Flags recording which kinds of case folding have already been tested
 * for a character; CF_ALL is the union of the three individual flags.
 */
enum {
    CF_SIMPLE = 1 << 0,
    CF_FULL   = 1 << 1,
    CF_TURKIC = 1 << 2,
    CF_ALL    = CF_SIMPLE | CF_FULL | CF_TURKIC
};
3361
3362 static void
3363 testFold(UChar32 c, int which,
3364 UChar32 simple, UChar32 turkic,
3365 const UChar *full, int32_t fullLength,
3366 const UChar *turkicFull, int32_t turkicFullLength) {
3367 UChar s[2], t[32];
3368 UChar32 c2;
3369 int32_t length, length2;
3370
3371 UErrorCode errorCode=U_ZERO_ERROR;
3372
3373 length=0;
3374 U16_APPEND_UNSAFE(s, length, c);
3375
3376 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3377 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3378 }
3379 if((which&CF_FULL)!=0) {
3380 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3381 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3382 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3383 }
3384 }
3385 if((which&CF_TURKIC)!=0) {
3386 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3387 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3388 }
3389
3390 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3391 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3392 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3393 }
3394 }
3395 }
3396
3397 /* test that c case-folds to itself */
3398 static void
3399 testFoldToSelf(UChar32 c, int which) {
3400 UChar s[2];
3401 int32_t length;
3402
3403 length=0;
3404 U16_APPEND_UNSAFE(s, length, c);
3405 testFold(c, which, c, c, s, length, s, length);
3406 }
3407
3408 struct CaseFoldingData {
3409 USet *notSeen;
3410 UChar32 prev, prevSimple;
3411 UChar prevFull[32];
3412 int32_t prevFullLength;
3413 int which;
3414 };
3415 typedef struct CaseFoldingData CaseFoldingData;
3416
3417 static void U_CALLCONV
3418 caseFoldingLineFn(void *context,
3419 char *fields[][2], int32_t fieldCount,
3420 UErrorCode *pErrorCode) {
3421 CaseFoldingData *pData=(CaseFoldingData *)context;
3422 char *end;
3423 UChar full[32];
3424 UChar32 c, prev, simple;
3425 int32_t count;
3426 int which;
3427 char status;
3428
3429 /* get code point */
3430 const char *s=u_skipWhitespace(fields[0][0]);
3431 if(0==strncmp(s, "0000..10FFFF", 12)) {
3432 /*
3433 * Ignore the line
3434 * # @missing: 0000..10FFFF; C; <code point>
3435 * because maps-to-self is already our default, and this line breaks this parser.
3436 */
3437 return;
3438 }
3439 c=(UChar32)strtoul(s, &end, 16);
3440 end=(char *)u_skipWhitespace(end);
3441 if(end<=fields[0][0] || end!=fields[0][1]) {
3442 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3443 *pErrorCode=U_PARSE_ERROR;
3444 return;
3445 }
3446
3447 /* get the status of this mapping */
3448 status=*u_skipWhitespace(fields[1][0]);
3449 if(status!='C' && status!='S' && status!='F' && status!='T') {
3450 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3451 *pErrorCode=U_PARSE_ERROR;
3452 return;
3453 }
3454
3455 /* get the mapping */
3456 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3457 if(U_FAILURE(*pErrorCode)) {
3458 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3459 return;
3460 }
3461
3462 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3463 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3464 simple=c;
3465 }
3466
3467 if(c!=(prev=pData->prev)) {
3468 /*
3469 * Test remaining mappings for the previous code point.
3470 * If a turkic folding was not mentioned, then it should fold the same
3471 * as the regular simple case folding.
3472 */
3473 UChar prevString[2];
3474 int32_t length;
3475
3476 length=0;
3477 U16_APPEND_UNSAFE(prevString, length, prev);
3478 testFold(prev, (~pData->which)&CF_ALL,
3479 prev, pData->prevSimple,
3480 prevString, length,
3481 pData->prevFull, pData->prevFullLength);
3482 pData->prev=pData->prevSimple=c;
3483 length=0;
3484 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3485 pData->prevFullLength=length;
3486 pData->which=0;
3487 }
3488
3489 /*
3490 * Turn the status into a bit set of case foldings to test.
3491 * Remember non-Turkic case foldings as defaults for Turkic mode.
3492 */
3493 switch(status) {
3494 case 'C':
3495 which=CF_SIMPLE|CF_FULL;
3496 pData->prevSimple=simple;
3497 u_memcpy(pData->prevFull, full, count);
3498 pData->prevFullLength=count;
3499 break;
3500 case 'S':
3501 which=CF_SIMPLE;
3502 pData->prevSimple=simple;
3503 break;
3504 case 'F':
3505 which=CF_FULL;
3506 u_memcpy(pData->prevFull, full, count);
3507 pData->prevFullLength=count;
3508 break;
3509 case 'T':
3510 which=CF_TURKIC;
3511 break;
3512 default:
3513 which=0;
3514 break; /* won't happen because of test above */
3515 }
3516
3517 testFold(c, which, simple, simple, full, count, full, count);
3518
3519 /* remember which case foldings of c have been tested */
3520 pData->which|=which;
3521
3522 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3523 uset_remove(pData->notSeen, c);
3524 }
3525
3526 static void
3527 TestCaseFolding() {
3528 CaseFoldingData data={ NULL };
3529 char *fields[3][2];
3530 UErrorCode errorCode;
3531
3532 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3533
3534 errorCode=U_ZERO_ERROR;
3535 /* test BMP & plane 1 - nothing interesting above */
3536 data.notSeen=uset_open(0, 0x1ffff);
3537 data.prevFullLength=1; /* length of full case folding of U+0000 */
3538
3539 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3540 if(U_SUCCESS(errorCode)) {
3541 int32_t i, start, end;
3542
3543 /* add a pseudo-last line to finish testing of the actual last one */
3544 fields[0][0]=lastLine;
3545 fields[0][1]=lastLine+6;
3546 fields[1][0]=lastLine+7;
3547 fields[1][1]=lastLine+9;
3548 fields[2][0]=lastLine+10;
3549 fields[2][1]=lastLine+17;
3550 caseFoldingLineFn(&data, fields, 3, &errorCode);
3551
3552 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3553 for(i=0;
3554 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3555 U_SUCCESS(errorCode);
3556 ++i
3557 ) {
3558 do {
3559 testFoldToSelf(start, CF_ALL);
3560 } while(++start<=end);
3561 }
3562 }
3563
3564 uset_close(data.notSeen);
3565 }
3566
3567 static void TestBinaryCharacterPropertiesAPI() {
3568 // API test only. See intltest/ucdtest.cpp for functional test.
3569 UErrorCode errorCode = U_ZERO_ERROR;
3570 const USet *set = u_getBinaryPropertySet(-1, &errorCode);
3571 if (U_SUCCESS(errorCode)) {
3572 log_err("u_getBinaryPropertySet(-1) did not fail\n");
3573 }
3574 errorCode = U_ZERO_ERROR;
3575 set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
3576 if (U_SUCCESS(errorCode)) {
3577 log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
3578 }
3579 errorCode = U_ZERO_ERROR;
3580 set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
3581 if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
3582 log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
3583 }
3584 }
3585
3586 static void TestIntCharacterPropertiesAPI() {
3587 // API test only. See intltest/ucdtest.cpp for functional test.
3588 UErrorCode errorCode = U_ZERO_ERROR;
3589 const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
3590 if (U_SUCCESS(errorCode)) {
3591 log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
3592 }
3593 errorCode = U_ZERO_ERROR;
3594 map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
3595 if (U_SUCCESS(errorCode)) {
3596 log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
3597 }
3598 errorCode = U_ZERO_ERROR;
3599 map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
3600 if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
3601 log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
3602 }
3603 }