]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/cucdtst.c
ICU-66108.tar.gz
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28 #include "unicode/utf16.h"
29 #include "unicode/utf_old.h"
30 #include "cintltst.h"
31 #include "putilimp.h"
32 #include "uparse.h"
33 #include "ucase.h"
34 #include "ubidi_props.h"
35 #include "uprops.h"
36 #include "uset_imp.h"
37 #include "usc_impl.h"
38 #include "udatamem.h"
39 #include "cucdapi.h"
40 #include "cmemory.h"
41
42 /* prototypes --------------------------------------------------------------- */
43
44 static void TestUpperLower(void);
45 static void TestLetterNumber(void);
46 static void TestMisc(void);
47 static void TestPOSIX(void);
48 static void TestControlPrint(void);
49 static void TestIdentifier(void);
50 static void TestUnicodeData(void);
51 static void TestCodeUnit(void);
52 static void TestCodePoint(void);
53 static void TestCharLength(void);
54 static void TestCharNames(void);
55 static void TestUCharFromNameUnderflow(void);
56 static void TestMirroring(void);
57 static void TestUScriptRunAPI(void);
58 static void TestAdditionalProperties(void);
59 static void TestNumericProperties(void);
60 static void TestPropertyNames(void);
61 static void TestPropertyValues(void);
62 static void TestConsistency(void);
63 static void TestCaseFolding(void);
64 static void TestBinaryCharacterPropertiesAPI(void);
65 static void TestIntCharacterPropertiesAPI(void);
66
67 /* internal methods used */
68 static int32_t MakeProp(char* str);
69 static int32_t MakeDir(char* str);
70
71 /* helpers ------------------------------------------------------------------ */
72
73 static void
74 parseUCDFile(const char *filename,
75 char *fields[][2], int32_t fieldCount,
76 UParseLineFn *lineFn, void *context,
77 UErrorCode *pErrorCode) {
78 // buffer sizes changed from 256 for APPLE_XCODE_BUILD (which is generating really long pathnames)
79 char path[512];
80 char backupPath[512];
81
82 if(U_FAILURE(*pErrorCode)) {
83 return;
84 }
85
86 /* Look inside ICU_DATA first */
87 strcpy(path, u_getDataDirectory());
88 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
89 strcat(path, filename);
90
91 /* As a fallback, try to guess where the source data was located
92 * at the time ICU was built, and look there.
93 */
94 strcpy(backupPath, ctest_dataSrcDir());
95 strcat(backupPath, U_FILE_SEP_STRING);
96 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
97 strcat(backupPath, filename);
98
99 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
100 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
101 *pErrorCode=U_ZERO_ERROR;
102 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
103 }
104 if(U_FAILURE(*pErrorCode)) {
105 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
106 }
107 }
108
109 /* test data ---------------------------------------------------------------- */
110
111 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
112 static const int32_t tagValues[] =
113 {
114 /* Mn */ U_NON_SPACING_MARK,
115 /* Mc */ U_COMBINING_SPACING_MARK,
116 /* Me */ U_ENCLOSING_MARK,
117 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
118 /* Nl */ U_LETTER_NUMBER,
119 /* No */ U_OTHER_NUMBER,
120 /* Zs */ U_SPACE_SEPARATOR,
121 /* Zl */ U_LINE_SEPARATOR,
122 /* Zp */ U_PARAGRAPH_SEPARATOR,
123 /* Cc */ U_CONTROL_CHAR,
124 /* Cf */ U_FORMAT_CHAR,
125 /* Cs */ U_SURROGATE,
126 /* Co */ U_PRIVATE_USE_CHAR,
127 /* Cn */ U_UNASSIGNED,
128 /* Lu */ U_UPPERCASE_LETTER,
129 /* Ll */ U_LOWERCASE_LETTER,
130 /* Lt */ U_TITLECASE_LETTER,
131 /* Lm */ U_MODIFIER_LETTER,
132 /* Lo */ U_OTHER_LETTER,
133 /* Pc */ U_CONNECTOR_PUNCTUATION,
134 /* Pd */ U_DASH_PUNCTUATION,
135 /* Ps */ U_START_PUNCTUATION,
136 /* Pe */ U_END_PUNCTUATION,
137 /* Po */ U_OTHER_PUNCTUATION,
138 /* Sm */ U_MATH_SYMBOL,
139 /* Sc */ U_CURRENCY_SYMBOL,
140 /* Sk */ U_MODIFIER_SYMBOL,
141 /* So */ U_OTHER_SYMBOL,
142 /* Pi */ U_INITIAL_PUNCTUATION,
143 /* Pf */ U_FINAL_PUNCTUATION
144 };
145
/* Bidi class abbreviations; each entry holds at most 4 characters plus the terminator. */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
172
173 void addUnicodeTest(TestNode** root);
174
175 void addUnicodeTest(TestNode** root)
176 {
177 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
178 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
179 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
180 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
181 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
182 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
183 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
184 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
185 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
186 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
187 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
188 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
189 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
190 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
191 addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
192 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
193 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
194 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
195 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
196 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
197 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
198 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
199 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
200 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
201 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
202 addTest(root, &TestBinaryCharacterPropertiesAPI,
203 "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
204 addTest(root, &TestIntCharacterPropertiesAPI,
205 "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
206 }
207
208 /*==================================================== */
209 /* test u_toupper() and u_tolower() */
210 /*==================================================== */
211 static void TestUpperLower()
212 {
213 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
214 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
215 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
216 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
217 int32_t i;
218
219 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
220 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
221
222 /*
223 Checks LetterLike Symbols which were previously a source of confusion
224 [Bertrand A. D. 02/04/98]
225 */
226 for (i=0x2100;i<0x2138;i++)
227 {
228 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
229 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
230 {
231 if (i != (int)u_tolower(i)) /* itself */
232 log_err("Failed case conversion with itself: U+%04x\n", i);
233 if (i != (int)u_toupper(i))
234 log_err("Failed case conversion with itself: U+%04x\n", i);
235 }
236 }
237
238 for(i=0; i < u_strlen(upper); i++){
239 if(u_tolower(upper[i]) != lower[i]){
240 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
241 }
242 }
243
244 log_verbose("testing upper lower\n");
245 for (i = 0; i < 21; i++) {
246
247 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
248 {
249 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
250 }
251 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
252 {
253 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
254 }
255 else if (upperTest[i] != u_tolower(lowerTest[i]))
256 {
257 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
258 }
259 else if (lowerTest[i] != u_toupper(upperTest[i]))
260 {
261 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
262 }
263 else if (upperTest[i] != u_tolower(upperTest[i]))
264 {
265 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
266 }
267 else if (lowerTest[i] != u_toupper(lowerTest[i]))
268 {
269 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
270 }
271 }
272 log_verbose("done testing upper lower\n");
273
274 log_verbose("testing u_istitle\n");
275 {
276 static const UChar expected[] = {
277 0x1F88,
278 0x1F89,
279 0x1F8A,
280 0x1F8B,
281 0x1F8C,
282 0x1F8D,
283 0x1F8E,
284 0x1F8F,
285 0x1F88,
286 0x1F89,
287 0x1F8A,
288 0x1F8B,
289 0x1F8C,
290 0x1F8D,
291 0x1F8E,
292 0x1F8F,
293 0x1F98,
294 0x1F99,
295 0x1F9A,
296 0x1F9B,
297 0x1F9C,
298 0x1F9D,
299 0x1F9E,
300 0x1F9F,
301 0x1F98,
302 0x1F99,
303 0x1F9A,
304 0x1F9B,
305 0x1F9C,
306 0x1F9D,
307 0x1F9E,
308 0x1F9F,
309 0x1FA8,
310 0x1FA9,
311 0x1FAA,
312 0x1FAB,
313 0x1FAC,
314 0x1FAD,
315 0x1FAE,
316 0x1FAF,
317 0x1FA8,
318 0x1FA9,
319 0x1FAA,
320 0x1FAB,
321 0x1FAC,
322 0x1FAD,
323 0x1FAE,
324 0x1FAF,
325 0x1FBC,
326 0x1FBC,
327 0x1FCC,
328 0x1FCC,
329 0x1FFC,
330 0x1FFC,
331 };
332 int32_t num = UPRV_LENGTHOF(expected);
333 for(i=0; i<num; i++){
334 if(!u_istitle(expected[i])){
335 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
336 }
337 }
338
339 }
340 }
341
342 /* compare two sets and verify that their difference or intersection is empty */
343 static UBool
344 showADiffB(const USet *a, const USet *b,
345 const char *a_name, const char *b_name,
346 UBool expect, UBool diffIsError) {
347 USet *aa;
348 int32_t i, start, end, length;
349 UErrorCode errorCode;
350
351 /*
352 * expect:
353 * TRUE -> a-b should be empty, that is, b should contain all of a
354 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
355 */
356 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
357 return TRUE;
358 }
359
360 /* clone a to aa because a is const */
361 aa=uset_open(1, 0);
362 if(aa==NULL) {
363 /* unusual problem - out of memory? */
364 return FALSE;
365 }
366 uset_addAll(aa, a);
367
368 /* compute the set in question */
369 if(expect) {
370 /* a-b */
371 uset_removeAll(aa, b);
372 } else {
373 /* a&b */
374 uset_retainAll(aa, b);
375 }
376
377 /* aa is not empty because of the initial tests above; show its contents */
378 errorCode=U_ZERO_ERROR;
379 i=0;
380 for(;;) {
381 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
382 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
383 break; /* done */
384 }
385 if(U_FAILURE(errorCode)) {
386 log_err("error comparing %s with %s at difference item %d: %s\n",
387 a_name, b_name, i, u_errorName(errorCode));
388 break;
389 }
390 if(length!=0) {
391 break; /* done with code points, got a string or -1 */
392 }
393
394 if(diffIsError) {
395 if(expect) {
396 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
397 } else {
398 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
399 }
400 } else {
401 if(expect) {
402 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
403 } else {
404 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
405 }
406 }
407
408 ++i;
409 }
410
411 uset_close(aa);
412 return FALSE;
413 }
414
415 static UBool
416 showAMinusB(const USet *a, const USet *b,
417 const char *a_name, const char *b_name,
418 UBool diffIsError) {
419 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
420 }
421
422 static UBool
423 showAIntersectB(const USet *a, const USet *b,
424 const char *a_name, const char *b_name,
425 UBool diffIsError) {
426 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
427 }
428
429 static UBool
430 compareUSets(const USet *a, const USet *b,
431 const char *a_name, const char *b_name,
432 UBool diffIsError) {
433 /*
434 * Use an arithmetic & not a logical && so that both branches
435 * are always taken and all differences are shown.
436 */
437 return
438 showAMinusB(a, b, a_name, b_name, diffIsError) &
439 showAMinusB(b, a, b_name, a_name, diffIsError);
440 }
441
442 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
443 static void TestLetterNumber()
444 {
445 UChar i = 0x0000;
446
447 log_verbose("Testing for isalpha\n");
448 for (i = 0x0041; i < 0x005B; i++) {
449 if (!u_isalpha(i))
450 {
451 log_err("Failed isLetter test at %.4X\n", i);
452 }
453 }
454 for (i = 0x0660; i < 0x066A; i++) {
455 if (u_isalpha(i))
456 {
457 log_err("Failed isLetter test with numbers at %.4X\n", i);
458 }
459 }
460
461 log_verbose("Testing for isdigit\n");
462 for (i = 0x0660; i < 0x066A; i++) {
463 if (!u_isdigit(i))
464 {
465 log_verbose("Failed isNumber test at %.4X\n", i);
466 }
467 }
468
469 log_verbose("Testing for isalnum\n");
470 for (i = 0x0041; i < 0x005B; i++) {
471 if (!u_isalnum(i))
472 {
473 log_err("Failed isAlNum test at %.4X\n", i);
474 }
475 }
476 for (i = 0x0660; i < 0x066A; i++) {
477 if (!u_isalnum(i))
478 {
479 log_err("Failed isAlNum test at %.4X\n", i);
480 }
481 }
482
483 {
484 /*
485 * The following checks work only starting from Unicode 4.0.
486 * Check the version number here.
487 */
488 static UVersionInfo u401={ 4, 0, 1, 0 };
489 UVersionInfo version;
490 u_getUnicodeVersion(version);
491 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
492 return;
493 }
494 }
495
496 {
497 /*
498 * Sanity check:
499 * Verify that exactly the digit characters have decimal digit values.
500 * This assumption is used in the implementation of u_digit()
501 * (which checks nt=de)
502 * compared with the parallel java.lang.Character.digit()
503 * (which checks Nd).
504 *
505 * This was not true in Unicode 3.2 and earlier.
506 * Unicode 4.0 fixed discrepancies.
507 * Unicode 4.0.1 re-introduced problems in this area due to an
508 * unintentionally incomplete last-minute change.
509 */
510 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
511 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
512
513 USet *digits, *decimalValues;
514 UErrorCode errorCode;
515
516 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
517 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
518 errorCode=U_ZERO_ERROR;
519 digits=uset_openPattern(digitsPattern, 6, &errorCode);
520 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
521
522 if(U_SUCCESS(errorCode)) {
523 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
524 }
525
526 uset_close(digits);
527 uset_close(decimalValues);
528 }
529 }
530
531 static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
532 const UChar32 *sampleChars, int32_t sampleCharsLength,
533 UBool expected) {
534 int32_t i;
535 for (i = 0; i < sampleCharsLength; ++i) {
536 UBool result = propFn(sampleChars[i]);
537 if (result != expected) {
538 log_err("error: character property function %s(U+%04x)=%d is wrong\n",
539 propName, sampleChars[i], result);
540 }
541 }
542 }
543
544 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
545 static void TestMisc()
546 {
547 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
548 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
549 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
550 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
551 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
552 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
553 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
554 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
555 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
556 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
557 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
558
559 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
560
561 uint32_t mask;
562
563 int32_t i;
564 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
565 UVersionInfo realVersion;
566
567 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
568
569 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
570 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
571
572 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
573 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
574 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
575 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
576
577 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
578 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
579 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
580 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
581
582 testSampleCharProps(u_isdefined, "u_isdefined",
583 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
584 testSampleCharProps(u_isdefined, "u_isdefined",
585 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
586
587 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
588 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
589
590 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
591 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
592
593 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
594 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
595 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
596 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
597 }
598 }
599
600 /* Tests the ICU version #*/
601 u_getVersion(realVersion);
602 u_versionToString(realVersion, icuVersion);
603 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
604 {
605 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
606 }
607 #if defined(ICU_VERSION)
608 /* test only happens where we have configure.in with VERSION - sanity check. */
609 if(strcmp(U_ICU_VERSION, ICU_VERSION))
610 {
611 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
612 }
613 #endif
614
615 /* test U_GC_... */
616 if(
617 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
618 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
619 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
620 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
621 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
622 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
623 ) {
624 log_err("error: U_GET_GC_MASK does not work properly\n");
625 }
626
627 mask=0;
628 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
629
630 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
631 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
632 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
633 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
634 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
635
636 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
637 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
638 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
639
640 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
641 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
642 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
643
644 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
645 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
646 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
647
648 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
649 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
650 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
651 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
652
653 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
654 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
655 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
656 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
657 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
658
659 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
660 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
661 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
662 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
663
664 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
665 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
666
667 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
668 log_err("error: problems with U_GC_XX_MASK constants\n");
669 }
670
671 mask=0;
672 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
673 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
674 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
675 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
676 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
677 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
678 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
679
680 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
681 log_err("error: problems with U_GC_Y_MASK constants\n");
682 }
683 {
684 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
685 for(i=0; i<10; i++){
686 if(digit[i]!=u_forDigit(i,10)){
687 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
688 }
689 }
690 }
691
692 /* test u_digit() */
693 {
694 static const struct {
695 UChar32 c;
696 int8_t radix, value;
697 } data[]={
698 /* base 16 */
699 { 0x0031, 16, 1 },
700 { 0x0038, 16, 8 },
701 { 0x0043, 16, 12 },
702 { 0x0066, 16, 15 },
703 { 0x00e4, 16, -1 },
704 { 0x0662, 16, 2 },
705 { 0x06f5, 16, 5 },
706 { 0xff13, 16, 3 },
707 { 0xff41, 16, 10 },
708
709 /* base 8 */
710 { 0x0031, 8, 1 },
711 { 0x0038, 8, -1 },
712 { 0x0043, 8, -1 },
713 { 0x0066, 8, -1 },
714 { 0x00e4, 8, -1 },
715 { 0x0662, 8, 2 },
716 { 0x06f5, 8, 5 },
717 { 0xff13, 8, 3 },
718 { 0xff41, 8, -1 },
719
720 /* base 36 */
721 { 0x5a, 36, 35 },
722 { 0x7a, 36, 35 },
723 { 0xff3a, 36, 35 },
724 { 0xff5a, 36, 35 },
725
726 /* wrong radix values */
727 { 0x0031, 1, -1 },
728 { 0xff3a, 37, -1 }
729 };
730
731 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
732 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
733 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
734 data[i].c,
735 data[i].radix,
736 u_digit(data[i].c, data[i].radix),
737 data[i].value);
738 }
739 }
740 }
741 }
742
743 /* test C/POSIX-style functions --------------------------------------------- */
744
745 /* bit flags */
746 #define ISAL 1
747 #define ISLO 2
748 #define ISUP 4
749
750 #define ISDI 8
751 #define ISXD 0x10
752
753 #define ISAN 0x20
754
755 #define ISPU 0x40
756 #define ISGR 0x80
757 #define ISPR 0x100
758
759 #define ISSP 0x200
760 #define ISBL 0x400
761 #define ISCN 0x800
762
763 /* C/POSIX-style functions, in the same order as the bit flags */
764 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
765
766 static const struct {
767 IsPOSIXClass *fn;
768 const char *name;
769 } posixClasses[]={
770 { u_isalpha, "isalpha" },
771 { u_islower, "islower" },
772 { u_isupper, "isupper" },
773 { u_isdigit, "isdigit" },
774 { u_isxdigit, "isxdigit" },
775 { u_isalnum, "isalnum" },
776 { u_ispunct, "ispunct" },
777 { u_isgraph, "isgraph" },
778 { u_isprint, "isprint" },
779 { u_isspace, "isspace" },
780 { u_isblank, "isblank" },
781 { u_iscntrl, "iscntrl" }
782 };
783
784 static const struct {
785 UChar32 c;
786 uint32_t posixResults;
787 } posixData[]={
788 { 0x0008, ISCN }, /* backspace */
789 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
790 { 0x000a, ISSP| ISCN }, /* LF */
791 { 0x000c, ISSP| ISCN }, /* FF */
792 { 0x000d, ISSP| ISCN }, /* CR */
793 { 0x0020, ISPR|ISSP|ISBL }, /* space */
794 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
795 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
796 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
797 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
798 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
799 { 0x007b, ISPU|ISGR|ISPR }, /* { */
800 { 0x0085, ISSP| ISCN }, /* NEL */
801 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
802 { 0x00a4, ISGR|ISPR }, /* currency sign */
803 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
804 { 0x0300, ISGR|ISPR }, /* combining grave */
805 { 0x0600, ISCN }, /* arabic number sign */
806 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
807 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
808 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
809 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
810 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
811 { 0x200b, ISCN }, /* ZWSP */
812 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
813 { 0x200e, ISCN }, /* LRM */
814 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
815 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
816 { 0x20ac, ISGR|ISPR }, /* Euro */
817 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
818 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
819 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
820 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
821 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
822 };
823
824 static void
825 TestPOSIX() {
826 uint32_t mask;
827 int32_t cl, i;
828 UBool expect;
829
830 mask=1;
831 for(cl=0; cl<12; ++cl) {
832 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
833 expect=(UBool)((posixData[i].posixResults&mask)!=0);
834 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
835 log_err("u_%s(U+%04x)=%s is wrong\n",
836 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
837 }
838 }
839 mask<<=1;
840 }
841 }
842
843 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
844 static void TestControlPrint()
845 {
846 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
847 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
848 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
849 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
850 UChar32 c;
851
852 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
853 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
854
855 testSampleCharProps(u_isprint, "u_isprint",
856 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
857 testSampleCharProps(u_isprint, "u_isprint",
858 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
859
860 /* test all ISO 8 controls */
861 for(c=0; c<=0x9f; ++c) {
862 if(c==0x20) {
863 /* skip ASCII graphic characters and continue with DEL */
864 c=0x7f;
865 }
866 if(!u_iscntrl(c)) {
867 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
868 }
869 if(!u_isISOControl(c)) {
870 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
871 }
872 if(u_isprint(c)) {
873 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
874 }
875 }
876
877 /* test all Latin-1 graphic characters */
878 for(c=0x20; c<=0xff; ++c) {
879 if(c==0x7f) {
880 c=0xa0;
881 } else if(c==0xad) {
882 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
883 ++c;
884 }
885 if(!u_isprint(c)) {
886 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
887 }
888 }
889 }
890
891 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
892 static void TestIdentifier()
893 {
894 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
895 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
896 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
897 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
898 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
899 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
900 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
901 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
902 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
903 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
904
905 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
906 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
907 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
908 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
909
910 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
911 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
912 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
913 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
914
915 /* IDPart should imply IDStart */
916 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
917 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
918
919 testSampleCharProps(u_isIDStart, "u_isIDStart",
920 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
921 testSampleCharProps(u_isIDStart, "u_isIDStart",
922 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
923
924 testSampleCharProps(u_isIDPart, "u_isIDPart",
925 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
926 testSampleCharProps(u_isIDPart, "u_isIDPart",
927 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
928
929 /* IDPart should imply IDStart */
930 testSampleCharProps(u_isIDPart, "u_isIDPart",
931 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
932
933 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
934 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
935 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
936 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
937 }
938
939 /* for each line of UnicodeData.txt, check some of the properties */
940 typedef struct UnicodeDataContext {
941 #if UCONFIG_NO_NORMALIZATION
942 const void *dummy;
943 #else
944 const UNormalizer2 *nfc;
945 const UNormalizer2 *nfkc;
946 #endif
947 } UnicodeDataContext;
948
949 /*
950 * ### TODO
951 * This test fails incorrectly if the First or Last code point of a repetitive area
952 * is overridden, which is allowed and is encouraged for the PUAs.
953 * Currently, this means that both area First/Last and override lines are
954 * tested against the properties from the API,
955 * and the area boundary will not match and cause an error.
956 *
957 * This function should detect area boundaries and skip them for the test of individual
958 * code points' properties.
959 * Then it should check that the areas contain all the same properties except where overridden.
960 * For this, it would have had to set a flag for which code points were listed explicitly.
961 */
962 static void U_CALLCONV
963 unicodeDataLineFn(void *context,
964 char *fields[][2], int32_t fieldCount,
965 UErrorCode *pErrorCode)
966 {
967 (void)fieldCount; // suppress compiler warnings about unused variable
968 char buffer[100];
969 const char *d;
970 char *end;
971 uint32_t value;
972 UChar32 c;
973 int32_t i;
974 int8_t type;
975 int32_t dt;
976 UChar dm[32], s[32];
977 int32_t dmLength, length;
978
979 #if !UCONFIG_NO_NORMALIZATION
980 const UNormalizer2 *nfc, *nfkc;
981 #endif
982
983 /* get the character code, field 0 */
984 c=strtoul(fields[0][0], &end, 16);
985 if(end<=fields[0][0] || end!=fields[0][1]) {
986 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
987 return;
988 }
989 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
990 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
991 return;
992 }
993
994 /* get general category, field 2 */
995 *fields[2][1]=0;
996 type = (int8_t)tagValues[MakeProp(fields[2][0])];
997 if(u_charType(c)!=type) {
998 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
999 }
1000 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1001 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1002 }
1003
1004 /* get canonical combining class, field 3 */
1005 value=strtoul(fields[3][0], &end, 10);
1006 if(end<=fields[3][0] || end!=fields[3][1]) {
1007 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1008 return;
1009 }
1010 if(value>255) {
1011 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1012 return;
1013 }
1014 #if !UCONFIG_NO_NORMALIZATION
1015 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1016 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1017 }
1018 nfkc=((UnicodeDataContext *)context)->nfkc;
1019 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1020 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1021 }
1022 #endif
1023
1024 /* get BiDi category, field 4 */
1025 *fields[4][1]=0;
1026 i=MakeDir(fields[4][0]);
1027 if(i!=(int32_t)u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1028 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1029 }
1030
1031 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1032 d=NULL;
1033 if(fields[5][0]==fields[5][1]) {
1034 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1035 if(c==0xac00 || c==0xd7a3) {
1036 dt=U_DT_CANONICAL;
1037 } else {
1038 dt=U_DT_NONE;
1039 }
1040 } else {
1041 d=fields[5][0];
1042 *fields[5][1]=0;
1043 dt=UCHAR_INVALID_CODE;
1044 if(*d=='<') {
1045 end=strchr(++d, '>');
1046 if(end!=NULL) {
1047 *end=0;
1048 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1049 d=u_skipWhitespace(end+1);
1050 }
1051 } else {
1052 dt=U_DT_CANONICAL;
1053 }
1054 }
1055 if(dt>U_DT_NONE) {
1056 if(c==0xac00) {
1057 dm[0]=0x1100;
1058 dm[1]=0x1161;
1059 dm[2]=0;
1060 dmLength=2;
1061 } else if(c==0xd7a3) {
1062 dm[0]=0xd788;
1063 dm[1]=0x11c2;
1064 dm[2]=0;
1065 dmLength=2;
1066 } else {
1067 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1068 }
1069 } else {
1070 dmLength=-1;
1071 }
1072 if(dt<0 || U_FAILURE(*pErrorCode)) {
1073 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1074 return;
1075 }
1076 #if !UCONFIG_NO_NORMALIZATION
1077 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1078 if(i!=dt) {
1079 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1080 }
1081 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1082 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1083 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1084 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1085 "or the Decomposition_Mapping is different (%s)\n",
1086 c, length, dmLength, u_errorName(*pErrorCode));
1087 return;
1088 }
1089 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1090 if(dt!=U_DT_CANONICAL) {
1091 dmLength=-1;
1092 }
1093 nfc=((UnicodeDataContext *)context)->nfc;
1094 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1095 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1096 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1097 "or the Decomposition_Mapping is different (%s)\n",
1098 c, length, dmLength, u_errorName(*pErrorCode));
1099 return;
1100 }
1101 /* recompose */
1102 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1103 UChar32 a, b, composite;
1104 i=0;
1105 U16_NEXT(dm, i, dmLength, a);
1106 U16_NEXT(dm, i, dmLength, b);
1107 /* i==dmLength */
1108 composite=unorm2_composePair(nfc, a, b);
1109 if(composite!=c) {
1110 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1111 (long)c, (long)a, (long)b, (long)composite);
1112 }
1113 /*
1114 * Note: NFKC has fewer round-trip mappings than NFC,
1115 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1116 */
1117 }
1118 #endif
1119
1120 /* get ISO Comment, field 11 */
1121 *fields[11][1]=0;
1122 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1123 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1124 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1125 c, u_errorName(*pErrorCode),
1126 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1127 fields[11][0]);
1128 }
1129
1130 /* get uppercase mapping, field 12 */
1131 if(fields[12][0]!=fields[12][1]) {
1132 value=strtoul(fields[12][0], &end, 16);
1133 if(end!=fields[12][1]) {
1134 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1135 return;
1136 }
1137 if((UChar32)value!=u_toupper(c)) {
1138 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1139 }
1140 } else {
1141 /* no case mapping: the API must map the code point to itself */
1142 if(c!=u_toupper(c)) {
1143 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1144 }
1145 }
1146
1147 /* get lowercase mapping, field 13 */
1148 if(fields[13][0]!=fields[13][1]) {
1149 value=strtoul(fields[13][0], &end, 16);
1150 if(end!=fields[13][1]) {
1151 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1152 return;
1153 }
1154 if((UChar32)value!=u_tolower(c)) {
1155 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1156 }
1157 } else {
1158 /* no case mapping: the API must map the code point to itself */
1159 if(c!=u_tolower(c)) {
1160 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1161 }
1162 }
1163
1164 /* get titlecase mapping, field 14 */
1165 if(fields[14][0]!=fields[14][1]) {
1166 value=strtoul(fields[14][0], &end, 16);
1167 if(end!=fields[14][1]) {
1168 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1169 return;
1170 }
1171 if((UChar32)value!=u_totitle(c)) {
1172 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1173 }
1174 } else {
1175 /* no case mapping: the API must map the code point to itself */
1176 if(c!=u_totitle(c)) {
1177 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1178 }
1179 }
1180 }
1181
1182 static UBool U_CALLCONV
1183 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1184 static const UChar32 test[][2]={
1185 {0x41, U_UPPERCASE_LETTER},
1186 {0x308, U_NON_SPACING_MARK},
1187 {0xfffe, U_GENERAL_OTHER_TYPES},
1188 {0xe0041, U_FORMAT_CHAR},
1189 {0xeffff, U_UNASSIGNED}
1190 };
1191
1192 int32_t i, count;
1193
1194 if(0!=strcmp((const char *)context, "a1")) {
1195 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1196 return FALSE;
1197 }
1198
1199 count=UPRV_LENGTHOF(test);
1200 for(i=0; i<count; ++i) {
1201 if(start<=test[i][0] && test[i][0]<limit) {
1202 if(type!=(UCharCategory)test[i][1]) {
1203 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1204 start, limit, (long)type, test[i][0], test[i][1]);
1205 }
1206 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1207 return i==(count-1) ? FALSE : TRUE;
1208 }
1209 }
1210
1211 if(start>test[count-1][0]) {
1212 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1213 start, limit, (long)type);
1214 return FALSE;
1215 }
1216
1217 return TRUE;
1218 }
1219
1220 static UBool U_CALLCONV
1221 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1222 (void)context; // suppress compiler warnings about unused variable
1223
1224 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1225 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1226 { 0x0590, U_LEFT_TO_RIGHT },
1227 { 0x0600, U_RIGHT_TO_LEFT },
1228 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1229 { 0x0860, U_RIGHT_TO_LEFT },
1230 { 0x0870, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 10 changes U+0860..U+086F from R to AL.
1231 { 0x08A0, U_RIGHT_TO_LEFT },
1232 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1233 { 0x20A0, U_LEFT_TO_RIGHT },
1234 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1235 { 0xFB1D, U_LEFT_TO_RIGHT },
1236 { 0xFB50, U_RIGHT_TO_LEFT },
1237 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1238 { 0xFE70, U_LEFT_TO_RIGHT },
1239 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1240
1241 { 0x10800, U_LEFT_TO_RIGHT },
1242 { 0x10D00, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1243 { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1244 { 0x10F30, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1245 { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
1246 { 0x11000, U_RIGHT_TO_LEFT },
1247
1248 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1249 { 0x1EC70, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1250 { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
1251 { 0x1ED00, U_RIGHT_TO_LEFT }, // Unicode 12 changes U+1ED00..U+1ED4F from R to AL.
1252 { 0x1ED50, U_RIGHT_TO_LEFT_ARABIC },
1253 { 0x1EE00, U_RIGHT_TO_LEFT },
1254 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1255 { 0x1F000, U_RIGHT_TO_LEFT },
1256 { 0x110000, U_LEFT_TO_RIGHT }
1257 };
1258
1259 UChar32 c;
1260 int32_t i;
1261 UCharDirection shouldBeDir;
1262
1263 /*
1264 * LineBreak.txt specifies:
1265 * # - Assigned characters that are not listed explicitly are given the value
1266 * # "AL".
1267 * # - Unassigned characters are given the value "XX".
1268 *
1269 * PUA characters are listed explicitly with "XX".
1270 * Verify that no assigned character has "XX".
1271 */
1272 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1273 c=start;
1274 while(c<limit) {
1275 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1276 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1277 }
1278 ++c;
1279 }
1280 }
1281
1282 /*
1283 * Verify default Bidi classes.
1284 * See DerivedBidiClass.txt, especially for unassigned code points.
1285 */
1286 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1287 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1288 c=start;
1289 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1290 if((int32_t)c<defaultBidi[i][0]) {
1291 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1292 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1293 shouldBeDir=U_BOUNDARY_NEUTRAL;
1294 } else {
1295 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1296 }
1297
1298 if( u_charDirection(c)!=shouldBeDir ||
1299 (UCharDirection)u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1300 ) {
1301 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1302 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1303 }
1304 ++c;
1305 }
1306 }
1307 }
1308 }
1309
1310 return TRUE;
1311 }
1312
1313 /* tests for several properties */
1314 static void TestUnicodeData()
1315 {
1316 UVersionInfo expectVersionArray;
1317 UVersionInfo versionArray;
1318 char *fields[15][2];
1319 UErrorCode errorCode;
1320 UChar32 c;
1321 int8_t type;
1322
1323 UnicodeDataContext context;
1324
1325 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1326 u_getUnicodeVersion(versionArray);
1327 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1328 {
1329 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1330 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1331 }
1332
1333 #if defined(ICU_UNICODE_VERSION)
1334 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1335 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1336 {
1337 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1338 }
1339 #endif
1340
1341 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1342 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1343 }
1344
1345 errorCode=U_ZERO_ERROR;
1346 #if !UCONFIG_NO_NORMALIZATION
1347 context.nfc=unorm2_getNFCInstance(&errorCode);
1348 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1349 if(U_FAILURE(errorCode)) {
1350 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1351 return;
1352 }
1353 #endif
1354 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1355 if(U_FAILURE(errorCode)) {
1356 return; /* if we couldn't parse UnicodeData.txt, we should return */
1357 }
1358
1359 /* sanity check on repeated properties */
1360 for(c=0xfffe; c<=0x10ffff;) {
1361 type=u_charType(c);
1362 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1363 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1364 }
1365 if(type!=U_UNASSIGNED) {
1366 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1367 }
1368 if((c&0xffff)==0xfffe) {
1369 ++c;
1370 } else {
1371 c+=0xffff;
1372 }
1373 }
1374
1375 /* test that PUA is not "unassigned" */
1376 for(c=0xe000; c<=0x10fffd;) {
1377 type=u_charType(c);
1378 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1379 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1380 }
1381 if(type==U_UNASSIGNED) {
1382 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1383 } else if(type!=U_PRIVATE_USE_CHAR) {
1384 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1385 }
1386 if(c==0xf8ff) {
1387 c=0xf0000;
1388 } else if(c==0xffffd) {
1389 c=0x100000;
1390 } else {
1391 ++c;
1392 }
1393 }
1394
1395 /* test u_enumCharTypes() */
1396 u_enumCharTypes(enumTypeRange, "a1");
1397
1398 /* check default properties */
1399 u_enumCharTypes(enumDefaultsRange, NULL);
1400 }
1401
1402 static void TestCodeUnit(){
1403 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1404
1405 int32_t i;
1406
1407 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1408 UChar c=codeunit[i];
1409 if(i<4){
1410 if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1411 U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1412 log_err("ERROR: U+%04x is a single", c);
1413 }
1414
1415 }
1416 if(i >= 4 && i< 8){
1417 if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1418 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1419 log_err("ERROR: U+%04x is a first surrogate", c);
1420 }
1421 }
1422 if(i >= 8 && i< 12){
1423 if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1424 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1425 log_err("ERROR: U+%04x is a second surrogate", c);
1426 }
1427 }
1428 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1429 if(i<4){
1430 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1431 log_err("ERROR: U+%04x is a single", c);
1432 }
1433
1434 }
1435 if(i >= 4 && i< 8){
1436 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1437 log_err("ERROR: U+%04x is a first surrogate", c);
1438 }
1439 }
1440 if(i >= 8 && i< 12){
1441 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1442 log_err("ERROR: U+%04x is a second surrogate", c);
1443 }
1444 }
1445 #endif
1446 }
1447 }
1448
1449 static void TestCodePoint(){
1450 const UChar32 codePoint[]={
1451 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1452 0xd800,
1453 0xdbff,
1454 0xdc00,
1455 0xdfff,
1456 0xdc04,
1457 0xd821,
1458 /*not a surrogate, valid, isUnicodeChar , not Error*/
1459 0x20ac,
1460 0xd7ff,
1461 0xe000,
1462 0xe123,
1463 0x0061,
1464 0xe065,
1465 0x20402,
1466 0x24506,
1467 0x23456,
1468 0x20402,
1469 0x10402,
1470 0x23456,
1471 /*not a surrogate, not valid, isUnicodeChar, isError */
1472 0x0015,
1473 0x009f,
1474 /*not a surrogate, not valid, not isUnicodeChar, isError */
1475 0xffff,
1476 0xfffe,
1477 };
1478 int32_t i;
1479 for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1480 UChar32 c=codePoint[i];
1481 if(i<6) {
1482 if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1483 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1484 }
1485 if(U_IS_UNICODE_CHAR(c)) {
1486 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1487 }
1488 } else if(i >=6 && i<18) {
1489 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1490 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1491 }
1492 if(!U_IS_UNICODE_CHAR(c)) {
1493 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1494 }
1495 } else if(i >=18 && i<20) {
1496 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1497 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1498 }
1499 if(!U_IS_UNICODE_CHAR(c)) {
1500 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1501 }
1502 } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1503 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1504 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1505 }
1506 if(U_IS_UNICODE_CHAR(c)) {
1507 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1508 }
1509 }
1510 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1511 if(i<6){
1512 if(!UTF_IS_SURROGATE(c)){
1513 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1514 }
1515 if(UTF_IS_VALID(c)){
1516 log_err("ERROR: isValid() failed for U+%04x\n", c);
1517 }
1518 if(UTF_IS_UNICODE_CHAR(c)){
1519 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1520 }
1521 if(UTF_IS_ERROR(c)){
1522 log_err("ERROR: isError() failed for U+%04x\n", c);
1523 }
1524 }else if(i >=6 && i<18){
1525 if(UTF_IS_SURROGATE(c)){
1526 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1527 }
1528 if(!UTF_IS_VALID(c)){
1529 log_err("ERROR: isValid() failed for U+%04x\n", c);
1530 }
1531 if(!UTF_IS_UNICODE_CHAR(c)){
1532 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1533 }
1534 if(UTF_IS_ERROR(c)){
1535 log_err("ERROR: isError() failed for U+%04x\n", c);
1536 }
1537 }else if(i >=18 && i<20){
1538 if(UTF_IS_SURROGATE(c)){
1539 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1540 }
1541 if(UTF_IS_VALID(c)){
1542 log_err("ERROR: isValid() failed for U+%04x\n", c);
1543 }
1544 if(!UTF_IS_UNICODE_CHAR(c)){
1545 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1546 }
1547 if(!UTF_IS_ERROR(c)){
1548 log_err("ERROR: isError() failed for U+%04x\n", c);
1549 }
1550 }
1551 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1552 if(UTF_IS_SURROGATE(c)){
1553 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1554 }
1555 if(UTF_IS_VALID(c)){
1556 log_err("ERROR: isValid() failed for U+%04x\n", c);
1557 }
1558 if(UTF_IS_UNICODE_CHAR(c)){
1559 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1560 }
1561 if(!UTF_IS_ERROR(c)){
1562 log_err("ERROR: isError() failed for U+%04x\n", c);
1563 }
1564 }
1565 #endif
1566 }
1567
1568 if(
1569 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1570 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1571 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1572 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1573 ) {
1574 log_err("error with U_IS_BMP()\n");
1575 }
1576
1577 if(
1578 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1579 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1580 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1581 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1582 ) {
1583 log_err("error with U_IS_SUPPLEMENTARY()\n");
1584 }
1585 }
1586
1587 static void TestCharLength()
1588 {
1589 const int32_t codepoint[]={
1590 1, 0x0061,
1591 1, 0xe065,
1592 1, 0x20ac,
1593 2, 0x20402,
1594 2, 0x23456,
1595 2, 0x24506,
1596 2, 0x20402,
1597 2, 0x10402,
1598 1, 0xd7ff,
1599 1, 0xe000
1600 };
1601
1602 int32_t i;
1603 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1604 UBool multiple;
1605 #endif
1606 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1607 UChar32 c=codepoint[i+1];
1608 if(
1609 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1610 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1611 #endif
1612 U16_LENGTH(c) != codepoint[i]) {
1613 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1614 }
1615 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1616 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1617 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1618 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1619 }
1620 #endif
1621 }
1622 }
1623
1624 /*internal functions ----*/
1625 static int32_t MakeProp(char* str)
1626 {
1627 int32_t result = 0;
1628 char* matchPosition =0;
1629
1630 matchPosition = strstr(tagStrings, str);
1631 if (matchPosition == 0)
1632 {
1633 log_err("unrecognized type letter ");
1634 log_err(str);
1635 }
1636 else
1637 result = (int32_t)((matchPosition - tagStrings) / 2);
1638 return result;
1639 }
1640
1641 static int32_t MakeDir(char* str)
1642 {
1643 int32_t pos = 0;
1644 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1645 if (strcmp(str, dirStrings[pos]) == 0) {
1646 return pos;
1647 }
1648 }
1649 return -1;
1650 }
1651
1652 /* test u_charName() -------------------------------------------------------- */
1653
/*
 * Sample code points with the names expected for each name choice.
 * An empty string means that no name of that kind is expected;
 * alias is NULL where there is no name alias.
 */
static const struct {
    uint32_t code;
    const char *name;       /* modern Unicode character name */
    const char *oldName;    /* Unicode 1.0 name */
    const char *extName;    /* extended name */
    const char *alias;      /* name alias, or NULL */
} names[]={
    { 0x0061,
      "LATIN SMALL LETTER A",
      "",
      "LATIN SMALL LETTER A",
      NULL },
    { 0x01a2,
      "LATIN CAPITAL LETTER OI",
      "",
      "LATIN CAPITAL LETTER OI",
      "LATIN CAPITAL LETTER GHA" },
    { 0x0284,
      "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
      "",
      "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
      NULL },
    { 0x0fd0,
      "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
      "",
      "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
      "TIBETAN MARK BKA- SHOG GI MGO RGYAN" },
    { 0x3401,
      "CJK UNIFIED IDEOGRAPH-3401",
      "",
      "CJK UNIFIED IDEOGRAPH-3401",
      NULL },
    { 0x7fed,
      "CJK UNIFIED IDEOGRAPH-7FED",
      "",
      "CJK UNIFIED IDEOGRAPH-7FED",
      NULL },
    { 0xac00,
      "HANGUL SYLLABLE GA",
      "",
      "HANGUL SYLLABLE GA",
      NULL },
    { 0xd7a3,
      "HANGUL SYLLABLE HIH",
      "",
      "HANGUL SYLLABLE HIH",
      NULL },
    { 0xd800,
      "",
      "",
      "<lead surrogate-D800>",
      NULL },
    { 0xdc00,
      "",
      "",
      "<trail surrogate-DC00>",
      NULL },
    { 0xff08,
      "FULLWIDTH LEFT PARENTHESIS",
      "",
      "FULLWIDTH LEFT PARENTHESIS",
      NULL },
    { 0xffe5,
      "FULLWIDTH YEN SIGN",
      "",
      "FULLWIDTH YEN SIGN",
      NULL },
    { 0xffff,
      "",
      "",
      "<noncharacter-FFFF>",
      NULL },
    { 0x1d0c5,
      "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
      "",
      "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
      "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS" },
    { 0x23456,
      "CJK UNIFIED IDEOGRAPH-23456",
      "",
      "CJK UNIFIED IDEOGRAPH-23456",
      NULL }
};
1681
1682 static UBool
1683 enumCharNamesFn(void *context,
1684 UChar32 code, UCharNameChoice nameChoice,
1685 const char *name, int32_t length) {
1686 int32_t *pCount=(int32_t *)context;
1687 const char *expected;
1688 int i;
1689
1690 if(length<=0 || length!=(int32_t)strlen(name)) {
1691 /* should not be called with an empty string or invalid length */
1692 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1693 return TRUE;
1694 }
1695
1696 ++*pCount;
1697 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1698 if(code==(UChar32)names[i].code) {
1699 switch (nameChoice) {
1700 case U_EXTENDED_CHAR_NAME:
1701 if(0!=strcmp(name, names[i].extName)) {
1702 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1703 }
1704 break;
1705 case U_UNICODE_CHAR_NAME:
1706 if(0!=strcmp(name, names[i].name)) {
1707 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1708 }
1709 break;
1710 case U_UNICODE_10_CHAR_NAME:
1711 expected=names[i].oldName;
1712 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1713 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1714 }
1715 break;
1716 case U_CHAR_NAME_ALIAS:
1717 expected=names[i].alias;
1718 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1719 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1720 }
1721 break;
1722 case U_CHAR_NAME_CHOICE_COUNT:
1723 break;
1724 }
1725 break;
1726 }
1727 }
1728 return TRUE;
1729 }
1730
/* Context for enumExtCharNamesFn(). */
struct enumExtCharNamesContext {
    /* counter passed on to enumCharNamesFn() */
    uint32_t length;
    /* most recently enumerated code point; callers start it at -1 */
    int32_t last;
};
1735
1736 static UBool
1737 enumExtCharNamesFn(void *context,
1738 UChar32 code, UCharNameChoice nameChoice,
1739 const char *name, int32_t length) {
1740 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1741
1742 if (ecncp->last != (int32_t) code - 1) {
1743 if (ecncp->last < 0) {
1744 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1745 } else {
1746 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1747 }
1748 }
1749 ecncp->last = (int32_t) code;
1750
1751 if (!*name) {
1752 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1753 }
1754
1755 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1756 }
1757
1758 /**
1759 * This can be made more efficient by moving it into putil.c and having
1760 * it directly access the ebcdic translation tables.
1761 * TODO: If we get this method in putil.c, then delete it from here.
1762 */
1763 static UChar
1764 u_charToUChar(char c) {
1765 UChar uc;
1766 u_charsToUChars(&c, &uc, 1);
1767 return uc;
1768 }
1769
1770 static void
1771 TestCharNames() {
1772 static char name[80];
1773 UErrorCode errorCode=U_ZERO_ERROR;
1774 struct enumExtCharNamesContext extContext;
1775 const char *expected;
1776 int32_t length;
1777 UChar32 c;
1778 int32_t i;
1779
1780 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1781 length=uprv_getMaxCharNameLength();
1782 if(length==0) {
1783 /* no names data available */
1784 return;
1785 }
1786 if(length<83) { /* Unicode 3.2 max char name length */
1787 log_err("uprv_getMaxCharNameLength()=%d is too short");
1788 }
1789 /* ### TODO same tests for max ISO comment length as for max name length */
1790
1791 log_verbose("Testing u_charName()\n");
1792 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1793 /* modern Unicode character name */
1794 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1795 if(U_FAILURE(errorCode)) {
1796 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1797 return;
1798 }
1799 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1800 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1801 }
1802
1803 /* find the modern name */
1804 if (*names[i].name) {
1805 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1806 if(U_FAILURE(errorCode)) {
1807 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1808 return;
1809 }
1810 if(c!=(UChar32)names[i].code) {
1811 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1812 }
1813 }
1814
1815 /* Unicode 1.0 character name */
1816 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1817 if(U_FAILURE(errorCode)) {
1818 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1819 return;
1820 }
1821 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1822 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1823 }
1824
1825 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1826 if(names[i].oldName[0]!=0 /* && length>0 */) {
1827 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1828 if(U_FAILURE(errorCode)) {
1829 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1830 return;
1831 }
1832 if(c!=(UChar32)names[i].code) {
1833 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1834 }
1835 }
1836
1837 /* Unicode character name alias */
1838 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1839 if(U_FAILURE(errorCode)) {
1840 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1841 return;
1842 }
1843 expected=names[i].alias;
1844 if(expected==NULL) {
1845 expected="";
1846 }
1847 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1848 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1849 names[i].code, name, length, expected);
1850 }
1851
1852 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1853 if(expected[0]!=0 /* && length>0 */) {
1854 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1855 if(U_FAILURE(errorCode)) {
1856 log_err("u_charFromName(%s - alias) error %s\n",
1857 expected, u_errorName(errorCode));
1858 return;
1859 }
1860 if(c!=(UChar32)names[i].code) {
1861 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1862 expected, c, names[i].code);
1863 }
1864 }
1865 }
1866
1867 /* test u_enumCharNames() */
1868 length=0;
1869 errorCode=U_ZERO_ERROR;
1870 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1871 if(U_FAILURE(errorCode) || length<94140) {
1872 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1873 }
1874
1875 extContext.length = 0;
1876 extContext.last = -1;
1877 errorCode=U_ZERO_ERROR;
1878 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1879 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1880 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1881 }
1882
1883 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1884 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1885 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1886 }
1887
1888 /* Test getCharNameCharacters */
1889 if(!getTestOption(QUICK_OPTION)) {
1890 enum { BUFSIZE = 256 };
1891 UErrorCode ec = U_ZERO_ERROR;
1892 char buf[BUFSIZE];
1893 int32_t maxLength;
1894 UChar32 cp;
1895 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1896 int32_t l1, l2;
1897 UBool map[256];
1898 UBool ok;
1899
1900 USet* set = uset_open(1, 0); /* empty set */
1901 USet* dumb = uset_open(1, 0); /* empty set */
1902
1903 /*
1904 * uprv_getCharNameCharacters() will likely return more lowercase
1905 * letters than actual character names contain because
1906 * it includes all the characters in lowercased names of
1907 * general categories, for the full possible set of extended names.
1908 */
1909 {
1910 USetAdder sa={
1911 NULL,
1912 uset_add,
1913 uset_addRange,
1914 uset_addString,
1915 NULL, /* don't need remove() */
1916 NULL /* don't need removeRange() */
1917 };
1918 sa.set=set;
1919 uprv_getCharNameCharacters(&sa);
1920 }
1921
1922 /* build set the dumb (but sure-fire) way */
1923 for (i=0; i<256; ++i) {
1924 map[i] = FALSE;
1925 }
1926
1927 maxLength=0;
1928 for (cp=0; cp<0x110000; ++cp) {
1929 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1930 buf, BUFSIZE, &ec);
1931 if (U_FAILURE(ec)) {
1932 log_err("FAIL: u_charName failed when it shouldn't\n");
1933 uset_close(set);
1934 uset_close(dumb);
1935 return;
1936 }
1937 if(len>maxLength) {
1938 maxLength=len;
1939 }
1940
1941 for (i=0; i<len; ++i) {
1942 if (!map[(uint8_t) buf[i]]) {
1943 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1944 map[(uint8_t) buf[i]] = TRUE;
1945 }
1946 }
1947
1948 /* test for leading/trailing whitespace */
1949 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1950 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1951 }
1952 }
1953
1954 if(map[(uint8_t)'\t']) {
1955 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1956 }
1957
1958 length=uprv_getMaxCharNameLength();
1959 if(length!=maxLength) {
1960 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1961 length, maxLength);
1962 }
1963
1964 /* compare the sets. Where is my uset_equals?!! */
1965 ok=TRUE;
1966 for(i=0; i<256; ++i) {
1967 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1968 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1969 /* ignore lowercase a-z that are in set but not in dumb */
1970 ok=TRUE;
1971 } else {
1972 ok=FALSE;
1973 break;
1974 }
1975 }
1976 }
1977
1978 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1979 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1980 if (U_FAILURE(ec)) {
1981 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1982 uset_close(set);
1983 uset_close(dumb);
1984 return;
1985 }
1986
1987 if (l1 >= BUFSIZE) {
1988 l1 = BUFSIZE-1;
1989 pat[l1] = 0;
1990 }
1991 if (l2 >= BUFSIZE) {
1992 l2 = BUFSIZE-1;
1993 dumbPat[l2] = 0;
1994 }
1995
1996 if (!ok) {
1997 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1998 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1999 } else if(getTestOption(VERBOSITY_OPTION)) {
2000 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
2001 }
2002
2003 uset_close(set);
2004 uset_close(dumb);
2005 }
2006
2007 /* ### TODO: test error cases and other interesting things */
2008 }
2009
2010 static void
2011 TestUCharFromNameUnderflow() {
2012 // Ticket #10889: Underflow crash when there is no dash.
2013 const char *name="<NO BREAK SPACE>";
2014 UErrorCode errorCode=U_ZERO_ERROR;
2015 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2016 if(U_SUCCESS(errorCode)) {
2017 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2018 name, c, u_errorName(errorCode));
2019 }
2020
2021 // Test related edge cases.
2022 name="<-00a0>";
2023 errorCode=U_ZERO_ERROR;
2024 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2025 if(U_SUCCESS(errorCode)) {
2026 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2027 name, c, u_errorName(errorCode));
2028 }
2029
2030 errorCode=U_ZERO_ERROR;
2031 name="<control->";
2032 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2033 if(U_SUCCESS(errorCode)) {
2034 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2035 name, c, u_errorName(errorCode));
2036 }
2037
2038 errorCode=U_ZERO_ERROR;
2039 name="<control-111111>";
2040 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2041 if(U_SUCCESS(errorCode)) {
2042 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2043 name, c, u_errorName(errorCode));
2044 }
2045
2046 // ICU-20292: integer overflow
2047 errorCode=U_ZERO_ERROR;
2048 name="<noncharacter-10010FFFF>";
2049 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2050 if(U_SUCCESS(errorCode)) {
2051 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2052 name, c, u_errorName(errorCode));
2053 }
2054
2055 errorCode=U_ZERO_ERROR;
2056 name="<noncharacter-00010FFFF>"; // too many digits even if only leading 0s
2057 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2058 if(U_SUCCESS(errorCode)) {
2059 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2060 name, c, u_errorName(errorCode));
2061 }
2062
2063 errorCode=U_ZERO_ERROR;
2064 name="<noncharacter-fFFf>>";
2065 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2066 if(U_SUCCESS(errorCode)) {
2067 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2068 name, c, u_errorName(errorCode));
2069 }
2070 }
2071
2072 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2073
2074 static void
2075 TestMirroring() {
2076 USet *set;
2077 UErrorCode errorCode;
2078
2079 UChar32 start, end, c2, c3;
2080 int32_t i;
2081
2082 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2083
2084 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2085
2086 log_verbose("Testing u_isMirrored()\n");
2087 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2088 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2089 )
2090 ) {
2091 log_err("u_isMirrored() does not work correctly\n");
2092 }
2093
2094 log_verbose("Testing u_charMirror()\n");
2095 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2096 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2097 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2098 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2099 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2100 )
2101 ) {
2102 log_err("u_charMirror() does not work correctly\n");
2103 }
2104
2105 /* verify that Bidi_Mirroring_Glyph roundtrips */
2106 errorCode=U_ZERO_ERROR;
2107 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2108
2109 if (U_FAILURE(errorCode)) {
2110 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2111 } else {
2112 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2113 do {
2114 c2=u_charMirror(start);
2115 c3=u_charMirror(c2);
2116 if(c3!=start) {
2117 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2118 }
2119 c3=u_getBidiPairedBracket(start);
2120 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2121 if(c3!=start) {
2122 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2123 (long)start);
2124 }
2125 } else {
2126 if(c3!=c2) {
2127 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2128 (long)start, (long)c2);
2129 }
2130 }
2131 } while(++start<=end);
2132 }
2133 }
2134
2135 uset_close(set);
2136 }
2137
2138
2139 struct RunTestData
2140 {
2141 const char *runText;
2142 UScriptCode runCode;
2143 };
2144
2145 typedef struct RunTestData RunTestData;
2146
2147 static void
2148 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2149 const char *prefix)
2150 {
2151 int32_t run, runStart, runLimit;
2152 UScriptCode runCode;
2153
2154 /* iterate over all the runs */
2155 run = 0;
2156 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2157 if (runStart != runStarts[run]) {
2158 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2159 prefix, run, runStarts[run], runStart);
2160 }
2161
2162 if (runLimit != runStarts[run + 1]) {
2163 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2164 prefix, run, runStarts[run + 1], runLimit);
2165 }
2166
2167 if (runCode != testData[run].runCode) {
2168 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2169 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2170 }
2171
2172 run += 1;
2173
2174 /* stop when we've seen all the runs we expect to see */
2175 if (run >= nRuns) {
2176 break;
2177 }
2178 }
2179
2180 /* Complain if we didn't see then number of runs we expected */
2181 if (run != nRuns) {
2182 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2183 }
2184 }
2185
2186 static void
2187 TestUScriptRunAPI()
2188 {
2189 static const RunTestData testData1[] = {
2190 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2191 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2192 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2193 {"English (", USCRIPT_LATIN},
2194 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2195 {") ", USCRIPT_LATIN},
2196 {"\\u6F22\\u5B75", USCRIPT_HAN},
2197 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2198 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2199 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2200 };
2201
2202 static const RunTestData testData2[] = {
2203 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2204 };
2205
2206 static const struct {
2207 const RunTestData *testData;
2208 int32_t nRuns;
2209 } testDataEntries[] = {
2210 {testData1, UPRV_LENGTHOF(testData1)},
2211 {testData2, UPRV_LENGTHOF(testData2)}
2212 };
2213
2214 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2215 int32_t testEntry;
2216
2217 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2218 UChar testString[1024];
2219 int32_t runStarts[256];
2220 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2221 const RunTestData *testData = testDataEntries[testEntry].testData;
2222
2223 int32_t run, stringLimit;
2224 UScriptRun *scriptRun = NULL;
2225 UErrorCode err;
2226
2227 /*
2228 * Fill in the test string and the runStarts array.
2229 */
2230 stringLimit = 0;
2231 for (run = 0; run < nTestRuns; run += 1) {
2232 runStarts[run] = stringLimit;
2233 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2234 /*stringLimit -= 1;*/
2235 }
2236
2237 /* The limit of the last run */
2238 runStarts[nTestRuns] = stringLimit;
2239
2240 /*
2241 * Make sure that calling uscript_OpenRun with a NULL text pointer
2242 * and a non-zero text length returns the correct error.
2243 */
2244 err = U_ZERO_ERROR;
2245 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2246
2247 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2248 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2249 }
2250
2251 if (scriptRun != NULL) {
2252 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2253 uscript_closeRun(scriptRun);
2254 }
2255
2256 /*
2257 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2258 * and a zero text length returns the correct error.
2259 */
2260 err = U_ZERO_ERROR;
2261 scriptRun = uscript_openRun(testString, 0, &err);
2262
2263 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2264 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2265 }
2266
2267 if (scriptRun != NULL) {
2268 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2269 uscript_closeRun(scriptRun);
2270 }
2271
2272 /*
2273 * Make sure that calling uscript_openRun with a NULL text pointer
2274 * and a zero text length doesn't return an error.
2275 */
2276 err = U_ZERO_ERROR;
2277 scriptRun = uscript_openRun(NULL, 0, &err);
2278
2279 if (U_FAILURE(err)) {
2280 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2281 }
2282
2283 /* Make sure that the empty iterator doesn't find any runs */
2284 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2285 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2286 }
2287
2288 /*
2289 * Make sure that calling uscript_setRunText with a NULL text pointer
2290 * and a non-zero text length returns the correct error.
2291 */
2292 err = U_ZERO_ERROR;
2293 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2294
2295 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2296 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2297 }
2298
2299 /*
2300 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2301 * and a zero text length returns the correct error.
2302 */
2303 err = U_ZERO_ERROR;
2304 uscript_setRunText(scriptRun, testString, 0, &err);
2305
2306 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2307 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2308 }
2309
2310 /*
2311 * Now call uscript_setRunText on the empty iterator
2312 * and make sure that it works.
2313 */
2314 err = U_ZERO_ERROR;
2315 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2316
2317 if (U_FAILURE(err)) {
2318 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2319 } else {
2320 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2321 }
2322
2323 uscript_closeRun(scriptRun);
2324
2325 /*
2326 * Now open an interator over the testString
2327 * using uscript_openRun and make sure that it works
2328 */
2329 scriptRun = uscript_openRun(testString, stringLimit, &err);
2330
2331 if (U_FAILURE(err)) {
2332 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2333 } else {
2334 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2335 }
2336
2337 /* Now reset the iterator, and make sure
2338 * that it still works.
2339 */
2340 uscript_resetRun(scriptRun);
2341
2342 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2343
2344 /* Close the iterator */
2345 uscript_closeRun(scriptRun);
2346 }
2347 }
2348
2349 /* test additional, non-core properties */
2350 static void
2351 TestAdditionalProperties() {
2352 /* test data for u_charAge() */
2353 static const struct {
2354 UChar32 c;
2355 UVersionInfo version;
2356 } charAges[]={
2357 {0x41, { 1, 1, 0, 0 }},
2358 {0xffff, { 1, 1, 0, 0 }},
2359 {0x20ab, { 2, 0, 0, 0 }},
2360 {0x2fffe, { 2, 0, 0, 0 }},
2361 {0x20ac, { 2, 1, 0, 0 }},
2362 {0xfb1d, { 3, 0, 0, 0 }},
2363 {0x3f4, { 3, 1, 0, 0 }},
2364 {0x10300, { 3, 1, 0, 0 }},
2365 {0x220, { 3, 2, 0, 0 }},
2366 {0xff60, { 3, 2, 0, 0 }}
2367 };
2368
2369 /* test data for u_hasBinaryProperty() */
2370 static const int32_t
2371 props[][3]={ /* code point, property, value */
2372 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2373 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2374 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2375
2376 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2377 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2378
2379 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2380 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2381
2382 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2383 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2384
2385 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2386 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2387 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2388 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2389 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2390
2391 { 0x058a, UCHAR_DASH, TRUE },
2392 { 0x007e, UCHAR_DASH, FALSE },
2393
2394 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2395 { 0x3000, UCHAR_DIACRITIC, FALSE },
2396
2397 { 0x0e46, UCHAR_EXTENDER, TRUE },
2398 { 0x0020, UCHAR_EXTENDER, FALSE },
2399
2400 #if !UCONFIG_NO_NORMALIZATION
2401 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2402 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2403 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2404
2405 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2406 { 0x0308, UCHAR_NFD_INERT, FALSE },
2407
2408 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2409 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2410
2411 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2412 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2413 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2414 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2415 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2416 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2417
2418 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2419 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2420
2421 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2422 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2423 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2424 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2425 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2426 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2427 #endif
2428
2429 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2430 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2431 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2432
2433 { 0x30fb, UCHAR_HYPHEN, TRUE },
2434 { 0xfe58, UCHAR_HYPHEN, FALSE },
2435
2436 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2437 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2438 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2439
2440 { 0x2172, UCHAR_ID_START, TRUE },
2441 { 0x007a, UCHAR_ID_START, TRUE },
2442 { 0x0039, UCHAR_ID_START, FALSE },
2443
2444 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2445 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2446 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2447
2448 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2449 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2450
2451 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2452 { 0x0345, UCHAR_LOWERCASE, TRUE },
2453 { 0x0030, UCHAR_LOWERCASE, FALSE },
2454
2455 { 0x1d7a9, UCHAR_MATH, TRUE },
2456 { 0x2135, UCHAR_MATH, TRUE },
2457 { 0x0062, UCHAR_MATH, FALSE },
2458
2459 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2460 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2461 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2462
2463 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2464 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2465 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2466
2467 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2468 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2469
2470 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2471 { 0x2162, UCHAR_UPPERCASE, TRUE },
2472 { 0x0345, UCHAR_UPPERCASE, FALSE },
2473
2474 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2475 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2476 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2477
2478 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2479 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2480 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2481
2482 { 0x16ee, UCHAR_XID_START, TRUE },
2483 { 0x23456, UCHAR_XID_START, TRUE },
2484 { 0x1d1aa, UCHAR_XID_START, FALSE },
2485
2486 /*
2487 * Version break:
2488 * The following properties are only supported starting with the
2489 * Unicode version indicated in the second field.
2490 */
2491 { -1, 0x320, 0 },
2492
2493 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2494 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2495 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2496
2497 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2498 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2499 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2500 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2501
2502 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2503 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2504 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2505 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2506
2507 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2508 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2509 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2510 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2511
2512 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2513 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2514
2515 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2516 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2517
2518 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2519 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2520
2521 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2522 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2523
2524 { 0x2e9b, UCHAR_RADICAL, TRUE },
2525 { 0x4e00, UCHAR_RADICAL, FALSE },
2526
2527 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2528 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2529
2530 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2531 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2532
2533 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2534
2535 { 0x002e, UCHAR_S_TERM, TRUE },
2536 { 0x0061, UCHAR_S_TERM, FALSE },
2537
2538 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2539 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2540 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2541 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2542
2543 /* enum/integer type properties */
2544
2545 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2546 /* test default Bidi classes for unassigned code points */
2547 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2548 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2549 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2550 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2551 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2552 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2553 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2554 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2555 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2556 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2557 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2558
2559 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2560 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2561 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2562 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2563 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2564 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2565 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2566
2567 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2568 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2569 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2570 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2571 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2572 { 0x0870, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2573 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2574 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2575 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2576 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2577 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2578
2579 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2580 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2581
2582 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2583 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2584 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2585 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2586 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2587 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2588 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2589 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2590 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2591
2592 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2593 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2594 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2595 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2596 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2597 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2598 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2599 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2600 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2601 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2602 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2603 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2604 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2605 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2606 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2607 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2608 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2609
2610 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2611 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2612 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2613
2614 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2615 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2616 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2617 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2618 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2619
2620 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2621 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2622 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2623 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2624 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2625 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2626 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2627 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2628
2629 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2630 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2631 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2632 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2633 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2634 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2635 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2636 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2637 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2638 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2639 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2640 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2641 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2642 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2643 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2644 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2645
2646 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2647
2648 /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
2649
2650 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2651 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2652 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2653 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2654 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2655 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2656 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2657
2658 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2659 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2660 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2661 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2662
2663 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2664 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2665 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2666 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2667 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2668 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2669
2670 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2671 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2672 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2673 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2674
2675 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2676 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2677 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2678 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2679 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2680 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2681 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2682
2683 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2684 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2685 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2686 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2687
2688 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2689 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2690 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2691 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2692
2693 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2694 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2695 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2696 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2697 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2698
2699 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2700
2701 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2702
2703 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2704 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2705 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2706
2707 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2708 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2709 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2710 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2711 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2712
2713 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2714 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2715 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2716
2717 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2718 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2719 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2720 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2721
2722 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2723 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2724 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2725 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2726 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2727 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2728
2729 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2730 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2731 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2732 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2733
2734 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2735 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2736 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2737 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2738
2739 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2740 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2741 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2742 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2743
2744 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2745
2746 /* unassigned code points in new default Bidi R blocks */
2747 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2748 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2749
2750 /* test some script codes >127 */
2751 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2752 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2753 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2754
2755 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2756
2757 /* value changed in Unicode 6.0 */
2758 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2759
2760 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2761
2762 /* unassigned code points in new/changed default Bidi AL blocks */
2763 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2764 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2765
2766 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2767
2768 /* unassigned code points in the currency symbols block now default to ET */
2769 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2770 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2771
2772 /* new property in Unicode 6.3 */
2773 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2774 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2775 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2776 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2777 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2778 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2779
2780 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2781
2782 /* new character range with Joining_Group values */
2783 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2784 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2785 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2786 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2787 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2788
2789 { -1, 0xa00, 0 }, // version break for Unicode 10
2790
2791 { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2792 { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2793 { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2794 { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2795
2796 { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2797 { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2798 { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2799
2800 /* undefined UProperty values */
2801 { 0x61, 0x4a7, 0 },
2802 { 0x234bc, 0x15ed, 0 }
2803 };
2804
2805 UVersionInfo version;
2806 UChar32 c;
2807 int32_t i, result, uVersion;
2808 UProperty which;
2809
2810 /* what is our Unicode version? */
2811 u_getUnicodeVersion(version);
2812 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2813
2814 u_charAge(0x20, version);
2815 if(version[0]==0) {
2816 /* no additional properties available */
2817 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2818 return;
2819 }
2820
2821 /* test u_charAge() */
2822 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2823 u_charAge(charAges[i].c, version);
2824 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2825 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2826 charAges[i].c,
2827 version[0], version[1], version[2], version[3],
2828 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2829 }
2830 }
2831
2832 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2833 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2834 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2835 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2836 u_getIntPropertyMinValue(0x2345)!=0
2837 ) {
2838 log_err("error: u_getIntPropertyMinValue() wrong\n");
2839 }
2840 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2841 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2842 }
2843 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2844 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2845 }
2846 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2847 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2848 }
2849 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2850 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2851 }
2852 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2853 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2854 }
2855 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2856 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2857 }
2858 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2859 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2860 }
2861 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2862 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2863 }
2864 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2865 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2866 }
2867 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2868 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2869 }
2870 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2871 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2872 }
2873 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2874 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2875 }
2876 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2877 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2878 }
2879 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2880 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2881 }
2882 /*JB#2410*/
2883 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2884 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2885 }
2886 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2887 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2888 }
2889 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2890 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2891 }
2892 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2893 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2894 }
2895 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2896 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2897 }
2898
2899 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2900 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2901 const char *whichName;
2902
2903 if(props[i][0]<0) {
2904 /* Unicode version break */
2905 if(uVersion<props[i][1]) {
2906 break; /* do not test properties that are not yet supported */
2907 } else {
2908 continue; /* skip this row */
2909 }
2910 }
2911
2912 c=(UChar32)props[i][0];
2913 which=(UProperty)props[i][1];
2914 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2915
2916 if(which<UCHAR_INT_START) {
2917 result=u_hasBinaryProperty(c, which);
2918 if(result!=props[i][2]) {
2919 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2920 c, whichName, result, i);
2921 }
2922 }
2923
2924 result=u_getIntPropertyValue(c, which);
2925 if(result!=props[i][2]) {
2926 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2927 c, whichName, result, props[i][2], i);
2928 }
2929
2930 /* test separate functions, too */
2931 switch((UProperty)props[i][1]) {
2932 case UCHAR_ALPHABETIC:
2933 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2934 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2935 props[i][0], result, i);
2936 }
2937 break;
2938 case UCHAR_LOWERCASE:
2939 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2940 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2941 props[i][0], result, i);
2942 }
2943 break;
2944 case UCHAR_UPPERCASE:
2945 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2946 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2947 props[i][0], result, i);
2948 }
2949 break;
2950 case UCHAR_WHITE_SPACE:
2951 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2952 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2953 props[i][0], result, i);
2954 }
2955 break;
2956 default:
2957 break;
2958 }
2959 }
2960 }
2961
2962 static void
2963 TestNumericProperties(void) {
2964 /* see UnicodeData.txt, DerivedNumericValues.txt */
2965 static const struct {
2966 UChar32 c;
2967 int32_t type;
2968 double numValue;
2969 } values[]={
2970 { 0x0F33, U_NT_NUMERIC, -1./2. },
2971 { 0x0C66, U_NT_DECIMAL, 0 },
2972 { 0x96f6, U_NT_NUMERIC, 0 },
2973 { 0xa833, U_NT_NUMERIC, 1./16. },
2974 { 0x2152, U_NT_NUMERIC, 1./10. },
2975 { 0x2151, U_NT_NUMERIC, 1./9. },
2976 { 0x1245f, U_NT_NUMERIC, 1./8. },
2977 { 0x2150, U_NT_NUMERIC, 1./7. },
2978 { 0x2159, U_NT_NUMERIC, 1./6. },
2979 { 0x09f6, U_NT_NUMERIC, 3./16. },
2980 { 0x2155, U_NT_NUMERIC, 1./5. },
2981 { 0x00BD, U_NT_NUMERIC, 1./2. },
2982 { 0x0031, U_NT_DECIMAL, 1. },
2983 { 0x4e00, U_NT_NUMERIC, 1. },
2984 { 0x58f1, U_NT_NUMERIC, 1. },
2985 { 0x10320, U_NT_NUMERIC, 1. },
2986 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2987 { 0x00B2, U_NT_DIGIT, 2. },
2988 { 0x5f10, U_NT_NUMERIC, 2. },
2989 { 0x1813, U_NT_DECIMAL, 3. },
2990 { 0x5f0e, U_NT_NUMERIC, 3. },
2991 { 0x2173, U_NT_NUMERIC, 4. },
2992 { 0x8086, U_NT_NUMERIC, 4. },
2993 { 0x278E, U_NT_DIGIT, 5. },
2994 { 0x1D7F2, U_NT_DECIMAL, 6. },
2995 { 0x247A, U_NT_DIGIT, 7. },
2996 { 0x7396, U_NT_NUMERIC, 9. },
2997 { 0x1372, U_NT_NUMERIC, 10. },
2998 { 0x216B, U_NT_NUMERIC, 12. },
2999 { 0x16EE, U_NT_NUMERIC, 17. },
3000 { 0x249A, U_NT_NUMERIC, 19. },
3001 { 0x303A, U_NT_NUMERIC, 30. },
3002 { 0x5345, U_NT_NUMERIC, 30. },
3003 { 0x32B2, U_NT_NUMERIC, 37. },
3004 { 0x1375, U_NT_NUMERIC, 40. },
3005 { 0x10323, U_NT_NUMERIC, 50. },
3006 { 0x0BF1, U_NT_NUMERIC, 100. },
3007 { 0x964c, U_NT_NUMERIC, 100. },
3008 { 0x217E, U_NT_NUMERIC, 500. },
3009 { 0x2180, U_NT_NUMERIC, 1000. },
3010 { 0x4edf, U_NT_NUMERIC, 1000. },
3011 { 0x2181, U_NT_NUMERIC, 5000. },
3012 { 0x137C, U_NT_NUMERIC, 10000. },
3013 { 0x4e07, U_NT_NUMERIC, 10000. },
3014 { 0x12432, U_NT_NUMERIC, 216000. },
3015 { 0x12433, U_NT_NUMERIC, 432000. },
3016 { 0x4ebf, U_NT_NUMERIC, 100000000. },
3017 { 0x5146, U_NT_NUMERIC, 1000000000000. },
3018 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
3019 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
3020 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
3021 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
3022 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
3023 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
3024 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
3025 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
3026 };
3027
3028 double nv;
3029 UChar32 c;
3030 int32_t i, type;
3031
3032 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
3033 c=values[i].c;
3034 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
3035 nv=u_getNumericValue(c);
3036
3037 if(type!=values[i].type) {
3038 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
3039 }
3040 if(0.000001 <= fabs(nv - values[i].numValue)) {
3041 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
3042 }
3043 }
3044 }
3045
3046 /**
3047 * Test the property names and property value names API.
3048 */
3049 static void
3050 TestPropertyNames(void) {
3051 int32_t p, v, choice=0, rev;
3052 UBool atLeastSomething = FALSE;
3053
3054 for (p=0; ; ++p) {
3055 UProperty propEnum = (UProperty)p;
3056 UBool sawProp = FALSE;
3057 if(p > 10 && !atLeastSomething) {
3058 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3059 return;
3060 }
3061
3062 for (choice=0; ; ++choice) {
3063 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3064 if (name) {
3065 if (!sawProp)
3066 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3067 log_verbose("%d=\"%s\"", choice, name);
3068 sawProp = TRUE;
3069 atLeastSomething = TRUE;
3070
3071 /* test reverse mapping */
3072 rev = u_getPropertyEnum(name);
3073 if (rev != p) {
3074 log_err("Property round-trip failure: %d -> %s -> %d\n",
3075 p, name, rev);
3076 }
3077 }
3078 if (!name && choice>0) break;
3079 }
3080 if (sawProp) {
3081 /* looks like a valid property; check the values */
3082 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3083 int32_t max = 0;
3084 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3085 max = 255;
3086 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3087 /* it's far too slow to iterate all the way up to
3088 the real max, U_GC_P_MASK */
3089 max = U_GC_NL_MASK;
3090 } else if (p == UCHAR_BLOCK) {
3091 /* UBlockCodes, unlike other values, start at 1 */
3092 max = 1;
3093 }
3094 log_verbose("\n");
3095 for (v=-1; ; ++v) {
3096 UBool sawValue = FALSE;
3097 for (choice=0; ; ++choice) {
3098 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3099 if (vname) {
3100 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3101 log_verbose("%d=\"%s\"", choice, vname);
3102 sawValue = TRUE;
3103
3104 /* test reverse mapping */
3105 rev = u_getPropertyValueEnum(propEnum, vname);
3106 if (rev != v) {
3107 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3108 pname, v, vname, rev);
3109 }
3110 }
3111 if (!vname && choice>0) break;
3112 }
3113 if (sawValue) {
3114 log_verbose("\n");
3115 }
3116 if (!sawValue && v>=max) break;
3117 }
3118 }
3119 if (!sawProp) {
3120 if (p>=UCHAR_STRING_LIMIT) {
3121 break;
3122 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3123 p = UCHAR_STRING_START - 1;
3124 } else if (p>=UCHAR_MASK_LIMIT) {
3125 p = UCHAR_DOUBLE_START - 1;
3126 } else if (p>=UCHAR_INT_LIMIT) {
3127 p = UCHAR_MASK_START - 1;
3128 } else if (p>=UCHAR_BINARY_LIMIT) {
3129 p = UCHAR_INT_START - 1;
3130 }
3131 }
3132 }
3133 }
3134
3135 /**
3136 * Test the property values API. See JB#2410.
3137 */
3138 static void
3139 TestPropertyValues(void) {
3140 int32_t i, p, min, max;
3141 UErrorCode ec;
3142
3143 /* Min should be 0 for everything. */
3144 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3145 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3146 UProperty propEnum = (UProperty)p;
3147 min = u_getIntPropertyMinValue(propEnum);
3148 if (min != 0) {
3149 if (p == UCHAR_BLOCK) {
3150 /* This is okay...for now. See JB#2487.
3151 TODO Update this for JB#2487. */
3152 } else {
3153 const char* name;
3154 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3155 if (name == NULL)
3156 name = "<ERROR>";
3157 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3158 name, min);
3159 }
3160 }
3161 }
3162
3163 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3164 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3165 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3166 }
3167
3168 /* Max should be -1 for invalid properties. */
3169 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3170 if (max != -1) {
3171 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3172 max);
3173 }
3174
3175 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3176 for (i=0; i<2; ++i) {
3177 int32_t script;
3178 const char* desc;
3179 ec = U_ZERO_ERROR;
3180 switch (i) {
3181 case 0:
3182 script = uscript_getScript(-1, &ec);
3183 desc = "uscript_getScript(-1)";
3184 break;
3185 case 1:
3186 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3187 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3188 break;
3189 default:
3190 log_err("Internal test error. Too many scripts\n");
3191 return;
3192 }
3193 /* We don't explicitly test ec. It should be U_FAILURE but it
3194 isn't documented as such. */
3195 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3196 log_err("FAIL: %s = %d, exp. 0\n",
3197 desc, script);
3198 }
3199 }
3200 }
3201
3202 /* various tests for consistency of UCD data and API behavior */
3203 static void
3204 TestConsistency() {
3205 char buffer[300];
3206 USet *set1, *set2, *set3, *set4;
3207 UErrorCode errorCode;
3208
3209 UChar32 start, end;
3210 int32_t i, length;
3211
3212 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3213 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3214 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3215 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3216 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3217
3218 U_STRING_DECL(mathBlocksPattern,
3219 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3220 214);
3221 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3222 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3223 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3224 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3225
3226 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3227 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3228 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3229 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3230 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3231
3232 U_STRING_INIT(mathBlocksPattern,
3233 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3234 214);
3235 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3236 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3237 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3238 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3239
3240 /*
3241 * It used to be that UCD.html and its precursors said
3242 * "Those dashes used to mark connections between pieces of words,
3243 * plus the Katakana middle dot."
3244 *
3245 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3246 * but not from Hyphen.
3247 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3248 * Therefore, do not show errors when testing the Hyphen property.
3249 */
3250 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3251 "known to the UTC and not considered errors.\n");
3252
3253 errorCode=U_ZERO_ERROR;
3254 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3255 set2=uset_openPattern(dashPattern, 8, &errorCode);
3256 if(U_SUCCESS(errorCode)) {
3257 /* remove the Katakana middle dot(s) from set1 */
3258 uset_remove(set1, 0x30fb);
3259 uset_remove(set1, 0xff65); /* halfwidth variant */
3260 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3261 } else {
3262 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3263 }
3264
3265 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3266 set3=uset_openPattern(formatPattern, 6, &errorCode);
3267 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3268 if(U_SUCCESS(errorCode)) {
3269 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3270 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3271 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3272 } else {
3273 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3274 }
3275
3276 uset_close(set1);
3277 uset_close(set2);
3278 uset_close(set3);
3279 uset_close(set4);
3280
3281 /*
3282 * Check that each lowercase character has "small" in its name
3283 * and not "capital".
3284 * There are some such characters, some of which seem odd.
3285 * Use the verbose flag to see these notices.
3286 */
3287 errorCode=U_ZERO_ERROR;
3288 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3289 if(U_SUCCESS(errorCode)) {
3290 for(i=0;; ++i) {
3291 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3292 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3293 break; /* done */
3294 }
3295 if(U_FAILURE(errorCode)) {
3296 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3297 i, u_errorName(errorCode));
3298 break;
3299 }
3300 if(length!=0) {
3301 break; /* done with code points, got a string or -1 */
3302 }
3303
3304 while(start<=end) {
3305 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3306 if(U_FAILURE(errorCode)) {
3307 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3308 errorCode=U_ZERO_ERROR;
3309 }
3310 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3311 strstr(buffer, "SMALL CAPITAL")==NULL
3312 ) {
3313 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3314 }
3315 ++start;
3316 }
3317 }
3318 } else {
3319 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3320 }
3321 uset_close(set1);
3322
3323 /* verify that all assigned characters in Math blocks are exactly Math characters */
3324 errorCode=U_ZERO_ERROR;
3325 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3326 set2=uset_openPattern(mathPattern, 8, &errorCode);
3327 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3328 if(U_SUCCESS(errorCode)) {
3329 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3330 uset_complement(set3); /* assigned characters */
3331 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3332 compareUSets(set1, set2,
3333 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3334 TRUE);
3335 } else {
3336 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3337 }
3338 uset_close(set1);
3339 uset_close(set2);
3340 uset_close(set3);
3341
3342 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3343 errorCode=U_ZERO_ERROR;
3344 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3345 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3346 if(U_SUCCESS(errorCode)) {
3347 compareUSets(set1, set2,
3348 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3349 TRUE);
3350 } else {
3351 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3352 }
3353 uset_close(set1);
3354 uset_close(set2);
3355 }
3356
3357 /* test case folding, compare return values with CaseFolding.txt ------------ */
3358
/*
 * Bit flags, one per kind of case folding. A combination of them records
 * which case foldings of a character have been tested already.
 */
enum {
    CF_SIMPLE = 1 << 0,
    CF_FULL   = 1 << 1,
    CF_TURKIC = 1 << 2,
    CF_ALL    = CF_SIMPLE | CF_FULL | CF_TURKIC
};
3366
3367 static void
3368 testFold(UChar32 c, int which,
3369 UChar32 simple, UChar32 turkic,
3370 const UChar *full, int32_t fullLength,
3371 const UChar *turkicFull, int32_t turkicFullLength) {
3372 UChar s[2], t[32];
3373 UChar32 c2;
3374 int32_t length, length2;
3375
3376 UErrorCode errorCode=U_ZERO_ERROR;
3377
3378 length=0;
3379 U16_APPEND_UNSAFE(s, length, c);
3380
3381 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3382 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3383 }
3384 if((which&CF_FULL)!=0) {
3385 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3386 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3387 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3388 }
3389 }
3390 if((which&CF_TURKIC)!=0) {
3391 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3392 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3393 }
3394
3395 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3396 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3397 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3398 }
3399 }
3400 }
3401
3402 /* test that c case-folds to itself */
3403 static void
3404 testFoldToSelf(UChar32 c, int which) {
3405 UChar s[2];
3406 int32_t length;
3407
3408 length=0;
3409 U16_APPEND_UNSAFE(s, length, c);
3410 testFold(c, which, c, c, s, length, s, length);
3411 }
3412
3413 struct CaseFoldingData {
3414 USet *notSeen;
3415 UChar32 prev, prevSimple;
3416 UChar prevFull[32];
3417 int32_t prevFullLength;
3418 int which;
3419 };
3420 typedef struct CaseFoldingData CaseFoldingData;
3421
3422 static void U_CALLCONV
3423 caseFoldingLineFn(void *context,
3424 char *fields[][2], int32_t fieldCount,
3425 UErrorCode *pErrorCode) {
3426 (void)fieldCount; // suppress compiler warnings about unused variable
3427
3428 CaseFoldingData *pData=(CaseFoldingData *)context;
3429 char *end;
3430 UChar full[32];
3431 UChar32 c, prev, simple;
3432 int32_t count;
3433 int which;
3434 char status;
3435
3436 /* get code point */
3437 const char *s=u_skipWhitespace(fields[0][0]);
3438 if(0==strncmp(s, "0000..10FFFF", 12)) {
3439 /*
3440 * Ignore the line
3441 * # @missing: 0000..10FFFF; C; <code point>
3442 * because maps-to-self is already our default, and this line breaks this parser.
3443 */
3444 return;
3445 }
3446 c=(UChar32)strtoul(s, &end, 16);
3447 end=(char *)u_skipWhitespace(end);
3448 if(end<=fields[0][0] || end!=fields[0][1]) {
3449 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3450 *pErrorCode=U_PARSE_ERROR;
3451 return;
3452 }
3453
3454 /* get the status of this mapping */
3455 status=*u_skipWhitespace(fields[1][0]);
3456 if(status!='C' && status!='S' && status!='F' && status!='T') {
3457 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3458 *pErrorCode=U_PARSE_ERROR;
3459 return;
3460 }
3461
3462 /* get the mapping */
3463 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3464 if(U_FAILURE(*pErrorCode)) {
3465 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3466 return;
3467 }
3468
3469 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3470 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3471 simple=c;
3472 }
3473
3474 if(c!=(prev=pData->prev)) {
3475 /*
3476 * Test remaining mappings for the previous code point.
3477 * If a turkic folding was not mentioned, then it should fold the same
3478 * as the regular simple case folding.
3479 */
3480 UChar prevString[2];
3481 int32_t length;
3482
3483 length=0;
3484 U16_APPEND_UNSAFE(prevString, length, prev);
3485 testFold(prev, (~pData->which)&CF_ALL,
3486 prev, pData->prevSimple,
3487 prevString, length,
3488 pData->prevFull, pData->prevFullLength);
3489 pData->prev=pData->prevSimple=c;
3490 length=0;
3491 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3492 pData->prevFullLength=length;
3493 pData->which=0;
3494 }
3495
3496 /*
3497 * Turn the status into a bit set of case foldings to test.
3498 * Remember non-Turkic case foldings as defaults for Turkic mode.
3499 */
3500 switch(status) {
3501 case 'C':
3502 which=CF_SIMPLE|CF_FULL;
3503 pData->prevSimple=simple;
3504 u_memcpy(pData->prevFull, full, count);
3505 pData->prevFullLength=count;
3506 break;
3507 case 'S':
3508 which=CF_SIMPLE;
3509 pData->prevSimple=simple;
3510 break;
3511 case 'F':
3512 which=CF_FULL;
3513 u_memcpy(pData->prevFull, full, count);
3514 pData->prevFullLength=count;
3515 break;
3516 case 'T':
3517 which=CF_TURKIC;
3518 break;
3519 default:
3520 which=0;
3521 break; /* won't happen because of test above */
3522 }
3523
3524 testFold(c, which, simple, simple, full, count, full, count);
3525
3526 /* remember which case foldings of c have been tested */
3527 pData->which|=which;
3528
3529 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3530 uset_remove(pData->notSeen, c);
3531 }
3532
3533 static void
3534 TestCaseFolding() {
3535 CaseFoldingData data={ NULL, 0, 0, {0}, 0, 0 };
3536 char *fields[3][2];
3537 UErrorCode errorCode;
3538
3539 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3540
3541 errorCode=U_ZERO_ERROR;
3542 /* test BMP & plane 1 - nothing interesting above */
3543 data.notSeen=uset_open(0, 0x1ffff);
3544 data.prevFullLength=1; /* length of full case folding of U+0000 */
3545
3546 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3547 if(U_SUCCESS(errorCode)) {
3548 int32_t i, start, end;
3549
3550 /* add a pseudo-last line to finish testing of the actual last one */
3551 fields[0][0]=lastLine;
3552 fields[0][1]=lastLine+6;
3553 fields[1][0]=lastLine+7;
3554 fields[1][1]=lastLine+9;
3555 fields[2][0]=lastLine+10;
3556 fields[2][1]=lastLine+17;
3557 caseFoldingLineFn(&data, fields, 3, &errorCode);
3558
3559 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3560 for(i=0;
3561 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3562 U_SUCCESS(errorCode);
3563 ++i
3564 ) {
3565 do {
3566 testFoldToSelf(start, CF_ALL);
3567 } while(++start<=end);
3568 }
3569 }
3570
3571 uset_close(data.notSeen);
3572 }
3573
3574 static void TestBinaryCharacterPropertiesAPI() {
3575 // API test only. See intltest/ucdtest.cpp for functional test.
3576 UErrorCode errorCode = U_ZERO_ERROR;
3577 const USet *set = u_getBinaryPropertySet(-1, &errorCode);
3578 if (U_SUCCESS(errorCode)) {
3579 log_err("u_getBinaryPropertySet(-1) did not fail\n");
3580 }
3581 errorCode = U_ZERO_ERROR;
3582 set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
3583 if (U_SUCCESS(errorCode)) {
3584 log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
3585 }
3586 errorCode = U_ZERO_ERROR;
3587 set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
3588 if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
3589 log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
3590 }
3591 }
3592
3593 static void TestIntCharacterPropertiesAPI() {
3594 // API test only. See intltest/ucdtest.cpp for functional test.
3595 UErrorCode errorCode = U_ZERO_ERROR;
3596 const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
3597 if (U_SUCCESS(errorCode)) {
3598 log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
3599 }
3600 errorCode = U_ZERO_ERROR;
3601 map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
3602 if (U_SUCCESS(errorCode)) {
3603 log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
3604 }
3605 errorCode = U_ZERO_ERROR;
3606 map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
3607 if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
3608 log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
3609 }
3610 }