]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/cucdtst.c
15edead5e3457236001c3b20b3b67bc35aa323f0
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28
29 #include "cintltst.h"
30 #include "putilimp.h"
31 #include "uparse.h"
32 #include "ucase.h"
33 #include "ubidi_props.h"
34 #include "uprops.h"
35 #include "uset_imp.h"
36 #include "usc_impl.h"
37 #include "udatamem.h"
38 #include "cucdapi.h"
39 #include "cmemory.h"
40
41 /* prototypes --------------------------------------------------------------- */
42
/* case mapping and character-class tests */
static void TestUpperLower(void);
static void TestLetterNumber(void);
static void TestMisc(void);
static void TestPOSIX(void);
static void TestControlPrint(void);
static void TestIdentifier(void);
static void TestCaseFolding(void);

/* code unit / code point tests */
static void TestCodeUnit(void);
static void TestCodePoint(void);
static void TestCharLength(void);

/* character name tests */
static void TestCharNames(void);
static void TestUCharFromNameUnderflow(void);

/* property data tests */
static void TestUnicodeData(void);
static void TestMirroring(void);
static void TestUScriptRunAPI(void);
static void TestAdditionalProperties(void);
static void TestNumericProperties(void);
static void TestPropertyNames(void);
static void TestPropertyValues(void);
static void TestConsistency(void);
static void TestUBiDiProps(void);

/* internal helpers used by the tests */
static int32_t MakeProp(char *str);
static int32_t MakeDir(char *str);
68
69 /* helpers ------------------------------------------------------------------ */
70
71 static void
72 parseUCDFile(const char *filename,
73 char *fields[][2], int32_t fieldCount,
74 UParseLineFn *lineFn, void *context,
75 UErrorCode *pErrorCode) {
76 char path[256];
77 char backupPath[256];
78
79 if(U_FAILURE(*pErrorCode)) {
80 return;
81 }
82
83 /* Look inside ICU_DATA first */
84 strcpy(path, u_getDataDirectory());
85 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
86 strcat(path, filename);
87
88 /* As a fallback, try to guess where the source data was located
89 * at the time ICU was built, and look there.
90 */
91 strcpy(backupPath, ctest_dataSrcDir());
92 strcat(backupPath, U_FILE_SEP_STRING);
93 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
94 strcat(backupPath, filename);
95
96 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
97 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
98 *pErrorCode=U_ZERO_ERROR;
99 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
100 }
101 if(U_FAILURE(*pErrorCode)) {
102 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
103 }
104 }
105
106 /* test data ---------------------------------------------------------------- */
107
108 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
109 static const int32_t tagValues[] =
110 {
111 /* Mn */ U_NON_SPACING_MARK,
112 /* Mc */ U_COMBINING_SPACING_MARK,
113 /* Me */ U_ENCLOSING_MARK,
114 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
115 /* Nl */ U_LETTER_NUMBER,
116 /* No */ U_OTHER_NUMBER,
117 /* Zs */ U_SPACE_SEPARATOR,
118 /* Zl */ U_LINE_SEPARATOR,
119 /* Zp */ U_PARAGRAPH_SEPARATOR,
120 /* Cc */ U_CONTROL_CHAR,
121 /* Cf */ U_FORMAT_CHAR,
122 /* Cs */ U_SURROGATE,
123 /* Co */ U_PRIVATE_USE_CHAR,
124 /* Cn */ U_UNASSIGNED,
125 /* Lu */ U_UPPERCASE_LETTER,
126 /* Ll */ U_LOWERCASE_LETTER,
127 /* Lt */ U_TITLECASE_LETTER,
128 /* Lm */ U_MODIFIER_LETTER,
129 /* Lo */ U_OTHER_LETTER,
130 /* Pc */ U_CONNECTOR_PUNCTUATION,
131 /* Pd */ U_DASH_PUNCTUATION,
132 /* Ps */ U_START_PUNCTUATION,
133 /* Pe */ U_END_PUNCTUATION,
134 /* Po */ U_OTHER_PUNCTUATION,
135 /* Sm */ U_MATH_SYMBOL,
136 /* Sc */ U_CURRENCY_SYMBOL,
137 /* Sk */ U_MODIFIER_SYMBOL,
138 /* So */ U_OTHER_SYMBOL,
139 /* Pi */ U_INITIAL_PUNCTUATION,
140 /* Pf */ U_FINAL_PUNCTUATION
141 };
142
/*
 * Bidi class abbreviations; each entry is at most 3 characters plus NUL.
 * The order of the entries is significant and must not be changed.
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
169
170 void addUnicodeTest(TestNode** root);
171
172 void addUnicodeTest(TestNode** root)
173 {
174 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
175 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
176 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
177 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
178 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
179 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
180 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
181 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
182 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
183 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
184 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
185 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
186 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
187 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
188 addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
189 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
190 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
191 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
192 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
193 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
194 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
195 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
196 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
197 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
198 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps");
199 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
200 }
201
202 /*==================================================== */
203 /* test u_toupper() and u_tolower() */
204 /*==================================================== */
205 static void TestUpperLower()
206 {
207 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
208 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
209 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
210 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
211 int32_t i;
212
213 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
214 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
215
216 /*
217 Checks LetterLike Symbols which were previously a source of confusion
218 [Bertrand A. D. 02/04/98]
219 */
220 for (i=0x2100;i<0x2138;i++)
221 {
222 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
223 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
224 {
225 if (i != (int)u_tolower(i)) /* itself */
226 log_err("Failed case conversion with itself: U+%04x\n", i);
227 if (i != (int)u_toupper(i))
228 log_err("Failed case conversion with itself: U+%04x\n", i);
229 }
230 }
231
232 for(i=0; i < u_strlen(upper); i++){
233 if(u_tolower(upper[i]) != lower[i]){
234 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
235 }
236 }
237
238 log_verbose("testing upper lower\n");
239 for (i = 0; i < 21; i++) {
240
241 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
242 {
243 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
244 }
245 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
246 {
247 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
248 }
249 else if (upperTest[i] != u_tolower(lowerTest[i]))
250 {
251 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
252 }
253 else if (lowerTest[i] != u_toupper(upperTest[i]))
254 {
255 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
256 }
257 else if (upperTest[i] != u_tolower(upperTest[i]))
258 {
259 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
260 }
261 else if (lowerTest[i] != u_toupper(lowerTest[i]))
262 {
263 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
264 }
265 }
266 log_verbose("done testing upper lower\n");
267
268 log_verbose("testing u_istitle\n");
269 {
270 static const UChar expected[] = {
271 0x1F88,
272 0x1F89,
273 0x1F8A,
274 0x1F8B,
275 0x1F8C,
276 0x1F8D,
277 0x1F8E,
278 0x1F8F,
279 0x1F88,
280 0x1F89,
281 0x1F8A,
282 0x1F8B,
283 0x1F8C,
284 0x1F8D,
285 0x1F8E,
286 0x1F8F,
287 0x1F98,
288 0x1F99,
289 0x1F9A,
290 0x1F9B,
291 0x1F9C,
292 0x1F9D,
293 0x1F9E,
294 0x1F9F,
295 0x1F98,
296 0x1F99,
297 0x1F9A,
298 0x1F9B,
299 0x1F9C,
300 0x1F9D,
301 0x1F9E,
302 0x1F9F,
303 0x1FA8,
304 0x1FA9,
305 0x1FAA,
306 0x1FAB,
307 0x1FAC,
308 0x1FAD,
309 0x1FAE,
310 0x1FAF,
311 0x1FA8,
312 0x1FA9,
313 0x1FAA,
314 0x1FAB,
315 0x1FAC,
316 0x1FAD,
317 0x1FAE,
318 0x1FAF,
319 0x1FBC,
320 0x1FBC,
321 0x1FCC,
322 0x1FCC,
323 0x1FFC,
324 0x1FFC,
325 };
326 int32_t num = UPRV_LENGTHOF(expected);
327 for(i=0; i<num; i++){
328 if(!u_istitle(expected[i])){
329 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
330 }
331 }
332
333 }
334 }
335
336 /* compare two sets and verify that their difference or intersection is empty */
337 static UBool
338 showADiffB(const USet *a, const USet *b,
339 const char *a_name, const char *b_name,
340 UBool expect, UBool diffIsError) {
341 USet *aa;
342 int32_t i, start, end, length;
343 UErrorCode errorCode;
344
345 /*
346 * expect:
347 * TRUE -> a-b should be empty, that is, b should contain all of a
348 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
349 */
350 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
351 return TRUE;
352 }
353
354 /* clone a to aa because a is const */
355 aa=uset_open(1, 0);
356 if(aa==NULL) {
357 /* unusual problem - out of memory? */
358 return FALSE;
359 }
360 uset_addAll(aa, a);
361
362 /* compute the set in question */
363 if(expect) {
364 /* a-b */
365 uset_removeAll(aa, b);
366 } else {
367 /* a&b */
368 uset_retainAll(aa, b);
369 }
370
371 /* aa is not empty because of the initial tests above; show its contents */
372 errorCode=U_ZERO_ERROR;
373 i=0;
374 for(;;) {
375 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
376 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
377 break; /* done */
378 }
379 if(U_FAILURE(errorCode)) {
380 log_err("error comparing %s with %s at difference item %d: %s\n",
381 a_name, b_name, i, u_errorName(errorCode));
382 break;
383 }
384 if(length!=0) {
385 break; /* done with code points, got a string or -1 */
386 }
387
388 if(diffIsError) {
389 if(expect) {
390 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
391 } else {
392 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
393 }
394 } else {
395 if(expect) {
396 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
397 } else {
398 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
399 }
400 }
401
402 ++i;
403 }
404
405 uset_close(aa);
406 return FALSE;
407 }
408
409 static UBool
410 showAMinusB(const USet *a, const USet *b,
411 const char *a_name, const char *b_name,
412 UBool diffIsError) {
413 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
414 }
415
416 static UBool
417 showAIntersectB(const USet *a, const USet *b,
418 const char *a_name, const char *b_name,
419 UBool diffIsError) {
420 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
421 }
422
423 static UBool
424 compareUSets(const USet *a, const USet *b,
425 const char *a_name, const char *b_name,
426 UBool diffIsError) {
427 /*
428 * Use an arithmetic & not a logical && so that both branches
429 * are always taken and all differences are shown.
430 */
431 return
432 showAMinusB(a, b, a_name, b_name, diffIsError) &
433 showAMinusB(b, a, b_name, a_name, diffIsError);
434 }
435
436 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
437 static void TestLetterNumber()
438 {
439 UChar i = 0x0000;
440
441 log_verbose("Testing for isalpha\n");
442 for (i = 0x0041; i < 0x005B; i++) {
443 if (!u_isalpha(i))
444 {
445 log_err("Failed isLetter test at %.4X\n", i);
446 }
447 }
448 for (i = 0x0660; i < 0x066A; i++) {
449 if (u_isalpha(i))
450 {
451 log_err("Failed isLetter test with numbers at %.4X\n", i);
452 }
453 }
454
455 log_verbose("Testing for isdigit\n");
456 for (i = 0x0660; i < 0x066A; i++) {
457 if (!u_isdigit(i))
458 {
459 log_verbose("Failed isNumber test at %.4X\n", i);
460 }
461 }
462
463 log_verbose("Testing for isalnum\n");
464 for (i = 0x0041; i < 0x005B; i++) {
465 if (!u_isalnum(i))
466 {
467 log_err("Failed isAlNum test at %.4X\n", i);
468 }
469 }
470 for (i = 0x0660; i < 0x066A; i++) {
471 if (!u_isalnum(i))
472 {
473 log_err("Failed isAlNum test at %.4X\n", i);
474 }
475 }
476
477 {
478 /*
479 * The following checks work only starting from Unicode 4.0.
480 * Check the version number here.
481 */
482 static UVersionInfo u401={ 4, 0, 1, 0 };
483 UVersionInfo version;
484 u_getUnicodeVersion(version);
485 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
486 return;
487 }
488 }
489
490 {
491 /*
492 * Sanity check:
493 * Verify that exactly the digit characters have decimal digit values.
494 * This assumption is used in the implementation of u_digit()
495 * (which checks nt=de)
496 * compared with the parallel java.lang.Character.digit()
497 * (which checks Nd).
498 *
499 * This was not true in Unicode 3.2 and earlier.
500 * Unicode 4.0 fixed discrepancies.
501 * Unicode 4.0.1 re-introduced problems in this area due to an
502 * unintentionally incomplete last-minute change.
503 */
504 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
505 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
506
507 USet *digits, *decimalValues;
508 UErrorCode errorCode;
509
510 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
511 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
512 errorCode=U_ZERO_ERROR;
513 digits=uset_openPattern(digitsPattern, 6, &errorCode);
514 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
515
516 if(U_SUCCESS(errorCode)) {
517 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
518 }
519
520 uset_close(digits);
521 uset_close(decimalValues);
522 }
523 }
524
525 static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
526 const UChar32 *sampleChars, int32_t sampleCharsLength,
527 UBool expected) {
528 int32_t i;
529 for (i = 0; i < sampleCharsLength; ++i) {
530 UBool result = propFn(sampleChars[i]);
531 if (result != expected) {
532 log_err("error: character property function %s(U+%04x)=%d is wrong\n",
533 propName, sampleChars[i], result);
534 }
535 }
536 }
537
538 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
539 static void TestMisc()
540 {
541 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
542 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
543 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
544 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
545 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
546 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
547 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
548 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
549 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
550 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
551 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
552
553 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
554
555 uint32_t mask;
556
557 int32_t i;
558 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
559 UVersionInfo realVersion;
560
561 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
562
563 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
564 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
565
566 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
567 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
568 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
569 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
570
571 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
572 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
573 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
574 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
575
576 testSampleCharProps(u_isdefined, "u_isdefined",
577 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
578 testSampleCharProps(u_isdefined, "u_isdefined",
579 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
580
581 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
582 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
583
584 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
585 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
586
587 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
588 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
589 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
590 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
591 }
592 }
593
594 /* Tests the ICU version #*/
595 u_getVersion(realVersion);
596 u_versionToString(realVersion, icuVersion);
597 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
598 {
599 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
600 }
601 #if defined(ICU_VERSION)
602 /* test only happens where we have configure.in with VERSION - sanity check. */
603 if(strcmp(U_ICU_VERSION, ICU_VERSION))
604 {
605 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
606 }
607 #endif
608
609 /* test U_GC_... */
610 if(
611 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
612 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
613 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
614 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
615 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
616 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
617 ) {
618 log_err("error: U_GET_GC_MASK does not work properly\n");
619 }
620
621 mask=0;
622 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
623
624 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
625 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
626 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
627 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
628 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
629
630 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
631 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
632 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
633
634 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
635 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
636 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
637
638 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
639 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
640 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
641
642 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
643 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
644 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
645 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
646
647 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
648 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
649 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
650 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
651 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
652
653 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
654 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
655 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
656 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
657
658 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
659 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
660
661 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
662 log_err("error: problems with U_GC_XX_MASK constants\n");
663 }
664
665 mask=0;
666 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
667 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
668 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
669 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
670 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
671 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
672 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
673
674 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
675 log_err("error: problems with U_GC_Y_MASK constants\n");
676 }
677 {
678 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
679 for(i=0; i<10; i++){
680 if(digit[i]!=u_forDigit(i,10)){
681 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
682 }
683 }
684 }
685
686 /* test u_digit() */
687 {
688 static const struct {
689 UChar32 c;
690 int8_t radix, value;
691 } data[]={
692 /* base 16 */
693 { 0x0031, 16, 1 },
694 { 0x0038, 16, 8 },
695 { 0x0043, 16, 12 },
696 { 0x0066, 16, 15 },
697 { 0x00e4, 16, -1 },
698 { 0x0662, 16, 2 },
699 { 0x06f5, 16, 5 },
700 { 0xff13, 16, 3 },
701 { 0xff41, 16, 10 },
702
703 /* base 8 */
704 { 0x0031, 8, 1 },
705 { 0x0038, 8, -1 },
706 { 0x0043, 8, -1 },
707 { 0x0066, 8, -1 },
708 { 0x00e4, 8, -1 },
709 { 0x0662, 8, 2 },
710 { 0x06f5, 8, 5 },
711 { 0xff13, 8, 3 },
712 { 0xff41, 8, -1 },
713
714 /* base 36 */
715 { 0x5a, 36, 35 },
716 { 0x7a, 36, 35 },
717 { 0xff3a, 36, 35 },
718 { 0xff5a, 36, 35 },
719
720 /* wrong radix values */
721 { 0x0031, 1, -1 },
722 { 0xff3a, 37, -1 }
723 };
724
725 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
726 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
727 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
728 data[i].c,
729 data[i].radix,
730 u_digit(data[i].c, data[i].radix),
731 data[i].value);
732 }
733 }
734 }
735 }
736
737 /* test C/POSIX-style functions --------------------------------------------- */
738
/* bit flags, one per C/POSIX-style character class */
#define ISAL 0x001  /* alpha */
#define ISLO 0x002  /* lower */
#define ISUP 0x004  /* upper */

#define ISDI 0x008  /* digit */
#define ISXD 0x010  /* xdigit */

#define ISAN 0x020  /* alnum */

#define ISPU 0x040  /* punct */
#define ISGR 0x080  /* graph */
#define ISPR 0x100  /* print */

#define ISSP 0x200  /* space */
#define ISBL 0x400  /* blank */
#define ISCN 0x800  /* cntrl */
756
757 /* C/POSIX-style functions, in the same order as the bit flags */
758 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
759
760 static const struct {
761 IsPOSIXClass *fn;
762 const char *name;
763 } posixClasses[]={
764 { u_isalpha, "isalpha" },
765 { u_islower, "islower" },
766 { u_isupper, "isupper" },
767 { u_isdigit, "isdigit" },
768 { u_isxdigit, "isxdigit" },
769 { u_isalnum, "isalnum" },
770 { u_ispunct, "ispunct" },
771 { u_isgraph, "isgraph" },
772 { u_isprint, "isprint" },
773 { u_isspace, "isspace" },
774 { u_isblank, "isblank" },
775 { u_iscntrl, "iscntrl" }
776 };
777
778 static const struct {
779 UChar32 c;
780 uint32_t posixResults;
781 } posixData[]={
782 { 0x0008, ISCN }, /* backspace */
783 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
784 { 0x000a, ISSP| ISCN }, /* LF */
785 { 0x000c, ISSP| ISCN }, /* FF */
786 { 0x000d, ISSP| ISCN }, /* CR */
787 { 0x0020, ISPR|ISSP|ISBL }, /* space */
788 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
789 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
790 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
791 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
792 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
793 { 0x007b, ISPU|ISGR|ISPR }, /* { */
794 { 0x0085, ISSP| ISCN }, /* NEL */
795 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
796 { 0x00a4, ISGR|ISPR }, /* currency sign */
797 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
798 { 0x0300, ISGR|ISPR }, /* combining grave */
799 { 0x0600, ISCN }, /* arabic number sign */
800 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
801 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
802 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
803 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
804 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
805 { 0x200b, ISCN }, /* ZWSP */
806 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
807 { 0x200e, ISCN }, /* LRM */
808 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
809 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
810 { 0x20ac, ISGR|ISPR }, /* Euro */
811 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
812 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
813 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
814 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
815 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
816 };
817
818 static void
819 TestPOSIX() {
820 uint32_t mask;
821 int32_t cl, i;
822 UBool expect;
823
824 mask=1;
825 for(cl=0; cl<12; ++cl) {
826 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
827 expect=(UBool)((posixData[i].posixResults&mask)!=0);
828 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
829 log_err("u_%s(U+%04x)=%s is wrong\n",
830 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
831 }
832 }
833 mask<<=1;
834 }
835 }
836
837 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
838 static void TestControlPrint()
839 {
840 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
841 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
842 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
843 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
844 UChar32 c;
845
846 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
847 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
848
849 testSampleCharProps(u_isprint, "u_isprint",
850 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
851 testSampleCharProps(u_isprint, "u_isprint",
852 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
853
854 /* test all ISO 8 controls */
855 for(c=0; c<=0x9f; ++c) {
856 if(c==0x20) {
857 /* skip ASCII graphic characters and continue with DEL */
858 c=0x7f;
859 }
860 if(!u_iscntrl(c)) {
861 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
862 }
863 if(!u_isISOControl(c)) {
864 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
865 }
866 if(u_isprint(c)) {
867 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
868 }
869 }
870
871 /* test all Latin-1 graphic characters */
872 for(c=0x20; c<=0xff; ++c) {
873 if(c==0x7f) {
874 c=0xa0;
875 } else if(c==0xad) {
876 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
877 ++c;
878 }
879 if(!u_isprint(c)) {
880 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
881 }
882 }
883 }
884
885 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
886 static void TestIdentifier()
887 {
888 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
889 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
890 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
891 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
892 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
893 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
894 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
895 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
896 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
897 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
898
899 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
900 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
901 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
902 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
903
904 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
905 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
906 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
907 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
908
909 /* IDPart should imply IDStart */
910 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
911 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
912
913 testSampleCharProps(u_isIDStart, "u_isIDStart",
914 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
915 testSampleCharProps(u_isIDStart, "u_isIDStart",
916 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
917
918 testSampleCharProps(u_isIDPart, "u_isIDPart",
919 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
920 testSampleCharProps(u_isIDPart, "u_isIDPart",
921 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
922
923 /* IDPart should imply IDStart */
924 testSampleCharProps(u_isIDPart, "u_isIDPart",
925 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
926
927 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
928 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
929 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
930 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
931 }
932
933 /* for each line of UnicodeData.txt, check some of the properties */
934 typedef struct UnicodeDataContext {
935 #if UCONFIG_NO_NORMALIZATION
936 const void *dummy;
937 #else
938 const UNormalizer2 *nfc;
939 const UNormalizer2 *nfkc;
940 #endif
941 } UnicodeDataContext;
942
943 /*
944 * ### TODO
945 * This test fails incorrectly if the First or Last code point of a repetitive area
946 * is overridden, which is allowed and is encouraged for the PUAs.
947 * Currently, this means that both area First/Last and override lines are
948 * tested against the properties from the API,
949 * and the area boundary will not match and cause an error.
950 *
951 * This function should detect area boundaries and skip them for the test of individual
952 * code points' properties.
953 * Then it should check that the areas contain all the same properties except where overridden.
954 * For this, it would have had to set a flag for which code points were listed explicitly.
955 */
956 static void U_CALLCONV
957 unicodeDataLineFn(void *context,
958 char *fields[][2], int32_t fieldCount,
959 UErrorCode *pErrorCode)
960 {
961 char buffer[100];
962 const char *d;
963 char *end;
964 uint32_t value;
965 UChar32 c;
966 int32_t i;
967 int8_t type;
968 int32_t dt;
969 UChar dm[32], s[32];
970 int32_t dmLength, length;
971
972 #if !UCONFIG_NO_NORMALIZATION
973 const UNormalizer2 *nfc, *nfkc;
974 #endif
975
976 /* get the character code, field 0 */
977 c=strtoul(fields[0][0], &end, 16);
978 if(end<=fields[0][0] || end!=fields[0][1]) {
979 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
980 return;
981 }
982 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
983 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
984 return;
985 }
986
987 /* get general category, field 2 */
988 *fields[2][1]=0;
989 type = (int8_t)tagValues[MakeProp(fields[2][0])];
990 if(u_charType(c)!=type) {
991 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
992 }
993 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
994 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
995 }
996
997 /* get canonical combining class, field 3 */
998 value=strtoul(fields[3][0], &end, 10);
999 if(end<=fields[3][0] || end!=fields[3][1]) {
1000 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1001 return;
1002 }
1003 if(value>255) {
1004 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1005 return;
1006 }
1007 #if !UCONFIG_NO_NORMALIZATION
1008 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1009 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1010 }
1011 nfkc=((UnicodeDataContext *)context)->nfkc;
1012 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1013 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1014 }
1015 #endif
1016
1017 /* get BiDi category, field 4 */
1018 *fields[4][1]=0;
1019 i=MakeDir(fields[4][0]);
1020 #if U_ICU_VERSION_MAJOR_NUM!=59
1021 // TODO: Remove this version check, see ticket #13061.
1022 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1023 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1024 }
1025 #endif
1026
1027 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1028 d=NULL;
1029 if(fields[5][0]==fields[5][1]) {
1030 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1031 if(c==0xac00 || c==0xd7a3) {
1032 dt=U_DT_CANONICAL;
1033 } else {
1034 dt=U_DT_NONE;
1035 }
1036 } else {
1037 d=fields[5][0];
1038 *fields[5][1]=0;
1039 dt=UCHAR_INVALID_CODE;
1040 if(*d=='<') {
1041 end=strchr(++d, '>');
1042 if(end!=NULL) {
1043 *end=0;
1044 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1045 d=u_skipWhitespace(end+1);
1046 }
1047 } else {
1048 dt=U_DT_CANONICAL;
1049 }
1050 }
1051 if(dt>U_DT_NONE) {
1052 if(c==0xac00) {
1053 dm[0]=0x1100;
1054 dm[1]=0x1161;
1055 dm[2]=0;
1056 dmLength=2;
1057 } else if(c==0xd7a3) {
1058 dm[0]=0xd788;
1059 dm[1]=0x11c2;
1060 dm[2]=0;
1061 dmLength=2;
1062 } else {
1063 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1064 }
1065 } else {
1066 dmLength=-1;
1067 }
1068 if(dt<0 || U_FAILURE(*pErrorCode)) {
1069 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1070 return;
1071 }
1072 #if !UCONFIG_NO_NORMALIZATION
1073 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1074 if(i!=dt) {
1075 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1076 }
1077 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1078 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1079 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1080 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1081 "or the Decomposition_Mapping is different (%s)\n",
1082 c, length, dmLength, u_errorName(*pErrorCode));
1083 return;
1084 }
1085 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1086 if(dt!=U_DT_CANONICAL) {
1087 dmLength=-1;
1088 }
1089 nfc=((UnicodeDataContext *)context)->nfc;
1090 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1091 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1092 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1093 "or the Decomposition_Mapping is different (%s)\n",
1094 c, length, dmLength, u_errorName(*pErrorCode));
1095 return;
1096 }
1097 /* recompose */
1098 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1099 UChar32 a, b, composite;
1100 i=0;
1101 U16_NEXT(dm, i, dmLength, a);
1102 U16_NEXT(dm, i, dmLength, b);
1103 /* i==dmLength */
1104 composite=unorm2_composePair(nfc, a, b);
1105 if(composite!=c) {
1106 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1107 (long)c, (long)a, (long)b, (long)composite);
1108 }
1109 /*
1110 * Note: NFKC has fewer round-trip mappings than NFC,
1111 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1112 */
1113 }
1114 #endif
1115
1116 /* get ISO Comment, field 11 */
1117 *fields[11][1]=0;
1118 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1119 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1120 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1121 c, u_errorName(*pErrorCode),
1122 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1123 fields[11][0]);
1124 }
1125
1126 /* get uppercase mapping, field 12 */
1127 if(fields[12][0]!=fields[12][1]) {
1128 value=strtoul(fields[12][0], &end, 16);
1129 if(end!=fields[12][1]) {
1130 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1131 return;
1132 }
1133 if((UChar32)value!=u_toupper(c)) {
1134 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1135 }
1136 } else {
1137 /* no case mapping: the API must map the code point to itself */
1138 if(c!=u_toupper(c)) {
1139 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1140 }
1141 }
1142
1143 /* get lowercase mapping, field 13 */
1144 if(fields[13][0]!=fields[13][1]) {
1145 value=strtoul(fields[13][0], &end, 16);
1146 if(end!=fields[13][1]) {
1147 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1148 return;
1149 }
1150 if((UChar32)value!=u_tolower(c)) {
1151 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1152 }
1153 } else {
1154 /* no case mapping: the API must map the code point to itself */
1155 if(c!=u_tolower(c)) {
1156 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1157 }
1158 }
1159
1160 /* get titlecase mapping, field 14 */
1161 if(fields[14][0]!=fields[14][1]) {
1162 value=strtoul(fields[14][0], &end, 16);
1163 if(end!=fields[14][1]) {
1164 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1165 return;
1166 }
1167 if((UChar32)value!=u_totitle(c)) {
1168 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1169 }
1170 } else {
1171 /* no case mapping: the API must map the code point to itself */
1172 if(c!=u_totitle(c)) {
1173 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1174 }
1175 }
1176 }
1177
1178 static UBool U_CALLCONV
1179 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1180 static const UChar32 test[][2]={
1181 {0x41, U_UPPERCASE_LETTER},
1182 {0x308, U_NON_SPACING_MARK},
1183 {0xfffe, U_GENERAL_OTHER_TYPES},
1184 {0xe0041, U_FORMAT_CHAR},
1185 {0xeffff, U_UNASSIGNED}
1186 };
1187
1188 int32_t i, count;
1189
1190 if(0!=strcmp((const char *)context, "a1")) {
1191 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1192 return FALSE;
1193 }
1194
1195 count=UPRV_LENGTHOF(test);
1196 for(i=0; i<count; ++i) {
1197 if(start<=test[i][0] && test[i][0]<limit) {
1198 if(type!=(UCharCategory)test[i][1]) {
1199 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1200 start, limit, (long)type, test[i][0], test[i][1]);
1201 }
1202 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1203 return i==(count-1) ? FALSE : TRUE;
1204 }
1205 }
1206
1207 if(start>test[count-1][0]) {
1208 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1209 start, limit, (long)type);
1210 return FALSE;
1211 }
1212
1213 return TRUE;
1214 }
1215
1216 static UBool U_CALLCONV
1217 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1218 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1219 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1220 { 0x0590, U_LEFT_TO_RIGHT },
1221 { 0x0600, U_RIGHT_TO_LEFT },
1222 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1223 { 0x08A0, U_RIGHT_TO_LEFT },
1224 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1225 { 0x20A0, U_LEFT_TO_RIGHT },
1226 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1227 { 0xFB1D, U_LEFT_TO_RIGHT },
1228 { 0xFB50, U_RIGHT_TO_LEFT },
1229 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1230 { 0xFE70, U_LEFT_TO_RIGHT },
1231 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1232 { 0x10800, U_LEFT_TO_RIGHT },
1233 { 0x11000, U_RIGHT_TO_LEFT },
1234 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1235 { 0x1EE00, U_RIGHT_TO_LEFT },
1236 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1237 { 0x1F000, U_RIGHT_TO_LEFT },
1238 { 0x110000, U_LEFT_TO_RIGHT }
1239 };
1240
1241 UChar32 c;
1242 int32_t i;
1243 UCharDirection shouldBeDir;
1244
1245 /*
1246 * LineBreak.txt specifies:
1247 * # - Assigned characters that are not listed explicitly are given the value
1248 * # "AL".
1249 * # - Unassigned characters are given the value "XX".
1250 *
1251 * PUA characters are listed explicitly with "XX".
1252 * Verify that no assigned character has "XX".
1253 */
1254 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1255 c=start;
1256 while(c<limit) {
1257 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1258 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1259 }
1260 ++c;
1261 }
1262 }
1263
1264 /*
1265 * Verify default Bidi classes.
1266 * See DerivedBidiClass.txt, especially for unassigned code points.
1267 */
1268 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1269 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1270 c=start;
1271 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1272 if((int32_t)c<defaultBidi[i][0]) {
1273 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1274 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1275 shouldBeDir=U_BOUNDARY_NEUTRAL;
1276 } else {
1277 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1278 }
1279
1280 #if U_ICU_VERSION_MAJOR_NUM!=59
1281 // TODO: Remove this version check, see ticket #13061.
1282 if( u_charDirection(c)!=shouldBeDir ||
1283 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1284 ) {
1285 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1286 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1287 }
1288 #endif
1289 ++c;
1290 }
1291 }
1292 }
1293 }
1294
1295 return TRUE;
1296 }
1297
1298 /* tests for several properties */
1299 static void TestUnicodeData()
1300 {
1301 UVersionInfo expectVersionArray;
1302 UVersionInfo versionArray;
1303 char *fields[15][2];
1304 UErrorCode errorCode;
1305 UChar32 c;
1306 int8_t type;
1307
1308 UnicodeDataContext context;
1309
1310 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1311 u_getUnicodeVersion(versionArray);
1312 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1313 {
1314 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1315 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1316 }
1317
1318 #if defined(ICU_UNICODE_VERSION)
1319 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1320 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1321 {
1322 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1323 }
1324 #endif
1325
1326 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1327 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1328 }
1329
1330 errorCode=U_ZERO_ERROR;
1331 #if !UCONFIG_NO_NORMALIZATION
1332 context.nfc=unorm2_getNFCInstance(&errorCode);
1333 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1334 if(U_FAILURE(errorCode)) {
1335 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1336 return;
1337 }
1338 #endif
1339 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1340 if(U_FAILURE(errorCode)) {
1341 return; /* if we couldn't parse UnicodeData.txt, we should return */
1342 }
1343
1344 /* sanity check on repeated properties */
1345 for(c=0xfffe; c<=0x10ffff;) {
1346 type=u_charType(c);
1347 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1348 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1349 }
1350 if(type!=U_UNASSIGNED) {
1351 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1352 }
1353 if((c&0xffff)==0xfffe) {
1354 ++c;
1355 } else {
1356 c+=0xffff;
1357 }
1358 }
1359
1360 /* test that PUA is not "unassigned" */
1361 for(c=0xe000; c<=0x10fffd;) {
1362 type=u_charType(c);
1363 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1364 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1365 }
1366 if(type==U_UNASSIGNED) {
1367 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1368 } else if(type!=U_PRIVATE_USE_CHAR) {
1369 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1370 }
1371 if(c==0xf8ff) {
1372 c=0xf0000;
1373 } else if(c==0xffffd) {
1374 c=0x100000;
1375 } else {
1376 ++c;
1377 }
1378 }
1379
1380 /* test u_enumCharTypes() */
1381 u_enumCharTypes(enumTypeRange, "a1");
1382
1383 /* check default properties */
1384 u_enumCharTypes(enumDefaultsRange, NULL);
1385 }
1386
1387 static void TestCodeUnit(){
1388 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1389
1390 int32_t i;
1391
1392 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1393 UChar c=codeunit[i];
1394 if(i<4){
1395 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1396 log_err("ERROR: U+%04x is a single", c);
1397 }
1398
1399 }
1400 if(i >= 4 && i< 8){
1401 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1402 log_err("ERROR: U+%04x is a first surrogate", c);
1403 }
1404 }
1405 if(i >= 8 && i< 12){
1406 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1407 log_err("ERROR: U+%04x is a second surrogate", c);
1408 }
1409 }
1410 }
1411
1412 }
1413
1414 static void TestCodePoint(){
1415 const UChar32 codePoint[]={
1416 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1417 0xd800,
1418 0xdbff,
1419 0xdc00,
1420 0xdfff,
1421 0xdc04,
1422 0xd821,
1423 /*not a surrogate, valid, isUnicodeChar , not Error*/
1424 0x20ac,
1425 0xd7ff,
1426 0xe000,
1427 0xe123,
1428 0x0061,
1429 0xe065,
1430 0x20402,
1431 0x24506,
1432 0x23456,
1433 0x20402,
1434 0x10402,
1435 0x23456,
1436 /*not a surrogate, not valid, isUnicodeChar, isError */
1437 0x0015,
1438 0x009f,
1439 /*not a surrogate, not valid, not isUnicodeChar, isError */
1440 0xffff,
1441 0xfffe,
1442 };
1443 int32_t i;
1444 for(i=0; i<UPRV_LENGTHOF(codePoint); i++){
1445 UChar32 c=codePoint[i];
1446 if(i<6){
1447 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1448 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1449 }
1450 if(UTF_IS_VALID(c)){
1451 log_err("ERROR: isValid() failed for U+%04x\n", c);
1452 }
1453 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1454 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1455 }
1456 if(UTF_IS_ERROR(c)){
1457 log_err("ERROR: isError() failed for U+%04x\n", c);
1458 }
1459 }else if(i >=6 && i<18){
1460 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1461 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1462 }
1463 if(!UTF_IS_VALID(c)){
1464 log_err("ERROR: isValid() failed for U+%04x\n", c);
1465 }
1466 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1467 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1468 }
1469 if(UTF_IS_ERROR(c)){
1470 log_err("ERROR: isError() failed for U+%04x\n", c);
1471 }
1472 }else if(i >=18 && i<20){
1473 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1474 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1475 }
1476 if(UTF_IS_VALID(c)){
1477 log_err("ERROR: isValid() failed for U+%04x\n", c);
1478 }
1479 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1480 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1481 }
1482 if(!UTF_IS_ERROR(c)){
1483 log_err("ERROR: isError() failed for U+%04x\n", c);
1484 }
1485 }
1486 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1487 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1488 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1489 }
1490 if(UTF_IS_VALID(c)){
1491 log_err("ERROR: isValid() failed for U+%04x\n", c);
1492 }
1493 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1494 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1495 }
1496 if(!UTF_IS_ERROR(c)){
1497 log_err("ERROR: isError() failed for U+%04x\n", c);
1498 }
1499 }
1500 }
1501
1502 if(
1503 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1504 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1505 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1506 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1507 ) {
1508 log_err("error with U_IS_BMP()\n");
1509 }
1510
1511 if(
1512 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1513 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1514 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1515 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1516 ) {
1517 log_err("error with U_IS_SUPPLEMENTARY()\n");
1518 }
1519 }
1520
1521 static void TestCharLength()
1522 {
1523 const int32_t codepoint[]={
1524 1, 0x0061,
1525 1, 0xe065,
1526 1, 0x20ac,
1527 2, 0x20402,
1528 2, 0x23456,
1529 2, 0x24506,
1530 2, 0x20402,
1531 2, 0x10402,
1532 1, 0xd7ff,
1533 1, 0xe000
1534 };
1535
1536 int32_t i;
1537 UBool multiple;
1538 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1539 UChar32 c=codepoint[i+1];
1540 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1541 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1542 }
1543 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1544 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1545 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1546 }
1547 }
1548 }
1549
1550 /*internal functions ----*/
1551 static int32_t MakeProp(char* str)
1552 {
1553 int32_t result = 0;
1554 char* matchPosition =0;
1555
1556 matchPosition = strstr(tagStrings, str);
1557 if (matchPosition == 0)
1558 {
1559 log_err("unrecognized type letter ");
1560 log_err(str);
1561 }
1562 else
1563 result = (int32_t)((matchPosition - tagStrings) / 2);
1564 return result;
1565 }
1566
1567 static int32_t MakeDir(char* str)
1568 {
1569 int32_t pos = 0;
1570 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1571 if (strcmp(str, dirStrings[pos]) == 0) {
1572 return pos;
1573 }
1574 }
1575 return -1;
1576 }
1577
1578 /* test u_charName() -------------------------------------------------------- */
1579
/*
 * Sample code points with their expected names, used by the character name tests.
 * name:    modern Unicode name ("" if none)
 * oldName: Unicode 1.0 name ("" if none)
 * extName: extended name
 * alias:   character name alias, or NULL if there is none
 */
static const struct {
    uint32_t code;
    const char *name, *oldName, *extName, *alias;
} names[]={
    {0x0061, "LATIN SMALL LETTER A", "",
             "LATIN SMALL LETTER A",
             NULL},
    {0x01a2, "LATIN CAPITAL LETTER OI", "",
             "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER GHA"},
    {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
             "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
             NULL},
    {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
             "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
             "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "",
             "CJK UNIFIED IDEOGRAPH-3401",
             NULL},
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "",
             "CJK UNIFIED IDEOGRAPH-7FED",
             NULL},
    {0xac00, "HANGUL SYLLABLE GA", "",
             "HANGUL SYLLABLE GA",
             NULL},
    {0xd7a3, "HANGUL SYLLABLE HIH", "",
             "HANGUL SYLLABLE HIH",
             NULL},
    {0xd800, "", "",
             "<lead surrogate-D800>",
             NULL},
    {0xdc00, "", "",
             "<trail surrogate-DC00>",
             NULL},
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "",
             "FULLWIDTH LEFT PARENTHESIS",
             NULL},
    {0xffe5, "FULLWIDTH YEN SIGN", "",
             "FULLWIDTH YEN SIGN",
             NULL},
    {0xffff, "", "",
             "<noncharacter-FFFF>",
             NULL},
    {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
              "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
              "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "",
              "CJK UNIFIED IDEOGRAPH-23456",
              NULL}
};
1607
1608 static UBool
1609 enumCharNamesFn(void *context,
1610 UChar32 code, UCharNameChoice nameChoice,
1611 const char *name, int32_t length) {
1612 int32_t *pCount=(int32_t *)context;
1613 const char *expected;
1614 int i;
1615
1616 if(length<=0 || length!=(int32_t)strlen(name)) {
1617 /* should not be called with an empty string or invalid length */
1618 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1619 return TRUE;
1620 }
1621
1622 ++*pCount;
1623 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1624 if(code==(UChar32)names[i].code) {
1625 switch (nameChoice) {
1626 case U_EXTENDED_CHAR_NAME:
1627 if(0!=strcmp(name, names[i].extName)) {
1628 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1629 }
1630 break;
1631 case U_UNICODE_CHAR_NAME:
1632 if(0!=strcmp(name, names[i].name)) {
1633 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1634 }
1635 break;
1636 case U_UNICODE_10_CHAR_NAME:
1637 expected=names[i].oldName;
1638 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1639 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1640 }
1641 break;
1642 case U_CHAR_NAME_ALIAS:
1643 expected=names[i].alias;
1644 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1645 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1646 }
1647 break;
1648 case U_CHAR_NAME_CHOICE_COUNT:
1649 break;
1650 }
1651 break;
1652 }
1653 }
1654 return TRUE;
1655 }
1656
/* State shared between TestCharNames() and enumExtCharNamesFn(). */
struct enumExtCharNamesContext {
    /* number of names counted so far; its address is handed to enumCharNamesFn() as the counter */
    uint32_t length;
    /* code point seen in the previous callback; TestCharNames() starts it at -1 */
    int32_t last;
};
1661
1662 static UBool
1663 enumExtCharNamesFn(void *context,
1664 UChar32 code, UCharNameChoice nameChoice,
1665 const char *name, int32_t length) {
1666 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1667
1668 if (ecncp->last != (int32_t) code - 1) {
1669 if (ecncp->last < 0) {
1670 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1671 } else {
1672 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1673 }
1674 }
1675 ecncp->last = (int32_t) code;
1676
1677 if (!*name) {
1678 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1679 }
1680
1681 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1682 }
1683
1684 /**
1685 * This can be made more efficient by moving it into putil.c and having
1686 * it directly access the ebcdic translation tables.
1687 * TODO: If we get this method in putil.c, then delete it from here.
1688 */
1689 static UChar
1690 u_charToUChar(char c) {
1691 UChar uc;
1692 u_charsToUChars(&c, &uc, 1);
1693 return uc;
1694 }
1695
1696 static void
1697 TestCharNames() {
1698 static char name[80];
1699 UErrorCode errorCode=U_ZERO_ERROR;
1700 struct enumExtCharNamesContext extContext;
1701 const char *expected;
1702 int32_t length;
1703 UChar32 c;
1704 int32_t i;
1705
1706 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1707 length=uprv_getMaxCharNameLength();
1708 if(length==0) {
1709 /* no names data available */
1710 return;
1711 }
1712 if(length<83) { /* Unicode 3.2 max char name length */
1713 log_err("uprv_getMaxCharNameLength()=%d is too short");
1714 }
1715 /* ### TODO same tests for max ISO comment length as for max name length */
1716
1717 log_verbose("Testing u_charName()\n");
1718 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1719 /* modern Unicode character name */
1720 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1721 if(U_FAILURE(errorCode)) {
1722 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1723 return;
1724 }
1725 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1726 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1727 }
1728
1729 /* find the modern name */
1730 if (*names[i].name) {
1731 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1732 if(U_FAILURE(errorCode)) {
1733 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1734 return;
1735 }
1736 if(c!=(UChar32)names[i].code) {
1737 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1738 }
1739 }
1740
1741 /* Unicode 1.0 character name */
1742 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1743 if(U_FAILURE(errorCode)) {
1744 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1745 return;
1746 }
1747 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1748 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1749 }
1750
1751 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1752 if(names[i].oldName[0]!=0 /* && length>0 */) {
1753 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1754 if(U_FAILURE(errorCode)) {
1755 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1756 return;
1757 }
1758 if(c!=(UChar32)names[i].code) {
1759 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1760 }
1761 }
1762
1763 /* Unicode character name alias */
1764 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1765 if(U_FAILURE(errorCode)) {
1766 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1767 return;
1768 }
1769 expected=names[i].alias;
1770 if(expected==NULL) {
1771 expected="";
1772 }
1773 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1774 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1775 names[i].code, name, length, expected);
1776 }
1777
1778 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1779 if(expected[0]!=0 /* && length>0 */) {
1780 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1781 if(U_FAILURE(errorCode)) {
1782 log_err("u_charFromName(%s - alias) error %s\n",
1783 expected, u_errorName(errorCode));
1784 return;
1785 }
1786 if(c!=(UChar32)names[i].code) {
1787 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1788 expected, c, names[i].code);
1789 }
1790 }
1791 }
1792
1793 /* test u_enumCharNames() */
1794 length=0;
1795 errorCode=U_ZERO_ERROR;
1796 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1797 if(U_FAILURE(errorCode) || length<94140) {
1798 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1799 }
1800
1801 extContext.length = 0;
1802 extContext.last = -1;
1803 errorCode=U_ZERO_ERROR;
1804 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1805 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1806 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1807 }
1808
1809 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1810 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1811 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1812 }
1813
1814 /* Test getCharNameCharacters */
1815 if(!getTestOption(QUICK_OPTION)) {
1816 enum { BUFSIZE = 256 };
1817 UErrorCode ec = U_ZERO_ERROR;
1818 char buf[BUFSIZE];
1819 int32_t maxLength;
1820 UChar32 cp;
1821 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1822 int32_t l1, l2;
1823 UBool map[256];
1824 UBool ok;
1825
1826 USet* set = uset_open(1, 0); /* empty set */
1827 USet* dumb = uset_open(1, 0); /* empty set */
1828
1829 /*
1830 * uprv_getCharNameCharacters() will likely return more lowercase
1831 * letters than actual character names contain because
1832 * it includes all the characters in lowercased names of
1833 * general categories, for the full possible set of extended names.
1834 */
1835 {
1836 USetAdder sa={
1837 NULL,
1838 uset_add,
1839 uset_addRange,
1840 uset_addString,
1841 NULL /* don't need remove() */
1842 };
1843 sa.set=set;
1844 uprv_getCharNameCharacters(&sa);
1845 }
1846
1847 /* build set the dumb (but sure-fire) way */
1848 for (i=0; i<256; ++i) {
1849 map[i] = FALSE;
1850 }
1851
1852 maxLength=0;
1853 for (cp=0; cp<0x110000; ++cp) {
1854 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1855 buf, BUFSIZE, &ec);
1856 if (U_FAILURE(ec)) {
1857 log_err("FAIL: u_charName failed when it shouldn't\n");
1858 uset_close(set);
1859 uset_close(dumb);
1860 return;
1861 }
1862 if(len>maxLength) {
1863 maxLength=len;
1864 }
1865
1866 for (i=0; i<len; ++i) {
1867 if (!map[(uint8_t) buf[i]]) {
1868 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1869 map[(uint8_t) buf[i]] = TRUE;
1870 }
1871 }
1872
1873 /* test for leading/trailing whitespace */
1874 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1875 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1876 }
1877 }
1878
1879 if(map[(uint8_t)'\t']) {
1880 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1881 }
1882
1883 length=uprv_getMaxCharNameLength();
1884 if(length!=maxLength) {
1885 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1886 length, maxLength);
1887 }
1888
1889 /* compare the sets. Where is my uset_equals?!! */
1890 ok=TRUE;
1891 for(i=0; i<256; ++i) {
1892 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1893 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1894 /* ignore lowercase a-z that are in set but not in dumb */
1895 ok=TRUE;
1896 } else {
1897 ok=FALSE;
1898 break;
1899 }
1900 }
1901 }
1902
1903 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1904 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1905 if (U_FAILURE(ec)) {
1906 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1907 uset_close(set);
1908 uset_close(dumb);
1909 return;
1910 }
1911
1912 if (l1 >= BUFSIZE) {
1913 l1 = BUFSIZE-1;
1914 pat[l1] = 0;
1915 }
1916 if (l2 >= BUFSIZE) {
1917 l2 = BUFSIZE-1;
1918 dumbPat[l2] = 0;
1919 }
1920
1921 if (!ok) {
1922 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1923 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1924 } else if(getTestOption(VERBOSITY_OPTION)) {
1925 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1926 }
1927
1928 uset_close(set);
1929 uset_close(dumb);
1930 }
1931
1932 /* ### TODO: test error cases and other interesting things */
1933 }
1934
1935 static void
1936 TestUCharFromNameUnderflow() {
1937 // Ticket #10889: Underflow crash when there is no dash.
1938 UErrorCode errorCode=U_ZERO_ERROR;
1939 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<NO BREAK SPACE>", &errorCode);
1940 if(U_SUCCESS(errorCode)) {
1941 log_err("u_charFromName(<NO BREAK SPACE>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1942 }
1943
1944 // Test related edge cases.
1945 errorCode=U_ZERO_ERROR;
1946 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<-00a0>", &errorCode);
1947 if(U_SUCCESS(errorCode)) {
1948 log_err("u_charFromName(<-00a0>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1949 }
1950
1951 errorCode=U_ZERO_ERROR;
1952 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control->", &errorCode);
1953 if(U_SUCCESS(errorCode)) {
1954 log_err("u_charFromName(<control->) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1955 }
1956
1957 errorCode=U_ZERO_ERROR;
1958 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control-111111>", &errorCode);
1959 if(U_SUCCESS(errorCode)) {
1960 log_err("u_charFromName(<control-111111>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1961 }
1962 }
1963
1964 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
1965
1966 static void
1967 TestMirroring() {
1968 USet *set;
1969 UErrorCode errorCode;
1970
1971 UChar32 start, end, c2, c3;
1972 int32_t i;
1973
1974 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1975
1976 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1977
1978 log_verbose("Testing u_isMirrored()\n");
1979 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1980 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1981 )
1982 ) {
1983 log_err("u_isMirrored() does not work correctly\n");
1984 }
1985
1986 log_verbose("Testing u_charMirror()\n");
1987 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
1988 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
1989 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
1990 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
1991 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
1992 )
1993 ) {
1994 log_err("u_charMirror() does not work correctly\n");
1995 }
1996
1997 /* verify that Bidi_Mirroring_Glyph roundtrips */
1998 errorCode=U_ZERO_ERROR;
1999 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2000
2001 if (U_FAILURE(errorCode)) {
2002 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2003 } else {
2004 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2005 do {
2006 c2=u_charMirror(start);
2007 c3=u_charMirror(c2);
2008 if(c3!=start) {
2009 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2010 }
2011 c3=u_getBidiPairedBracket(start);
2012 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2013 if(c3!=start) {
2014 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2015 (long)start);
2016 }
2017 } else {
2018 if(c3!=c2) {
2019 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2020 (long)start, (long)c2);
2021 }
2022 }
2023 } while(++start<=end);
2024 }
2025 }
2026
2027 uset_close(set);
2028 }
2029
2030
2031 struct RunTestData
2032 {
2033 const char *runText;
2034 UScriptCode runCode;
2035 };
2036
2037 typedef struct RunTestData RunTestData;
2038
2039 static void
2040 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2041 const char *prefix)
2042 {
2043 int32_t run, runStart, runLimit;
2044 UScriptCode runCode;
2045
2046 /* iterate over all the runs */
2047 run = 0;
2048 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2049 if (runStart != runStarts[run]) {
2050 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2051 prefix, run, runStarts[run], runStart);
2052 }
2053
2054 if (runLimit != runStarts[run + 1]) {
2055 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2056 prefix, run, runStarts[run + 1], runLimit);
2057 }
2058
2059 if (runCode != testData[run].runCode) {
2060 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2061 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2062 }
2063
2064 run += 1;
2065
2066 /* stop when we've seen all the runs we expect to see */
2067 if (run >= nRuns) {
2068 break;
2069 }
2070 }
2071
2072 /* Complain if we didn't see then number of runs we expected */
2073 if (run != nRuns) {
2074 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2075 }
2076 }
2077
2078 static void
2079 TestUScriptRunAPI()
2080 {
2081 static const RunTestData testData1[] = {
2082 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2083 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2084 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2085 {"English (", USCRIPT_LATIN},
2086 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2087 {") ", USCRIPT_LATIN},
2088 {"\\u6F22\\u5B75", USCRIPT_HAN},
2089 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2090 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2091 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2092 };
2093
2094 static const RunTestData testData2[] = {
2095 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2096 };
2097
2098 static const struct {
2099 const RunTestData *testData;
2100 int32_t nRuns;
2101 } testDataEntries[] = {
2102 {testData1, UPRV_LENGTHOF(testData1)},
2103 {testData2, UPRV_LENGTHOF(testData2)}
2104 };
2105
2106 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2107 int32_t testEntry;
2108
2109 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2110 UChar testString[1024];
2111 int32_t runStarts[256];
2112 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2113 const RunTestData *testData = testDataEntries[testEntry].testData;
2114
2115 int32_t run, stringLimit;
2116 UScriptRun *scriptRun = NULL;
2117 UErrorCode err;
2118
2119 /*
2120 * Fill in the test string and the runStarts array.
2121 */
2122 stringLimit = 0;
2123 for (run = 0; run < nTestRuns; run += 1) {
2124 runStarts[run] = stringLimit;
2125 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2126 /*stringLimit -= 1;*/
2127 }
2128
2129 /* The limit of the last run */
2130 runStarts[nTestRuns] = stringLimit;
2131
2132 /*
2133 * Make sure that calling uscript_OpenRun with a NULL text pointer
2134 * and a non-zero text length returns the correct error.
2135 */
2136 err = U_ZERO_ERROR;
2137 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2138
2139 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2140 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2141 }
2142
2143 if (scriptRun != NULL) {
2144 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2145 uscript_closeRun(scriptRun);
2146 }
2147
2148 /*
2149 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2150 * and a zero text length returns the correct error.
2151 */
2152 err = U_ZERO_ERROR;
2153 scriptRun = uscript_openRun(testString, 0, &err);
2154
2155 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2156 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2157 }
2158
2159 if (scriptRun != NULL) {
2160 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2161 uscript_closeRun(scriptRun);
2162 }
2163
2164 /*
2165 * Make sure that calling uscript_openRun with a NULL text pointer
2166 * and a zero text length doesn't return an error.
2167 */
2168 err = U_ZERO_ERROR;
2169 scriptRun = uscript_openRun(NULL, 0, &err);
2170
2171 if (U_FAILURE(err)) {
2172 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2173 }
2174
2175 /* Make sure that the empty iterator doesn't find any runs */
2176 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2177 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2178 }
2179
2180 /*
2181 * Make sure that calling uscript_setRunText with a NULL text pointer
2182 * and a non-zero text length returns the correct error.
2183 */
2184 err = U_ZERO_ERROR;
2185 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2186
2187 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2188 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2189 }
2190
2191 /*
2192 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2193 * and a zero text length returns the correct error.
2194 */
2195 err = U_ZERO_ERROR;
2196 uscript_setRunText(scriptRun, testString, 0, &err);
2197
2198 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2199 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2200 }
2201
2202 /*
2203 * Now call uscript_setRunText on the empty iterator
2204 * and make sure that it works.
2205 */
2206 err = U_ZERO_ERROR;
2207 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2208
2209 if (U_FAILURE(err)) {
2210 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2211 } else {
2212 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2213 }
2214
2215 uscript_closeRun(scriptRun);
2216
2217 /*
2218 * Now open an interator over the testString
2219 * using uscript_openRun and make sure that it works
2220 */
2221 scriptRun = uscript_openRun(testString, stringLimit, &err);
2222
2223 if (U_FAILURE(err)) {
2224 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2225 } else {
2226 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2227 }
2228
2229 /* Now reset the iterator, and make sure
2230 * that it still works.
2231 */
2232 uscript_resetRun(scriptRun);
2233
2234 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2235
2236 /* Close the iterator */
2237 uscript_closeRun(scriptRun);
2238 }
2239 }
2240
2241 /* test additional, non-core properties */
2242 static void
2243 TestAdditionalProperties() {
2244 /* test data for u_charAge() */
2245 static const struct {
2246 UChar32 c;
2247 UVersionInfo version;
2248 } charAges[]={
2249 {0x41, { 1, 1, 0, 0 }},
2250 {0xffff, { 1, 1, 0, 0 }},
2251 {0x20ab, { 2, 0, 0, 0 }},
2252 {0x2fffe, { 2, 0, 0, 0 }},
2253 {0x20ac, { 2, 1, 0, 0 }},
2254 {0xfb1d, { 3, 0, 0, 0 }},
2255 {0x3f4, { 3, 1, 0, 0 }},
2256 {0x10300, { 3, 1, 0, 0 }},
2257 {0x220, { 3, 2, 0, 0 }},
2258 {0xff60, { 3, 2, 0, 0 }}
2259 };
2260
2261 /* test data for u_hasBinaryProperty() */
2262 static const int32_t
2263 props[][3]={ /* code point, property, value */
2264 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2265 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2266 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2267
2268 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2269 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2270
2271 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2272 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2273
2274 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2275 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2276
2277 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2278 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2279 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2280 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2281 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2282
2283 { 0x058a, UCHAR_DASH, TRUE },
2284 { 0x007e, UCHAR_DASH, FALSE },
2285
2286 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2287 { 0x3000, UCHAR_DIACRITIC, FALSE },
2288
2289 { 0x0e46, UCHAR_EXTENDER, TRUE },
2290 { 0x0020, UCHAR_EXTENDER, FALSE },
2291
2292 #if !UCONFIG_NO_NORMALIZATION
2293 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2294 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2295 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2296
2297 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2298 { 0x0308, UCHAR_NFD_INERT, FALSE },
2299
2300 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2301 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2302
2303 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2304 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2305 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2306 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2307 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2308 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2309
2310 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2311 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2312
2313 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2314 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2315 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2316 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2317 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2318 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2319 #endif
2320
2321 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2322 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2323 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2324
2325 { 0x30fb, UCHAR_HYPHEN, TRUE },
2326 { 0xfe58, UCHAR_HYPHEN, FALSE },
2327
2328 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2329 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2330 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2331
2332 { 0x2172, UCHAR_ID_START, TRUE },
2333 { 0x007a, UCHAR_ID_START, TRUE },
2334 { 0x0039, UCHAR_ID_START, FALSE },
2335
2336 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2337 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2338 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2339
2340 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2341 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2342
2343 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2344 { 0x0345, UCHAR_LOWERCASE, TRUE },
2345 { 0x0030, UCHAR_LOWERCASE, FALSE },
2346
2347 { 0x1d7a9, UCHAR_MATH, TRUE },
2348 { 0x2135, UCHAR_MATH, TRUE },
2349 { 0x0062, UCHAR_MATH, FALSE },
2350
2351 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2352 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2353 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2354
2355 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2356 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2357 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2358
2359 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2360 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2361
2362 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2363 { 0x2162, UCHAR_UPPERCASE, TRUE },
2364 { 0x0345, UCHAR_UPPERCASE, FALSE },
2365
2366 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2367 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2368 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2369
2370 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2371 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2372 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2373
2374 { 0x16ee, UCHAR_XID_START, TRUE },
2375 { 0x23456, UCHAR_XID_START, TRUE },
2376 { 0x1d1aa, UCHAR_XID_START, FALSE },
2377
2378 /*
2379 * Version break:
2380 * The following properties are only supported starting with the
2381 * Unicode version indicated in the second field.
2382 */
2383 { -1, 0x320, 0 },
2384
2385 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2386 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2387 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2388
2389 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2390 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2391 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2392 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2393
2394 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2395 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2396 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2397 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2398
2399 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2400 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2401 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2402 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2403
2404 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2405 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2406
2407 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2408 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2409
2410 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2411 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2412
2413 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2414 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2415
2416 { 0x2e9b, UCHAR_RADICAL, TRUE },
2417 { 0x4e00, UCHAR_RADICAL, FALSE },
2418
2419 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2420 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2421
2422 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2423 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2424
2425 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2426
2427 { 0x002e, UCHAR_S_TERM, TRUE },
2428 { 0x0061, UCHAR_S_TERM, FALSE },
2429
2430 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2431 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2432 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2433 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2434
2435 /* enum/integer type properties */
2436
2437 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2438 /* test default Bidi classes for unassigned code points */
2439 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2440 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2441 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2442 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2443 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2444 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2445 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2446 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2447 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2448 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2449 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2450
2451 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2452 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2453 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2454 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2455 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2456 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2457 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2458
2459 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2460 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2461 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2462 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2463 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2464 { 0x1CBF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2465 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2466 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2467 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2468 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2469 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2470
2471 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2472 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2473
2474 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2475 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2476 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2477 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2478 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2479 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2480 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2481 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2482 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2483
2484 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2485 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2486 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2487 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2488 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2489 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2490 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2491 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2492 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2493 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2494 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2495 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2496 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2497 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2498 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2499 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2500 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2501
2502 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2503 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2504 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2505
2506 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2507 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2508 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2509 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2510 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2511
2512 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2513 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2514 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2515 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2516 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2517 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2518 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2519 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2520
2521 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2522 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2523 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2524 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2525 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2526 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2527 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2528 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2529 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2530 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2531 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2532 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2533 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2534 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2535 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2536 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2537
2538 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2539
2540 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2541
2542 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2543 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2544 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2545 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2546 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2547 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2548 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2549
2550 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2551 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2552 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2553 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2554
2555 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2556 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2557 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2558 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2559 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2560 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2561
2562 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2563 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2564 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2565 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2566
2567 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2568 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2569 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2570 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2571 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2572 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2573 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2574
2575 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2576 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2577 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2578 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2579
2580 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2581 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2582 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2583 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2584
2585 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2586 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2587 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2588 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2589 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2590
2591 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2592
2593 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2594
2595 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2596 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2597 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2598
2599 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2600 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2601 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2602 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2603 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2604
2605 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2606 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2607 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2608
2609 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2610 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2611 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2612 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2613
2614 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2615 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2616 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2617 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2618 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2619 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2620
2621 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2622 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2623 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2624 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2625
2626 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2627 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2628 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2629 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2630
2631 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2632 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2633 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2634 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2635
2636 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2637
2638 /* unassigned code points in new default Bidi R blocks */
2639 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2640 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2641
2642 /* test some script codes >127 */
2643 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2644 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2645 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2646
2647 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2648
2649 /* value changed in Unicode 6.0 */
2650 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2651
2652 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2653
2654 /* unassigned code points in new/changed default Bidi AL blocks */
2655 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2656 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2657
2658 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2659
2660 /* unassigned code points in the currency symbols block now default to ET */
2661 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2662 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2663
2664 /* new property in Unicode 6.3 */
2665 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2666 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2667 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2668 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2669 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2670 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2671
2672 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2673
2674 /* new character range with Joining_Group values */
2675 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2676 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2677 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2678 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2679 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2680
2681 /* undefined UProperty values */
2682 { 0x61, 0x4a7, 0 },
2683 { 0x234bc, 0x15ed, 0 }
2684 };
2685
2686 UVersionInfo version;
2687 UChar32 c;
2688 int32_t i, result, uVersion;
2689 UProperty which;
2690
2691 /* what is our Unicode version? */
2692 u_getUnicodeVersion(version);
2693 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2694
2695 u_charAge(0x20, version);
2696 if(version[0]==0) {
2697 /* no additional properties available */
2698 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2699 return;
2700 }
2701
2702 /* test u_charAge() */
2703 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2704 u_charAge(charAges[i].c, version);
2705 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2706 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2707 charAges[i].c,
2708 version[0], version[1], version[2], version[3],
2709 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2710 }
2711 }
2712
2713 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2714 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2715 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2716 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2717 u_getIntPropertyMinValue(0x2345)!=0
2718 ) {
2719 log_err("error: u_getIntPropertyMinValue() wrong\n");
2720 }
2721 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2722 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2723 }
2724 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2725 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2726 }
2727 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2728 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2729 }
2730 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2731 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2732 }
2733 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2734 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2735 }
2736 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2737 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2738 }
2739 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2740 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2741 }
2742 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2743 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2744 }
2745 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2746 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2747 }
2748 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2749 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2750 }
2751 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2752 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2753 }
2754 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2755 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2756 }
2757 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2758 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2759 }
2760 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2761 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2762 }
2763 /*JB#2410*/
2764 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2765 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2766 }
2767 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2768 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2769 }
2770 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2771 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2772 }
2773 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2774 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2775 }
2776 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2777 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2778 }
2779
2780 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2781 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2782 const char *whichName;
2783
2784 if(props[i][0]<0) {
2785 /* Unicode version break */
2786 if(uVersion<props[i][1]) {
2787 break; /* do not test properties that are not yet supported */
2788 } else {
2789 continue; /* skip this row */
2790 }
2791 }
2792
2793 c=(UChar32)props[i][0];
2794 which=(UProperty)props[i][1];
2795 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2796
2797 if(which<UCHAR_INT_START) {
2798 result=u_hasBinaryProperty(c, which);
2799 if(result!=props[i][2]) {
2800 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2801 c, whichName, result, i);
2802 }
2803 }
2804
2805 result=u_getIntPropertyValue(c, which);
2806 if(result!=props[i][2]) {
2807 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2808 c, whichName, result, props[i][2], i);
2809 }
2810
2811 /* test separate functions, too */
2812 switch((UProperty)props[i][1]) {
2813 case UCHAR_ALPHABETIC:
2814 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2815 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2816 props[i][0], result, i);
2817 }
2818 break;
2819 case UCHAR_LOWERCASE:
2820 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2821 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2822 props[i][0], result, i);
2823 }
2824 break;
2825 case UCHAR_UPPERCASE:
2826 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2827 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2828 props[i][0], result, i);
2829 }
2830 break;
2831 case UCHAR_WHITE_SPACE:
2832 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2833 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2834 props[i][0], result, i);
2835 }
2836 break;
2837 default:
2838 break;
2839 }
2840 }
2841 }
2842
2843 static void
2844 TestNumericProperties(void) {
2845 /* see UnicodeData.txt, DerivedNumericValues.txt */
2846 static const struct {
2847 UChar32 c;
2848 int32_t type;
2849 double numValue;
2850 } values[]={
2851 { 0x0F33, U_NT_NUMERIC, -1./2. },
2852 { 0x0C66, U_NT_DECIMAL, 0 },
2853 { 0x96f6, U_NT_NUMERIC, 0 },
2854 { 0xa833, U_NT_NUMERIC, 1./16. },
2855 { 0x2152, U_NT_NUMERIC, 1./10. },
2856 { 0x2151, U_NT_NUMERIC, 1./9. },
2857 { 0x1245f, U_NT_NUMERIC, 1./8. },
2858 { 0x2150, U_NT_NUMERIC, 1./7. },
2859 { 0x2159, U_NT_NUMERIC, 1./6. },
2860 { 0x09f6, U_NT_NUMERIC, 3./16. },
2861 { 0x2155, U_NT_NUMERIC, 1./5. },
2862 { 0x00BD, U_NT_NUMERIC, 1./2. },
2863 { 0x0031, U_NT_DECIMAL, 1. },
2864 { 0x4e00, U_NT_NUMERIC, 1. },
2865 { 0x58f1, U_NT_NUMERIC, 1. },
2866 { 0x10320, U_NT_NUMERIC, 1. },
2867 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2868 { 0x00B2, U_NT_DIGIT, 2. },
2869 { 0x5f10, U_NT_NUMERIC, 2. },
2870 { 0x1813, U_NT_DECIMAL, 3. },
2871 { 0x5f0e, U_NT_NUMERIC, 3. },
2872 { 0x2173, U_NT_NUMERIC, 4. },
2873 { 0x8086, U_NT_NUMERIC, 4. },
2874 { 0x278E, U_NT_DIGIT, 5. },
2875 { 0x1D7F2, U_NT_DECIMAL, 6. },
2876 { 0x247A, U_NT_DIGIT, 7. },
2877 { 0x7396, U_NT_NUMERIC, 9. },
2878 { 0x1372, U_NT_NUMERIC, 10. },
2879 { 0x216B, U_NT_NUMERIC, 12. },
2880 { 0x16EE, U_NT_NUMERIC, 17. },
2881 { 0x249A, U_NT_NUMERIC, 19. },
2882 { 0x303A, U_NT_NUMERIC, 30. },
2883 { 0x5345, U_NT_NUMERIC, 30. },
2884 { 0x32B2, U_NT_NUMERIC, 37. },
2885 { 0x1375, U_NT_NUMERIC, 40. },
2886 { 0x10323, U_NT_NUMERIC, 50. },
2887 { 0x0BF1, U_NT_NUMERIC, 100. },
2888 { 0x964c, U_NT_NUMERIC, 100. },
2889 { 0x217E, U_NT_NUMERIC, 500. },
2890 { 0x2180, U_NT_NUMERIC, 1000. },
2891 { 0x4edf, U_NT_NUMERIC, 1000. },
2892 { 0x2181, U_NT_NUMERIC, 5000. },
2893 { 0x137C, U_NT_NUMERIC, 10000. },
2894 { 0x4e07, U_NT_NUMERIC, 10000. },
2895 { 0x12432, U_NT_NUMERIC, 216000. },
2896 { 0x12433, U_NT_NUMERIC, 432000. },
2897 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2898 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2899 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2900 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2901 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2902 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2903 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2904 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2905 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2906 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2907 };
2908
2909 double nv;
2910 UChar32 c;
2911 int32_t i, type;
2912
2913 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
2914 c=values[i].c;
2915 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2916 nv=u_getNumericValue(c);
2917
2918 if(type!=values[i].type) {
2919 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2920 }
2921 if(0.000001 <= fabs(nv - values[i].numValue)) {
2922 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2923 }
2924 }
2925 }
2926
2927 /**
2928 * Test the property names and property value names API.
2929 */
2930 static void
2931 TestPropertyNames(void) {
2932 int32_t p, v, choice=0, rev;
2933 UBool atLeastSomething = FALSE;
2934
2935 for (p=0; ; ++p) {
2936 UProperty propEnum = (UProperty)p;
2937 UBool sawProp = FALSE;
2938 if(p > 10 && !atLeastSomething) {
2939 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2940 return;
2941 }
2942
2943 for (choice=0; ; ++choice) {
2944 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
2945 if (name) {
2946 if (!sawProp)
2947 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
2948 log_verbose("%d=\"%s\"", choice, name);
2949 sawProp = TRUE;
2950 atLeastSomething = TRUE;
2951
2952 /* test reverse mapping */
2953 rev = u_getPropertyEnum(name);
2954 if (rev != p) {
2955 log_err("Property round-trip failure: %d -> %s -> %d\n",
2956 p, name, rev);
2957 }
2958 }
2959 if (!name && choice>0) break;
2960 }
2961 if (sawProp) {
2962 /* looks like a valid property; check the values */
2963 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2964 int32_t max = 0;
2965 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2966 max = 255;
2967 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2968 /* it's far too slow to iterate all the way up to
2969 the real max, U_GC_P_MASK */
2970 max = U_GC_NL_MASK;
2971 } else if (p == UCHAR_BLOCK) {
2972 /* UBlockCodes, unlike other values, start at 1 */
2973 max = 1;
2974 }
2975 log_verbose("\n");
2976 for (v=-1; ; ++v) {
2977 UBool sawValue = FALSE;
2978 for (choice=0; ; ++choice) {
2979 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
2980 if (vname) {
2981 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2982 log_verbose("%d=\"%s\"", choice, vname);
2983 sawValue = TRUE;
2984
2985 /* test reverse mapping */
2986 rev = u_getPropertyValueEnum(propEnum, vname);
2987 if (rev != v) {
2988 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2989 pname, v, vname, rev);
2990 }
2991 }
2992 if (!vname && choice>0) break;
2993 }
2994 if (sawValue) {
2995 log_verbose("\n");
2996 }
2997 if (!sawValue && v>=max) break;
2998 }
2999 }
3000 if (!sawProp) {
3001 if (p>=UCHAR_STRING_LIMIT) {
3002 break;
3003 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3004 p = UCHAR_STRING_START - 1;
3005 } else if (p>=UCHAR_MASK_LIMIT) {
3006 p = UCHAR_DOUBLE_START - 1;
3007 } else if (p>=UCHAR_INT_LIMIT) {
3008 p = UCHAR_MASK_START - 1;
3009 } else if (p>=UCHAR_BINARY_LIMIT) {
3010 p = UCHAR_INT_START - 1;
3011 }
3012 }
3013 }
3014 }
3015
3016 /**
3017 * Test the property values API. See JB#2410.
3018 */
3019 static void
3020 TestPropertyValues(void) {
3021 int32_t i, p, min, max;
3022 UErrorCode ec;
3023
3024 /* Min should be 0 for everything. */
3025 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3026 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3027 UProperty propEnum = (UProperty)p;
3028 min = u_getIntPropertyMinValue(propEnum);
3029 if (min != 0) {
3030 if (p == UCHAR_BLOCK) {
3031 /* This is okay...for now. See JB#2487.
3032 TODO Update this for JB#2487. */
3033 } else {
3034 const char* name;
3035 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3036 if (name == NULL)
3037 name = "<ERROR>";
3038 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3039 name, min);
3040 }
3041 }
3042 }
3043
3044 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3045 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3046 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3047 }
3048
3049 /* Max should be -1 for invalid properties. */
3050 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3051 if (max != -1) {
3052 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3053 max);
3054 }
3055
3056 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3057 for (i=0; i<2; ++i) {
3058 int32_t script;
3059 const char* desc;
3060 ec = U_ZERO_ERROR;
3061 switch (i) {
3062 case 0:
3063 script = uscript_getScript(-1, &ec);
3064 desc = "uscript_getScript(-1)";
3065 break;
3066 case 1:
3067 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3068 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3069 break;
3070 default:
3071 log_err("Internal test error. Too many scripts\n");
3072 return;
3073 }
3074 /* We don't explicitly test ec. It should be U_FAILURE but it
3075 isn't documented as such. */
3076 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3077 log_err("FAIL: %s = %d, exp. 0\n",
3078 desc, script);
3079 }
3080 }
3081 }
3082
3083 /* various tests for consistency of UCD data and API behavior */
3084 static void
3085 TestConsistency() {
3086 char buffer[300];
3087 USet *set1, *set2, *set3, *set4;
3088 UErrorCode errorCode;
3089
3090 UChar32 start, end;
3091 int32_t i, length;
3092
3093 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3094 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3095 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3096 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3097 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3098
3099 U_STRING_DECL(mathBlocksPattern,
3100 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3101 214);
3102 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3103 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3104 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3105 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3106
3107 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3108 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3109 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3110 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3111 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3112
3113 U_STRING_INIT(mathBlocksPattern,
3114 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3115 214);
3116 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3117 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3118 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3119 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3120
3121 /*
3122 * It used to be that UCD.html and its precursors said
3123 * "Those dashes used to mark connections between pieces of words,
3124 * plus the Katakana middle dot."
3125 *
3126 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3127 * but not from Hyphen.
3128 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3129 * Therefore, do not show errors when testing the Hyphen property.
3130 */
3131 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3132 "known to the UTC and not considered errors.\n");
3133
3134 errorCode=U_ZERO_ERROR;
3135 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3136 set2=uset_openPattern(dashPattern, 8, &errorCode);
3137 if(U_SUCCESS(errorCode)) {
3138 /* remove the Katakana middle dot(s) from set1 */
3139 uset_remove(set1, 0x30fb);
3140 uset_remove(set1, 0xff65); /* halfwidth variant */
3141 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3142 } else {
3143 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3144 }
3145
3146 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3147 set3=uset_openPattern(formatPattern, 6, &errorCode);
3148 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3149 if(U_SUCCESS(errorCode)) {
3150 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3151 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3152 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3153 } else {
3154 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3155 }
3156
3157 uset_close(set1);
3158 uset_close(set2);
3159 uset_close(set3);
3160 uset_close(set4);
3161
3162 /*
3163 * Check that each lowercase character has "small" in its name
3164 * and not "capital".
3165 * There are some such characters, some of which seem odd.
3166 * Use the verbose flag to see these notices.
3167 */
3168 errorCode=U_ZERO_ERROR;
3169 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3170 if(U_SUCCESS(errorCode)) {
3171 for(i=0;; ++i) {
3172 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3173 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3174 break; /* done */
3175 }
3176 if(U_FAILURE(errorCode)) {
3177 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3178 i, u_errorName(errorCode));
3179 break;
3180 }
3181 if(length!=0) {
3182 break; /* done with code points, got a string or -1 */
3183 }
3184
3185 while(start<=end) {
3186 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3187 if(U_FAILURE(errorCode)) {
3188 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3189 errorCode=U_ZERO_ERROR;
3190 }
3191 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3192 strstr(buffer, "SMALL CAPITAL")==NULL
3193 ) {
3194 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3195 }
3196 ++start;
3197 }
3198 }
3199 } else {
3200 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3201 }
3202 uset_close(set1);
3203
3204 /* verify that all assigned characters in Math blocks are exactly Math characters */
3205 errorCode=U_ZERO_ERROR;
3206 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3207 set2=uset_openPattern(mathPattern, 8, &errorCode);
3208 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3209 if(U_SUCCESS(errorCode)) {
3210 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3211 uset_complement(set3); /* assigned characters */
3212 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3213 compareUSets(set1, set2,
3214 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3215 TRUE);
3216 } else {
3217 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3218 }
3219 uset_close(set1);
3220 uset_close(set2);
3221 uset_close(set3);
3222
3223 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3224 errorCode=U_ZERO_ERROR;
3225 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3226 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3227 if(U_SUCCESS(errorCode)) {
3228 compareUSets(set1, set2,
3229 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3230 TRUE);
3231 } else {
3232 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3233 }
3234 uset_close(set1);
3235 uset_close(set2);
3236 }
3237
3238 /*
3239 * Starting with ICU4C 3.4, the core Unicode properties files
3240 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
3241 * are hardcoded in the common DLL and therefore not included
3242 * in the data package any more.
3243 * Tests requiring these files are disabled so that
3244 * we need not jump through hoops (like adding snapshots of these files
3245 * to testdata).
3246 * See Jitterbug 4497. While this macro is nonzero, code guarded by "#if !HARDCODED_DATA_4497" is compiled out.
3247 */
3248 #define HARDCODED_DATA_4497 1
3249
3250 /* API coverage for ubidi_props.c */
3251 static void TestUBiDiProps() {
3252 #if !HARDCODED_DATA_4497
3253 UDataMemory *pData;
3254 UBiDiProps *bdp;
3255 const UBiDiProps *cbdp;
3256 UErrorCode errorCode;
3257
3258 /* coverage for ubidi_openBinary() */
3259 errorCode=U_ZERO_ERROR;
3260 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
3261 if(U_FAILURE(errorCode)) {
3262 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3263 u_errorName(errorCode));
3264 return;
3265 }
3266
3267 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3268 if(U_FAILURE(errorCode)) {
3269 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3270 u_errorName(errorCode));
3271 udata_close(pData);
3272 return;
3273 }
3274
3275 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
3276 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
3277 }
3278
3279 ubidi_closeProps(bdp);
3280 udata_close(pData);
3281
3282 /* coverage for ubidi_getDummy() */
3283 errorCode=U_ZERO_ERROR;
3284 cbdp=ubidi_getDummy(&errorCode);
3285 if(ubidi_getClass(cbdp, 0x20)!=0) {
3286 log_err("ubidi_getClass(dummy, space)!=0\n");
3287 }
3288 #endif
3289 }
3290
3291 /* test case folding, compare return values with CaseFolding.txt ------------ */
3292
/*
 * Bit flags recording which kinds of case folding have already been
 * tested for a character; CF_ALL is the union of the three flags.
 */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3300
3301 static void
3302 testFold(UChar32 c, int which,
3303 UChar32 simple, UChar32 turkic,
3304 const UChar *full, int32_t fullLength,
3305 const UChar *turkicFull, int32_t turkicFullLength) {
3306 UChar s[2], t[32];
3307 UChar32 c2;
3308 int32_t length, length2;
3309
3310 UErrorCode errorCode=U_ZERO_ERROR;
3311
3312 length=0;
3313 U16_APPEND_UNSAFE(s, length, c);
3314
3315 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3316 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3317 }
3318 if((which&CF_FULL)!=0) {
3319 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3320 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3321 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3322 }
3323 }
3324 if((which&CF_TURKIC)!=0) {
3325 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3326 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3327 }
3328
3329 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3330 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3331 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3332 }
3333 }
3334 }
3335
3336 /* test that c case-folds to itself */
3337 static void
3338 testFoldToSelf(UChar32 c, int which) {
3339 UChar s[2];
3340 int32_t length;
3341
3342 length=0;
3343 U16_APPEND_UNSAFE(s, length, c);
3344 testFold(c, which, c, c, s, length, s, length);
3345 }
3346
3347 struct CaseFoldingData {
3348 USet *notSeen;
3349 UChar32 prev, prevSimple;
3350 UChar prevFull[32];
3351 int32_t prevFullLength;
3352 int which;
3353 };
3354 typedef struct CaseFoldingData CaseFoldingData;
3355
3356 static void U_CALLCONV
3357 caseFoldingLineFn(void *context,
3358 char *fields[][2], int32_t fieldCount,
3359 UErrorCode *pErrorCode) {
3360 CaseFoldingData *pData=(CaseFoldingData *)context;
3361 char *end;
3362 UChar full[32];
3363 UChar32 c, prev, simple;
3364 int32_t count;
3365 int which;
3366 char status;
3367
3368 /* get code point */
3369 const char *s=u_skipWhitespace(fields[0][0]);
3370 if(0==strncmp(s, "0000..10FFFF", 12)) {
3371 /*
3372 * Ignore the line
3373 * # @missing: 0000..10FFFF; C; <code point>
3374 * because maps-to-self is already our default, and this line breaks this parser.
3375 */
3376 return;
3377 }
3378 c=(UChar32)strtoul(s, &end, 16);
3379 end=(char *)u_skipWhitespace(end);
3380 if(end<=fields[0][0] || end!=fields[0][1]) {
3381 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3382 *pErrorCode=U_PARSE_ERROR;
3383 return;
3384 }
3385
3386 /* get the status of this mapping */
3387 status=*u_skipWhitespace(fields[1][0]);
3388 if(status!='C' && status!='S' && status!='F' && status!='T') {
3389 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3390 *pErrorCode=U_PARSE_ERROR;
3391 return;
3392 }
3393
3394 /* get the mapping */
3395 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3396 if(U_FAILURE(*pErrorCode)) {
3397 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3398 return;
3399 }
3400
3401 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3402 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3403 simple=c;
3404 }
3405
3406 if(c!=(prev=pData->prev)) {
3407 /*
3408 * Test remaining mappings for the previous code point.
3409 * If a turkic folding was not mentioned, then it should fold the same
3410 * as the regular simple case folding.
3411 */
3412 UChar prevString[2];
3413 int32_t length;
3414
3415 length=0;
3416 U16_APPEND_UNSAFE(prevString, length, prev);
3417 testFold(prev, (~pData->which)&CF_ALL,
3418 prev, pData->prevSimple,
3419 prevString, length,
3420 pData->prevFull, pData->prevFullLength);
3421 pData->prev=pData->prevSimple=c;
3422 length=0;
3423 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3424 pData->prevFullLength=length;
3425 pData->which=0;
3426 }
3427
3428 /*
3429 * Turn the status into a bit set of case foldings to test.
3430 * Remember non-Turkic case foldings as defaults for Turkic mode.
3431 */
3432 switch(status) {
3433 case 'C':
3434 which=CF_SIMPLE|CF_FULL;
3435 pData->prevSimple=simple;
3436 u_memcpy(pData->prevFull, full, count);
3437 pData->prevFullLength=count;
3438 break;
3439 case 'S':
3440 which=CF_SIMPLE;
3441 pData->prevSimple=simple;
3442 break;
3443 case 'F':
3444 which=CF_FULL;
3445 u_memcpy(pData->prevFull, full, count);
3446 pData->prevFullLength=count;
3447 break;
3448 case 'T':
3449 which=CF_TURKIC;
3450 break;
3451 default:
3452 which=0;
3453 break; /* won't happen because of test above */
3454 }
3455
3456 testFold(c, which, simple, simple, full, count, full, count);
3457
3458 /* remember which case foldings of c have been tested */
3459 pData->which|=which;
3460
3461 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3462 uset_remove(pData->notSeen, c);
3463 }
3464
3465 static void
3466 TestCaseFolding() {
3467 CaseFoldingData data={ NULL };
3468 char *fields[3][2];
3469 UErrorCode errorCode;
3470
3471 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3472
3473 errorCode=U_ZERO_ERROR;
3474 /* test BMP & plane 1 - nothing interesting above */
3475 data.notSeen=uset_open(0, 0x1ffff);
3476 data.prevFullLength=1; /* length of full case folding of U+0000 */
3477
3478 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3479 if(U_SUCCESS(errorCode)) {
3480 int32_t i, start, end;
3481
3482 /* add a pseudo-last line to finish testing of the actual last one */
3483 fields[0][0]=lastLine;
3484 fields[0][1]=lastLine+6;
3485 fields[1][0]=lastLine+7;
3486 fields[1][1]=lastLine+9;
3487 fields[2][0]=lastLine+10;
3488 fields[2][1]=lastLine+17;
3489 caseFoldingLineFn(&data, fields, 3, &errorCode);
3490
3491 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3492 for(i=0;
3493 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3494 U_SUCCESS(errorCode);
3495 ++i
3496 ) {
3497 do {
3498 testFoldToSelf(start, CF_ALL);
3499 } while(++start<=end);
3500 }
3501 }
3502
3503 uset_close(data.notSeen);
3504 }