]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/cucdtst.c
ICU-511.35.tar.gz
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1997-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /*******************************************************************************
7 *
8 * File CUCDTST.C
9 *
10 * Modification History:
11 * Name Description
12 * Madhu Katragadda Ported for C API, added tests for string functions
13 ********************************************************************************
14 */
15
16 #include <string.h>
17 #include <math.h>
18 #include <stdlib.h>
19
20 #include "unicode/utypes.h"
21 #include "unicode/uchar.h"
22 #include "unicode/putil.h"
23 #include "unicode/ustring.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unorm2.h"
26
27 #include "cintltst.h"
28 #include "putilimp.h"
29 #include "uparse.h"
30 #include "ucase.h"
31 #include "ubidi_props.h"
32 #include "uprops.h"
33 #include "uset_imp.h"
34 #include "usc_impl.h"
35 #include "udatamem.h" /* for testing ucase_openBinary() */
36 #include "cucdapi.h"
37
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * primary expression in every context (for example after sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
39
40 /* prototypes --------------------------------------------------------------- */
41
42 static void TestUpperLower(void);
43 static void TestLetterNumber(void);
44 static void TestMisc(void);
45 static void TestPOSIX(void);
46 static void TestControlPrint(void);
47 static void TestIdentifier(void);
48 static void TestUnicodeData(void);
49 static void TestCodeUnit(void);
50 static void TestCodePoint(void);
51 static void TestCharLength(void);
52 static void TestCharNames(void);
53 static void TestMirroring(void);
54 static void TestUScriptRunAPI(void);
55 static void TestAdditionalProperties(void);
56 static void TestNumericProperties(void);
57 static void TestPropertyNames(void);
58 static void TestPropertyValues(void);
59 static void TestConsistency(void);
60 static void TestUCase(void);
61 static void TestUBiDiProps(void);
62 static void TestCaseFolding(void);
63
64 /* internal methods used */
65 static int32_t MakeProp(char* str);
66 static int32_t MakeDir(char* str);
67
68 /* helpers ------------------------------------------------------------------ */
69
70 static void
71 parseUCDFile(const char *filename,
72 char *fields[][2], int32_t fieldCount,
73 UParseLineFn *lineFn, void *context,
74 UErrorCode *pErrorCode) {
75 char path[256];
76 char backupPath[256];
77
78 if(U_FAILURE(*pErrorCode)) {
79 return;
80 }
81
82 /* Look inside ICU_DATA first */
83 strcpy(path, u_getDataDirectory());
84 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
85 strcat(path, filename);
86
87 /* As a fallback, try to guess where the source data was located
88 * at the time ICU was built, and look there.
89 */
90 strcpy(backupPath, ctest_dataSrcDir());
91 strcat(backupPath, U_FILE_SEP_STRING);
92 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
93 strcat(backupPath, filename);
94
95 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
96 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
97 *pErrorCode=U_ZERO_ERROR;
98 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
99 }
100 if(U_FAILURE(*pErrorCode)) {
101 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
102 }
103 }
104
105 /* test data ---------------------------------------------------------------- */
106
107 static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;
108 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
109 static const int32_t tagValues[] =
110 {
111 /* Mn */ U_NON_SPACING_MARK,
112 /* Mc */ U_COMBINING_SPACING_MARK,
113 /* Me */ U_ENCLOSING_MARK,
114 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
115 /* Nl */ U_LETTER_NUMBER,
116 /* No */ U_OTHER_NUMBER,
117 /* Zs */ U_SPACE_SEPARATOR,
118 /* Zl */ U_LINE_SEPARATOR,
119 /* Zp */ U_PARAGRAPH_SEPARATOR,
120 /* Cc */ U_CONTROL_CHAR,
121 /* Cf */ U_FORMAT_CHAR,
122 /* Cs */ U_SURROGATE,
123 /* Co */ U_PRIVATE_USE_CHAR,
124 /* Cn */ U_UNASSIGNED,
125 /* Lu */ U_UPPERCASE_LETTER,
126 /* Ll */ U_LOWERCASE_LETTER,
127 /* Lt */ U_TITLECASE_LETTER,
128 /* Lm */ U_MODIFIER_LETTER,
129 /* Lo */ U_OTHER_LETTER,
130 /* Pc */ U_CONNECTOR_PUNCTUATION,
131 /* Pd */ U_DASH_PUNCTUATION,
132 /* Ps */ U_START_PUNCTUATION,
133 /* Pe */ U_END_PUNCTUATION,
134 /* Po */ U_OTHER_PUNCTUATION,
135 /* Sm */ U_MATH_SYMBOL,
136 /* Sc */ U_CURRENCY_SYMBOL,
137 /* Sk */ U_MODIFIER_SYMBOL,
138 /* So */ U_OTHER_SYMBOL,
139 /* Pi */ U_INITIAL_PUNCTUATION,
140 /* Pf */ U_FINAL_PUNCTUATION
141 };
142
143 static const char dirStrings[][5] = {
144 "L",
145 "R",
146 "EN",
147 "ES",
148 "ET",
149 "AN",
150 "CS",
151 "B",
152 "S",
153 "WS",
154 "ON",
155 "LRE",
156 "LRO",
157 "AL",
158 "RLE",
159 "RLO",
160 "PDF",
161 "NSM",
162 "BN"
163 };
164
165 void addUnicodeTest(TestNode** root);
166
167 void addUnicodeTest(TestNode** root)
168 {
169 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
170 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
171 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
172 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
173 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
174 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
175 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
176 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
177 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
178 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
179 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
180 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
181 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
182 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
183 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
184 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
185 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
186 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
187 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
188 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
189 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
190 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
191 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
192 addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase");
193 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps");
194 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
195 }
196
197 /*==================================================== */
198 /* test u_toupper() and u_tolower() */
199 /*==================================================== */
200 static void TestUpperLower()
201 {
202 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
203 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
204 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
205 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
206 int32_t i;
207
208 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
209 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
210
211 /*
212 Checks LetterLike Symbols which were previously a source of confusion
213 [Bertrand A. D. 02/04/98]
214 */
215 for (i=0x2100;i<0x2138;i++)
216 {
217 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
218 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
219 {
220 if (i != (int)u_tolower(i)) /* itself */
221 log_err("Failed case conversion with itself: U+%04x\n", i);
222 if (i != (int)u_toupper(i))
223 log_err("Failed case conversion with itself: U+%04x\n", i);
224 }
225 }
226
227 for(i=0; i < u_strlen(upper); i++){
228 if(u_tolower(upper[i]) != lower[i]){
229 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
230 }
231 }
232
233 log_verbose("testing upper lower\n");
234 for (i = 0; i < 21; i++) {
235
236 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
237 {
238 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
239 }
240 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
241 {
242 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
243 }
244 else if (upperTest[i] != u_tolower(lowerTest[i]))
245 {
246 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
247 }
248 else if (lowerTest[i] != u_toupper(upperTest[i]))
249 {
250 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
251 }
252 else if (upperTest[i] != u_tolower(upperTest[i]))
253 {
254 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
255 }
256 else if (lowerTest[i] != u_toupper(lowerTest[i]))
257 {
258 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
259 }
260 }
261 log_verbose("done testing upper lower\n");
262
263 log_verbose("testing u_istitle\n");
264 {
265 static const UChar expected[] = {
266 0x1F88,
267 0x1F89,
268 0x1F8A,
269 0x1F8B,
270 0x1F8C,
271 0x1F8D,
272 0x1F8E,
273 0x1F8F,
274 0x1F88,
275 0x1F89,
276 0x1F8A,
277 0x1F8B,
278 0x1F8C,
279 0x1F8D,
280 0x1F8E,
281 0x1F8F,
282 0x1F98,
283 0x1F99,
284 0x1F9A,
285 0x1F9B,
286 0x1F9C,
287 0x1F9D,
288 0x1F9E,
289 0x1F9F,
290 0x1F98,
291 0x1F99,
292 0x1F9A,
293 0x1F9B,
294 0x1F9C,
295 0x1F9D,
296 0x1F9E,
297 0x1F9F,
298 0x1FA8,
299 0x1FA9,
300 0x1FAA,
301 0x1FAB,
302 0x1FAC,
303 0x1FAD,
304 0x1FAE,
305 0x1FAF,
306 0x1FA8,
307 0x1FA9,
308 0x1FAA,
309 0x1FAB,
310 0x1FAC,
311 0x1FAD,
312 0x1FAE,
313 0x1FAF,
314 0x1FBC,
315 0x1FBC,
316 0x1FCC,
317 0x1FCC,
318 0x1FFC,
319 0x1FFC,
320 };
321 int32_t num = sizeof(expected)/sizeof(expected[0]);
322 for(i=0; i<num; i++){
323 if(!u_istitle(expected[i])){
324 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
325 }
326 }
327
328 }
329 }
330
331 /* compare two sets and verify that their difference or intersection is empty */
332 static UBool
333 showADiffB(const USet *a, const USet *b,
334 const char *a_name, const char *b_name,
335 UBool expect, UBool diffIsError) {
336 USet *aa;
337 int32_t i, start, end, length;
338 UErrorCode errorCode;
339
340 /*
341 * expect:
342 * TRUE -> a-b should be empty, that is, b should contain all of a
343 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
344 */
345 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
346 return TRUE;
347 }
348
349 /* clone a to aa because a is const */
350 aa=uset_open(1, 0);
351 if(aa==NULL) {
352 /* unusual problem - out of memory? */
353 return FALSE;
354 }
355 uset_addAll(aa, a);
356
357 /* compute the set in question */
358 if(expect) {
359 /* a-b */
360 uset_removeAll(aa, b);
361 } else {
362 /* a&b */
363 uset_retainAll(aa, b);
364 }
365
366 /* aa is not empty because of the initial tests above; show its contents */
367 errorCode=U_ZERO_ERROR;
368 i=0;
369 for(;;) {
370 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
371 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
372 break; /* done */
373 }
374 if(U_FAILURE(errorCode)) {
375 log_err("error comparing %s with %s at difference item %d: %s\n",
376 a_name, b_name, i, u_errorName(errorCode));
377 break;
378 }
379 if(length!=0) {
380 break; /* done with code points, got a string or -1 */
381 }
382
383 if(diffIsError) {
384 if(expect) {
385 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
386 } else {
387 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
388 }
389 } else {
390 if(expect) {
391 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
392 } else {
393 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
394 }
395 }
396
397 ++i;
398 }
399
400 uset_close(aa);
401 return FALSE;
402 }
403
404 static UBool
405 showAMinusB(const USet *a, const USet *b,
406 const char *a_name, const char *b_name,
407 UBool diffIsError) {
408 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
409 }
410
411 static UBool
412 showAIntersectB(const USet *a, const USet *b,
413 const char *a_name, const char *b_name,
414 UBool diffIsError) {
415 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
416 }
417
418 static UBool
419 compareUSets(const USet *a, const USet *b,
420 const char *a_name, const char *b_name,
421 UBool diffIsError) {
422 /*
423 * Use an arithmetic & not a logical && so that both branches
424 * are always taken and all differences are shown.
425 */
426 return
427 showAMinusB(a, b, a_name, b_name, diffIsError) &
428 showAMinusB(b, a, b_name, a_name, diffIsError);
429 }
430
431 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
432 static void TestLetterNumber()
433 {
434 UChar i = 0x0000;
435
436 log_verbose("Testing for isalpha\n");
437 for (i = 0x0041; i < 0x005B; i++) {
438 if (!u_isalpha(i))
439 {
440 log_err("Failed isLetter test at %.4X\n", i);
441 }
442 }
443 for (i = 0x0660; i < 0x066A; i++) {
444 if (u_isalpha(i))
445 {
446 log_err("Failed isLetter test with numbers at %.4X\n", i);
447 }
448 }
449
450 log_verbose("Testing for isdigit\n");
451 for (i = 0x0660; i < 0x066A; i++) {
452 if (!u_isdigit(i))
453 {
454 log_verbose("Failed isNumber test at %.4X\n", i);
455 }
456 }
457
458 log_verbose("Testing for isalnum\n");
459 for (i = 0x0041; i < 0x005B; i++) {
460 if (!u_isalnum(i))
461 {
462 log_err("Failed isAlNum test at %.4X\n", i);
463 }
464 }
465 for (i = 0x0660; i < 0x066A; i++) {
466 if (!u_isalnum(i))
467 {
468 log_err("Failed isAlNum test at %.4X\n", i);
469 }
470 }
471
472 {
473 /*
474 * The following checks work only starting from Unicode 4.0.
475 * Check the version number here.
476 */
477 static UVersionInfo u401={ 4, 0, 1, 0 };
478 UVersionInfo version;
479 u_getUnicodeVersion(version);
480 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
481 return;
482 }
483 }
484
485 {
486 /*
487 * Sanity check:
488 * Verify that exactly the digit characters have decimal digit values.
489 * This assumption is used in the implementation of u_digit()
490 * (which checks nt=de)
491 * compared with the parallel java.lang.Character.digit()
492 * (which checks Nd).
493 *
494 * This was not true in Unicode 3.2 and earlier.
495 * Unicode 4.0 fixed discrepancies.
496 * Unicode 4.0.1 re-introduced problems in this area due to an
497 * unintentionally incomplete last-minute change.
498 */
499 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
500 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
501
502 USet *digits, *decimalValues;
503 UErrorCode errorCode;
504
505 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
506 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
507 errorCode=U_ZERO_ERROR;
508 digits=uset_openPattern(digitsPattern, 6, &errorCode);
509 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
510
511 if(U_SUCCESS(errorCode)) {
512 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
513 }
514
515 uset_close(digits);
516 uset_close(decimalValues);
517 }
518 }
519
520 static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
521 const UChar32 *sampleChars, int32_t sampleCharsLength,
522 UBool expected) {
523 int32_t i;
524 for (i = 0; i < sampleCharsLength; ++i) {
525 UBool result = propFn(sampleChars[i]);
526 if (result != expected) {
527 log_err("error: character property function %s(U+%04x)=%d is wrong\n",
528 propName, sampleChars[i], result);
529 }
530 }
531 }
532
533 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
534 static void TestMisc()
535 {
536 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
537 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
538 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
539 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
540 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
541 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
542 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
543 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
544 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
545 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
546 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
547
548 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
549
550 uint32_t mask;
551
552 int32_t i;
553 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
554 UVersionInfo realVersion;
555
556 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
557
558 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, LENGTHOF(sampleSpaces), TRUE);
559 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE);
560
561 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
562 sampleSpaces, LENGTHOF(sampleSpaces), TRUE);
563 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
564 sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE);
565
566 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
567 sampleWhiteSpaces, LENGTHOF(sampleWhiteSpaces), TRUE);
568 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
569 sampleNonWhiteSpaces, LENGTHOF(sampleNonWhiteSpaces), FALSE);
570
571 testSampleCharProps(u_isdefined, "u_isdefined",
572 sampleDefined, LENGTHOF(sampleDefined), TRUE);
573 testSampleCharProps(u_isdefined, "u_isdefined",
574 sampleUndefined, LENGTHOF(sampleUndefined), FALSE);
575
576 testSampleCharProps(u_isbase, "u_isbase", sampleBase, LENGTHOF(sampleBase), TRUE);
577 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, LENGTHOF(sampleNonBase), FALSE);
578
579 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, LENGTHOF(sampleDigits), TRUE);
580 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, LENGTHOF(sampleNonDigits), FALSE);
581
582 for (i = 0; i < LENGTHOF(sampleDigits); i++) {
583 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
584 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
585 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
586 }
587 }
588
589 /* Tests the ICU version #*/
590 u_getVersion(realVersion);
591 u_versionToString(realVersion, icuVersion);
592 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
593 {
594 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
595 }
596 #if defined(ICU_VERSION)
597 /* test only happens where we have configure.in with VERSION - sanity check. */
598 if(strcmp(U_ICU_VERSION, ICU_VERSION))
599 {
600 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
601 }
602 #endif
603
604 /* test U_GC_... */
605 if(
606 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
607 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
608 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
609 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
610 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
611 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
612 ) {
613 log_err("error: U_GET_GC_MASK does not work properly\n");
614 }
615
616 mask=0;
617 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
618
619 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
620 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
621 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
622 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
623 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
624
625 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
626 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
627 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
628
629 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
630 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
631 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
632
633 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
634 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
635 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
636
637 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
638 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
639 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
640 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
641
642 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
643 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
644 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
645 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
646 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
647
648 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
649 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
650 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
651 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
652
653 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
654 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
655
656 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
657 log_err("error: problems with U_GC_XX_MASK constants\n");
658 }
659
660 mask=0;
661 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
662 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
663 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
664 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
665 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
666 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
667 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
668
669 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
670 log_err("error: problems with U_GC_Y_MASK constants\n");
671 }
672 {
673 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
674 for(i=0; i<10; i++){
675 if(digit[i]!=u_forDigit(i,10)){
676 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
677 }
678 }
679 }
680
681 /* test u_digit() */
682 {
683 static const struct {
684 UChar32 c;
685 int8_t radix, value;
686 } data[]={
687 /* base 16 */
688 { 0x0031, 16, 1 },
689 { 0x0038, 16, 8 },
690 { 0x0043, 16, 12 },
691 { 0x0066, 16, 15 },
692 { 0x00e4, 16, -1 },
693 { 0x0662, 16, 2 },
694 { 0x06f5, 16, 5 },
695 { 0xff13, 16, 3 },
696 { 0xff41, 16, 10 },
697
698 /* base 8 */
699 { 0x0031, 8, 1 },
700 { 0x0038, 8, -1 },
701 { 0x0043, 8, -1 },
702 { 0x0066, 8, -1 },
703 { 0x00e4, 8, -1 },
704 { 0x0662, 8, 2 },
705 { 0x06f5, 8, 5 },
706 { 0xff13, 8, 3 },
707 { 0xff41, 8, -1 },
708
709 /* base 36 */
710 { 0x5a, 36, 35 },
711 { 0x7a, 36, 35 },
712 { 0xff3a, 36, 35 },
713 { 0xff5a, 36, 35 },
714
715 /* wrong radix values */
716 { 0x0031, 1, -1 },
717 { 0xff3a, 37, -1 }
718 };
719
720 for(i=0; i<LENGTHOF(data); ++i) {
721 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
722 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
723 data[i].c,
724 data[i].radix,
725 u_digit(data[i].c, data[i].radix),
726 data[i].value);
727 }
728 }
729 }
730 }
731
732 /* test C/POSIX-style functions --------------------------------------------- */
733
734 /* bit flags */
735 #define ISAL 1
736 #define ISLO 2
737 #define ISUP 4
738
739 #define ISDI 8
740 #define ISXD 0x10
741
742 #define ISAN 0x20
743
744 #define ISPU 0x40
745 #define ISGR 0x80
746 #define ISPR 0x100
747
748 #define ISSP 0x200
749 #define ISBL 0x400
750 #define ISCN 0x800
751
752 /* C/POSIX-style functions, in the same order as the bit flags */
753 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
754
755 static const struct {
756 IsPOSIXClass *fn;
757 const char *name;
758 } posixClasses[]={
759 { u_isalpha, "isalpha" },
760 { u_islower, "islower" },
761 { u_isupper, "isupper" },
762 { u_isdigit, "isdigit" },
763 { u_isxdigit, "isxdigit" },
764 { u_isalnum, "isalnum" },
765 { u_ispunct, "ispunct" },
766 { u_isgraph, "isgraph" },
767 { u_isprint, "isprint" },
768 { u_isspace, "isspace" },
769 { u_isblank, "isblank" },
770 { u_iscntrl, "iscntrl" }
771 };
772
773 static const struct {
774 UChar32 c;
775 uint32_t posixResults;
776 } posixData[]={
777 { 0x0008, ISCN }, /* backspace */
778 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
779 { 0x000a, ISSP| ISCN }, /* LF */
780 { 0x000c, ISSP| ISCN }, /* FF */
781 { 0x000d, ISSP| ISCN }, /* CR */
782 { 0x0020, ISPR|ISSP|ISBL }, /* space */
783 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
784 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
785 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
786 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
787 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
788 { 0x007b, ISPU|ISGR|ISPR }, /* { */
789 { 0x0085, ISSP| ISCN }, /* NEL */
790 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
791 { 0x00a4, ISGR|ISPR }, /* currency sign */
792 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
793 { 0x0300, ISGR|ISPR }, /* combining grave */
794 { 0x0600, ISCN }, /* arabic number sign */
795 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
796 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
797 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
798 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
799 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
800 { 0x200b, ISCN }, /* ZWSP */
801 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
802 { 0x200e, ISCN }, /* LRM */
803 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
804 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
805 { 0x20ac, ISGR|ISPR }, /* Euro */
806 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
807 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
808 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
809 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
810 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
811 };
812
813 static void
814 TestPOSIX() {
815 uint32_t mask;
816 int32_t cl, i;
817 UBool expect;
818
819 mask=1;
820 for(cl=0; cl<12; ++cl) {
821 for(i=0; i<LENGTHOF(posixData); ++i) {
822 expect=(UBool)((posixData[i].posixResults&mask)!=0);
823 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
824 log_err("u_%s(U+%04x)=%s is wrong\n",
825 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
826 }
827 }
828 mask<<=1;
829 }
830 }
831
832 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
833 static void TestControlPrint()
834 {
835 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
836 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
837 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
838 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
839 UChar32 c;
840
841 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, LENGTHOF(sampleControl), TRUE);
842 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, LENGTHOF(sampleNonControl), FALSE);
843
844 testSampleCharProps(u_isprint, "u_isprint",
845 samplePrintable, LENGTHOF(samplePrintable), TRUE);
846 testSampleCharProps(u_isprint, "u_isprint",
847 sampleNonPrintable, LENGTHOF(sampleNonPrintable), FALSE);
848
849 /* test all ISO 8 controls */
850 for(c=0; c<=0x9f; ++c) {
851 if(c==0x20) {
852 /* skip ASCII graphic characters and continue with DEL */
853 c=0x7f;
854 }
855 if(!u_iscntrl(c)) {
856 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
857 }
858 if(!u_isISOControl(c)) {
859 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
860 }
861 if(u_isprint(c)) {
862 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
863 }
864 }
865
866 /* test all Latin-1 graphic characters */
867 for(c=0x20; c<=0xff; ++c) {
868 if(c==0x7f) {
869 c=0xa0;
870 } else if(c==0xad) {
871 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
872 ++c;
873 }
874 if(!u_isprint(c)) {
875 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
876 }
877 }
878 }
879
880 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
881 static void TestIdentifier()
882 {
883 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
884 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
885 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
886 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
887 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
888 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
889 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
890 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
891 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
892 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
893
894 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
895 sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE);
896 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
897 sampleNonJavaIDStart, LENGTHOF(sampleNonJavaIDStart), FALSE);
898
899 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
900 sampleJavaIDPart, LENGTHOF(sampleJavaIDPart), TRUE);
901 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
902 sampleNonJavaIDPart, LENGTHOF(sampleNonJavaIDPart), FALSE);
903
904 /* IDPart should imply IDStart */
905 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
906 sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE);
907
908 testSampleCharProps(u_isIDStart, "u_isIDStart",
909 sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE);
910 testSampleCharProps(u_isIDStart, "u_isIDStart",
911 sampleNonUnicodeIDStart, LENGTHOF(sampleNonUnicodeIDStart), FALSE);
912
913 testSampleCharProps(u_isIDPart, "u_isIDPart",
914 sampleUnicodeIDPart, LENGTHOF(sampleUnicodeIDPart), TRUE);
915 testSampleCharProps(u_isIDPart, "u_isIDPart",
916 sampleNonUnicodeIDPart, LENGTHOF(sampleNonUnicodeIDPart), FALSE);
917
918 /* IDPart should imply IDStart */
919 testSampleCharProps(u_isIDPart, "u_isIDPart",
920 sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE);
921
922 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
923 sampleIDIgnore, LENGTHOF(sampleIDIgnore), TRUE);
924 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
925 sampleNonIDIgnore, LENGTHOF(sampleNonIDIgnore), FALSE);
926 }
927
928 /* for each line of UnicodeData.txt, check some of the properties */
929 typedef struct UnicodeDataContext {
930 #if UCONFIG_NO_NORMALIZATION
931 const void *dummy;
932 #else
933 const UNormalizer2 *nfc;
934 const UNormalizer2 *nfkc;
935 #endif
936 } UnicodeDataContext;
937
938 /*
939 * ### TODO
940 * This test fails incorrectly if the First or Last code point of a repetitive area
941 * is overridden, which is allowed and is encouraged for the PUAs.
942 * Currently, this means that both area First/Last and override lines are
943 * tested against the properties from the API,
944 * and the area boundary will not match and cause an error.
945 *
946 * This function should detect area boundaries and skip them for the test of individual
947 * code points' properties.
948 * Then it should check that the areas contain all the same properties except where overridden.
949 * For this, it would have had to set a flag for which code points were listed explicitly.
950 */
951 static void U_CALLCONV
952 unicodeDataLineFn(void *context,
953 char *fields[][2], int32_t fieldCount,
954 UErrorCode *pErrorCode)
955 {
956 char buffer[100];
957 const char *d;
958 char *end;
959 uint32_t value;
960 UChar32 c;
961 int32_t i;
962 int8_t type;
963 int32_t dt;
964 UChar dm[32], s[32];
965 int32_t dmLength, length;
966
967 #if !UCONFIG_NO_NORMALIZATION
968 const UNormalizer2 *nfc, *nfkc;
969 #endif
970
971 /* get the character code, field 0 */
972 c=strtoul(fields[0][0], &end, 16);
973 if(end<=fields[0][0] || end!=fields[0][1]) {
974 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
975 return;
976 }
977 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
978 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
979 return;
980 }
981
982 /* get general category, field 2 */
983 *fields[2][1]=0;
984 type = (int8_t)tagValues[MakeProp(fields[2][0])];
985 if(u_charType(c)!=type) {
986 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
987 }
988 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
989 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
990 }
991
992 /* get canonical combining class, field 3 */
993 value=strtoul(fields[3][0], &end, 10);
994 if(end<=fields[3][0] || end!=fields[3][1]) {
995 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
996 return;
997 }
998 if(value>255) {
999 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1000 return;
1001 }
1002 #if !UCONFIG_NO_NORMALIZATION
1003 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1004 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1005 }
1006 nfkc=((UnicodeDataContext *)context)->nfkc;
1007 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1008 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1009 }
1010 #endif
1011
1012 /* get BiDi category, field 4 */
1013 *fields[4][1]=0;
1014 i=MakeDir(fields[4][0]);
1015 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1016 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1017 }
1018
1019 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1020 d=NULL;
1021 if(fields[5][0]==fields[5][1]) {
1022 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1023 if(c==0xac00 || c==0xd7a3) {
1024 dt=U_DT_CANONICAL;
1025 } else {
1026 dt=U_DT_NONE;
1027 }
1028 } else {
1029 d=fields[5][0];
1030 *fields[5][1]=0;
1031 dt=UCHAR_INVALID_CODE;
1032 if(*d=='<') {
1033 end=strchr(++d, '>');
1034 if(end!=NULL) {
1035 *end=0;
1036 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1037 d=u_skipWhitespace(end+1);
1038 }
1039 } else {
1040 dt=U_DT_CANONICAL;
1041 }
1042 }
1043 if(dt>U_DT_NONE) {
1044 if(c==0xac00) {
1045 dm[0]=0x1100;
1046 dm[1]=0x1161;
1047 dm[2]=0;
1048 dmLength=2;
1049 } else if(c==0xd7a3) {
1050 dm[0]=0xd788;
1051 dm[1]=0x11c2;
1052 dm[2]=0;
1053 dmLength=2;
1054 } else {
1055 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1056 }
1057 } else {
1058 dmLength=-1;
1059 }
1060 if(dt<0 || U_FAILURE(*pErrorCode)) {
1061 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1062 return;
1063 }
1064 #if !UCONFIG_NO_NORMALIZATION
1065 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1066 if(i!=dt) {
1067 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1068 }
1069 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1070 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1071 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1072 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1073 "or the Decomposition_Mapping is different (%s)\n",
1074 c, length, dmLength, u_errorName(*pErrorCode));
1075 return;
1076 }
1077 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1078 if(dt!=U_DT_CANONICAL) {
1079 dmLength=-1;
1080 }
1081 nfc=((UnicodeDataContext *)context)->nfc;
1082 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1083 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1084 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1085 "or the Decomposition_Mapping is different (%s)\n",
1086 c, length, dmLength, u_errorName(*pErrorCode));
1087 return;
1088 }
1089 /* recompose */
1090 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1091 UChar32 a, b, composite;
1092 i=0;
1093 U16_NEXT(dm, i, dmLength, a);
1094 U16_NEXT(dm, i, dmLength, b);
1095 /* i==dmLength */
1096 composite=unorm2_composePair(nfc, a, b);
1097 if(composite!=c) {
1098 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1099 (long)c, (long)a, (long)b, (long)composite);
1100 }
1101 /*
1102 * Note: NFKC has fewer round-trip mappings than NFC,
1103 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1104 */
1105 }
1106 #endif
1107
1108 /* get ISO Comment, field 11 */
1109 *fields[11][1]=0;
1110 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1111 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1112 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1113 c, u_errorName(*pErrorCode),
1114 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1115 fields[11][0]);
1116 }
1117
1118 /* get uppercase mapping, field 12 */
1119 if(fields[12][0]!=fields[12][1]) {
1120 value=strtoul(fields[12][0], &end, 16);
1121 if(end!=fields[12][1]) {
1122 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1123 return;
1124 }
1125 if((UChar32)value!=u_toupper(c)) {
1126 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1127 }
1128 } else {
1129 /* no case mapping: the API must map the code point to itself */
1130 if(c!=u_toupper(c)) {
1131 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1132 }
1133 }
1134
1135 /* get lowercase mapping, field 13 */
1136 if(fields[13][0]!=fields[13][1]) {
1137 value=strtoul(fields[13][0], &end, 16);
1138 if(end!=fields[13][1]) {
1139 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1140 return;
1141 }
1142 if((UChar32)value!=u_tolower(c)) {
1143 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1144 }
1145 } else {
1146 /* no case mapping: the API must map the code point to itself */
1147 if(c!=u_tolower(c)) {
1148 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1149 }
1150 }
1151
1152 /* get titlecase mapping, field 14 */
1153 if(fields[14][0]!=fields[14][1]) {
1154 value=strtoul(fields[14][0], &end, 16);
1155 if(end!=fields[14][1]) {
1156 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1157 return;
1158 }
1159 if((UChar32)value!=u_totitle(c)) {
1160 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1161 }
1162 } else {
1163 /* no case mapping: the API must map the code point to itself */
1164 if(c!=u_totitle(c)) {
1165 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1166 }
1167 }
1168 }
1169
1170 static UBool U_CALLCONV
1171 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1172 static const UChar32 test[][2]={
1173 {0x41, U_UPPERCASE_LETTER},
1174 {0x308, U_NON_SPACING_MARK},
1175 {0xfffe, U_GENERAL_OTHER_TYPES},
1176 {0xe0041, U_FORMAT_CHAR},
1177 {0xeffff, U_UNASSIGNED}
1178 };
1179
1180 int32_t i, count;
1181
1182 if(0!=strcmp((const char *)context, "a1")) {
1183 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1184 return FALSE;
1185 }
1186
1187 count=LENGTHOF(test);
1188 for(i=0; i<count; ++i) {
1189 if(start<=test[i][0] && test[i][0]<limit) {
1190 if(type!=(UCharCategory)test[i][1]) {
1191 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1192 start, limit, (long)type, test[i][0], test[i][1]);
1193 }
1194 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1195 return i==(count-1) ? FALSE : TRUE;
1196 }
1197 }
1198
1199 if(start>test[count-1][0]) {
1200 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1201 start, limit, (long)type);
1202 return FALSE;
1203 }
1204
1205 return TRUE;
1206 }
1207
1208 static UBool U_CALLCONV
1209 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1210 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1211 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1212 { 0x0590, U_LEFT_TO_RIGHT },
1213 { 0x0600, U_RIGHT_TO_LEFT },
1214 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1215 { 0x08A0, U_RIGHT_TO_LEFT },
1216 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1217 { 0xFB1D, U_LEFT_TO_RIGHT },
1218 { 0xFB50, U_RIGHT_TO_LEFT },
1219 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1220 { 0xFE70, U_LEFT_TO_RIGHT },
1221 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1222 { 0x10800, U_LEFT_TO_RIGHT },
1223 { 0x11000, U_RIGHT_TO_LEFT },
1224 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1225 { 0x1EE00, U_RIGHT_TO_LEFT },
1226 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1227 { 0x1F000, U_RIGHT_TO_LEFT },
1228 { 0x110000, U_LEFT_TO_RIGHT }
1229 };
1230
1231 UChar32 c;
1232 int32_t i;
1233 UCharDirection shouldBeDir;
1234
1235 /*
1236 * LineBreak.txt specifies:
1237 * # - Assigned characters that are not listed explicitly are given the value
1238 * # "AL".
1239 * # - Unassigned characters are given the value "XX".
1240 *
1241 * PUA characters are listed explicitly with "XX".
1242 * Verify that no assigned character has "XX".
1243 */
1244 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1245 c=start;
1246 while(c<limit) {
1247 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1248 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1249 }
1250 ++c;
1251 }
1252 }
1253
1254 /*
1255 * Verify default Bidi classes.
1256 * For recent Unicode versions, see UCD.html.
1257 *
1258 * For older Unicode versions:
1259 * See table 3-7 "Bidirectional Character Types" in UAX #9.
1260 * http://www.unicode.org/reports/tr9/
1261 *
1262 * See also DerivedBidiClass.txt for Cn code points!
1263 *
1264 * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html)
1265 * changed some default values.
1266 * In particular, non-characters and unassigned Default Ignorable Code Points
1267 * change from L to BN.
1268 *
1269 * UCD.html version 4.0.1 does not yet reflect these changes.
1270 */
1271 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1272 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1273 c=start;
1274 for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) {
1275 if((int32_t)c<defaultBidi[i][0]) {
1276 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1277 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1278 shouldBeDir=U_BOUNDARY_NEUTRAL;
1279 } else {
1280 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1281 }
1282
1283 if( u_charDirection(c)!=shouldBeDir ||
1284 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1285 ) {
1286 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1287 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1288 }
1289 ++c;
1290 }
1291 }
1292 }
1293 }
1294
1295 return TRUE;
1296 }
1297
1298 /* tests for several properties */
1299 static void TestUnicodeData()
1300 {
1301 UVersionInfo expectVersionArray;
1302 UVersionInfo versionArray;
1303 char *fields[15][2];
1304 UErrorCode errorCode;
1305 UChar32 c;
1306 int8_t type;
1307
1308 UnicodeDataContext context;
1309
1310 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1311 u_getUnicodeVersion(versionArray);
1312 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1313 {
1314 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1315 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1316 }
1317
1318 #if defined(ICU_UNICODE_VERSION)
1319 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1320 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1321 {
1322 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1323 }
1324 #endif
1325
1326 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1327 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1328 }
1329
1330 errorCode=U_ZERO_ERROR;
1331 #if !UCONFIG_NO_NORMALIZATION
1332 context.nfc=unorm2_getNFCInstance(&errorCode);
1333 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1334 if(U_FAILURE(errorCode)) {
1335 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1336 return;
1337 }
1338 #endif
1339 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1340 if(U_FAILURE(errorCode)) {
1341 return; /* if we couldn't parse UnicodeData.txt, we should return */
1342 }
1343
1344 /* sanity check on repeated properties */
1345 for(c=0xfffe; c<=0x10ffff;) {
1346 type=u_charType(c);
1347 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1348 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1349 }
1350 if(type!=U_UNASSIGNED) {
1351 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1352 }
1353 if((c&0xffff)==0xfffe) {
1354 ++c;
1355 } else {
1356 c+=0xffff;
1357 }
1358 }
1359
1360 /* test that PUA is not "unassigned" */
1361 for(c=0xe000; c<=0x10fffd;) {
1362 type=u_charType(c);
1363 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1364 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1365 }
1366 if(type==U_UNASSIGNED) {
1367 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1368 } else if(type!=U_PRIVATE_USE_CHAR) {
1369 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1370 }
1371 if(c==0xf8ff) {
1372 c=0xf0000;
1373 } else if(c==0xffffd) {
1374 c=0x100000;
1375 } else {
1376 ++c;
1377 }
1378 }
1379
1380 /* test u_enumCharTypes() */
1381 u_enumCharTypes(enumTypeRange, "a1");
1382
1383 /* check default properties */
1384 u_enumCharTypes(enumDefaultsRange, NULL);
1385 }
1386
1387 static void TestCodeUnit(){
1388 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1389
1390 int32_t i;
1391
1392 for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){
1393 UChar c=codeunit[i];
1394 if(i<4){
1395 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1396 log_err("ERROR: U+%04x is a single", c);
1397 }
1398
1399 }
1400 if(i >= 4 && i< 8){
1401 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1402 log_err("ERROR: U+%04x is a first surrogate", c);
1403 }
1404 }
1405 if(i >= 8 && i< 12){
1406 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1407 log_err("ERROR: U+%04x is a second surrogate", c);
1408 }
1409 }
1410 }
1411
1412 }
1413
1414 static void TestCodePoint(){
1415 const UChar32 codePoint[]={
1416 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1417 0xd800,
1418 0xdbff,
1419 0xdc00,
1420 0xdfff,
1421 0xdc04,
1422 0xd821,
1423 /*not a surrogate, valid, isUnicodeChar , not Error*/
1424 0x20ac,
1425 0xd7ff,
1426 0xe000,
1427 0xe123,
1428 0x0061,
1429 0xe065,
1430 0x20402,
1431 0x24506,
1432 0x23456,
1433 0x20402,
1434 0x10402,
1435 0x23456,
1436 /*not a surrogate, not valid, isUnicodeChar, isError */
1437 0x0015,
1438 0x009f,
1439 /*not a surrogate, not valid, not isUnicodeChar, isError */
1440 0xffff,
1441 0xfffe,
1442 };
1443 int32_t i;
1444 for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){
1445 UChar32 c=codePoint[i];
1446 if(i<6){
1447 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1448 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1449 }
1450 if(UTF_IS_VALID(c)){
1451 log_err("ERROR: isValid() failed for U+%04x\n", c);
1452 }
1453 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1454 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1455 }
1456 if(UTF_IS_ERROR(c)){
1457 log_err("ERROR: isError() failed for U+%04x\n", c);
1458 }
1459 }else if(i >=6 && i<18){
1460 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1461 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1462 }
1463 if(!UTF_IS_VALID(c)){
1464 log_err("ERROR: isValid() failed for U+%04x\n", c);
1465 }
1466 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1467 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1468 }
1469 if(UTF_IS_ERROR(c)){
1470 log_err("ERROR: isError() failed for U+%04x\n", c);
1471 }
1472 }else if(i >=18 && i<20){
1473 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1474 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1475 }
1476 if(UTF_IS_VALID(c)){
1477 log_err("ERROR: isValid() failed for U+%04x\n", c);
1478 }
1479 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1480 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1481 }
1482 if(!UTF_IS_ERROR(c)){
1483 log_err("ERROR: isError() failed for U+%04x\n", c);
1484 }
1485 }
1486 else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
1487 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1488 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1489 }
1490 if(UTF_IS_VALID(c)){
1491 log_err("ERROR: isValid() failed for U+%04x\n", c);
1492 }
1493 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1494 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1495 }
1496 if(!UTF_IS_ERROR(c)){
1497 log_err("ERROR: isError() failed for U+%04x\n", c);
1498 }
1499 }
1500 }
1501
1502 if(
1503 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1504 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1505 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1506 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1507 ) {
1508 log_err("error with U_IS_BMP()\n");
1509 }
1510
1511 if(
1512 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1513 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1514 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1515 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1516 ) {
1517 log_err("error with U_IS_SUPPLEMENTARY()\n");
1518 }
1519 }
1520
1521 static void TestCharLength()
1522 {
1523 const int32_t codepoint[]={
1524 1, 0x0061,
1525 1, 0xe065,
1526 1, 0x20ac,
1527 2, 0x20402,
1528 2, 0x23456,
1529 2, 0x24506,
1530 2, 0x20402,
1531 2, 0x10402,
1532 1, 0xd7ff,
1533 1, 0xe000
1534 };
1535
1536 int32_t i;
1537 UBool multiple;
1538 for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
1539 UChar32 c=codepoint[i+1];
1540 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1541 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1542 }
1543 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1544 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1545 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1546 }
1547 }
1548 }
1549
1550 /*internal functions ----*/
1551 static int32_t MakeProp(char* str)
1552 {
1553 int32_t result = 0;
1554 char* matchPosition =0;
1555
1556 matchPosition = strstr(tagStrings, str);
1557 if (matchPosition == 0)
1558 {
1559 log_err("unrecognized type letter ");
1560 log_err(str);
1561 }
1562 else
1563 result = (int32_t)((matchPosition - tagStrings) / 2);
1564 return result;
1565 }
1566
1567 static int32_t MakeDir(char* str)
1568 {
1569 int32_t pos = 0;
1570 for (pos = 0; pos < 19; pos++) {
1571 if (strcmp(str, dirStrings[pos]) == 0) {
1572 return pos;
1573 }
1574 }
1575 return -1;
1576 }
1577
1578 /* test u_charName() -------------------------------------------------------- */
1579
/*
 * Sample code points with their expected names, used by TestCharNames()
 * and enumCharNamesFn().
 * Entries with only four initializers leave alias as NULL; the code that
 * reads alias checks for NULL.
 */
static const struct {
    uint32_t code;
    /* expected results for U_UNICODE_CHAR_NAME, U_UNICODE_10_CHAR_NAME, U_EXTENDED_CHAR_NAME, U_CHAR_NAME_ALIAS */
    const char *name, *oldName, *extName, *alias;
} names[]={
    {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"},
    {0x01a2, "LATIN CAPITAL LETTER OI", "",
             "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER GHA"},
    {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
             "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" },
    {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
             "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
             "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" },
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" },
    {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" },
    {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" },
    /* surrogates and noncharacters: empty modern name, bracketed extended name */
    {0xd800, "", "", "<lead surrogate-D800>" },
    {0xdc00, "", "", "<trail surrogate-DC00>" },
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS" },
    {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" },
    {0xffff, "", "", "<noncharacter-FFFF>" },
    {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
              "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
              "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" }
};
1607
1608 static UBool
1609 enumCharNamesFn(void *context,
1610 UChar32 code, UCharNameChoice nameChoice,
1611 const char *name, int32_t length) {
1612 int32_t *pCount=(int32_t *)context;
1613 const char *expected;
1614 int i;
1615
1616 if(length<=0 || length!=(int32_t)strlen(name)) {
1617 /* should not be called with an empty string or invalid length */
1618 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1619 return TRUE;
1620 }
1621
1622 ++*pCount;
1623 for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) {
1624 if(code==(UChar32)names[i].code) {
1625 switch (nameChoice) {
1626 case U_EXTENDED_CHAR_NAME:
1627 if(0!=strcmp(name, names[i].extName)) {
1628 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1629 }
1630 break;
1631 case U_UNICODE_CHAR_NAME:
1632 if(0!=strcmp(name, names[i].name)) {
1633 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1634 }
1635 break;
1636 case U_UNICODE_10_CHAR_NAME:
1637 expected=names[i].oldName;
1638 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1639 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1640 }
1641 break;
1642 case U_CHAR_NAME_ALIAS:
1643 expected=names[i].alias;
1644 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1645 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1646 }
1647 break;
1648 case U_CHAR_NAME_CHOICE_COUNT:
1649 break;
1650 }
1651 break;
1652 }
1653 }
1654 return TRUE;
1655 }
1656
/* State carried across enumExtCharNamesFn() callbacks. */
struct enumExtCharNamesContext {
    uint32_t length;    /* passed by address to enumCharNamesFn() as its context */
    int32_t last;       /* code point of the previous callback; TestCharNames() starts it at -1 */
};
1661
1662 static UBool
1663 enumExtCharNamesFn(void *context,
1664 UChar32 code, UCharNameChoice nameChoice,
1665 const char *name, int32_t length) {
1666 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1667
1668 if (ecncp->last != (int32_t) code - 1) {
1669 if (ecncp->last < 0) {
1670 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1671 } else {
1672 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1673 }
1674 }
1675 ecncp->last = (int32_t) code;
1676
1677 if (!*name) {
1678 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1679 }
1680
1681 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1682 }
1683
1684 /**
1685 * This can be made more efficient by moving it into putil.c and having
1686 * it directly access the ebcdic translation tables.
1687 * TODO: If we get this method in putil.c, then delete it from here.
1688 */
1689 static UChar
1690 u_charToUChar(char c) {
1691 UChar uc;
1692 u_charsToUChars(&c, &uc, 1);
1693 return uc;
1694 }
1695
1696 static void
1697 TestCharNames() {
1698 static char name[80];
1699 UErrorCode errorCode=U_ZERO_ERROR;
1700 struct enumExtCharNamesContext extContext;
1701 const char *expected;
1702 int32_t length;
1703 UChar32 c;
1704 int32_t i;
1705
1706 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1707 length=uprv_getMaxCharNameLength();
1708 if(length==0) {
1709 /* no names data available */
1710 return;
1711 }
1712 if(length<83) { /* Unicode 3.2 max char name length */
1713 log_err("uprv_getMaxCharNameLength()=%d is too short");
1714 }
1715 /* ### TODO same tests for max ISO comment length as for max name length */
1716
1717 log_verbose("Testing u_charName()\n");
1718 for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) {
1719 /* modern Unicode character name */
1720 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1721 if(U_FAILURE(errorCode)) {
1722 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1723 return;
1724 }
1725 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1726 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1727 }
1728
1729 /* find the modern name */
1730 if (*names[i].name) {
1731 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1732 if(U_FAILURE(errorCode)) {
1733 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1734 return;
1735 }
1736 if(c!=(UChar32)names[i].code) {
1737 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1738 }
1739 }
1740
1741 /* Unicode 1.0 character name */
1742 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1743 if(U_FAILURE(errorCode)) {
1744 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1745 return;
1746 }
1747 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1748 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1749 }
1750
1751 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1752 if(names[i].oldName[0]!=0 /* && length>0 */) {
1753 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1754 if(U_FAILURE(errorCode)) {
1755 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1756 return;
1757 }
1758 if(c!=(UChar32)names[i].code) {
1759 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1760 }
1761 }
1762
1763 /* Unicode character name alias */
1764 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1765 if(U_FAILURE(errorCode)) {
1766 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1767 return;
1768 }
1769 expected=names[i].alias;
1770 if(expected==NULL) {
1771 expected="";
1772 }
1773 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1774 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1775 names[i].code, name, length, expected);
1776 }
1777
1778 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1779 if(expected[0]!=0 /* && length>0 */) {
1780 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1781 if(U_FAILURE(errorCode)) {
1782 log_err("u_charFromName(%s - alias) error %s\n",
1783 expected, u_errorName(errorCode));
1784 return;
1785 }
1786 if(c!=(UChar32)names[i].code) {
1787 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1788 expected, c, names[i].code);
1789 }
1790 }
1791 }
1792
1793 /* test u_enumCharNames() */
1794 length=0;
1795 errorCode=U_ZERO_ERROR;
1796 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1797 if(U_FAILURE(errorCode) || length<94140) {
1798 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1799 }
1800
1801 extContext.length = 0;
1802 extContext.last = -1;
1803 errorCode=U_ZERO_ERROR;
1804 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1805 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1806 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1807 }
1808
1809 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1810 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1811 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1812 }
1813
1814 /* Test getCharNameCharacters */
1815 if(!getTestOption(QUICK_OPTION)) {
1816 enum { BUFSIZE = 256 };
1817 UErrorCode ec = U_ZERO_ERROR;
1818 char buf[BUFSIZE];
1819 int32_t maxLength;
1820 UChar32 cp;
1821 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1822 int32_t l1, l2;
1823 UBool map[256];
1824 UBool ok;
1825
1826 USet* set = uset_open(1, 0); /* empty set */
1827 USet* dumb = uset_open(1, 0); /* empty set */
1828
1829 /*
1830 * uprv_getCharNameCharacters() will likely return more lowercase
1831 * letters than actual character names contain because
1832 * it includes all the characters in lowercased names of
1833 * general categories, for the full possible set of extended names.
1834 */
1835 {
1836 USetAdder sa={
1837 NULL,
1838 uset_add,
1839 uset_addRange,
1840 uset_addString,
1841 NULL /* don't need remove() */
1842 };
1843 sa.set=set;
1844 uprv_getCharNameCharacters(&sa);
1845 }
1846
1847 /* build set the dumb (but sure-fire) way */
1848 for (i=0; i<256; ++i) {
1849 map[i] = FALSE;
1850 }
1851
1852 maxLength=0;
1853 for (cp=0; cp<0x110000; ++cp) {
1854 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1855 buf, BUFSIZE, &ec);
1856 if (U_FAILURE(ec)) {
1857 log_err("FAIL: u_charName failed when it shouldn't\n");
1858 uset_close(set);
1859 uset_close(dumb);
1860 return;
1861 }
1862 if(len>maxLength) {
1863 maxLength=len;
1864 }
1865
1866 for (i=0; i<len; ++i) {
1867 if (!map[(uint8_t) buf[i]]) {
1868 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1869 map[(uint8_t) buf[i]] = TRUE;
1870 }
1871 }
1872
1873 /* test for leading/trailing whitespace */
1874 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1875 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1876 }
1877 }
1878
1879 if(map[(uint8_t)'\t']) {
1880 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1881 }
1882
1883 length=uprv_getMaxCharNameLength();
1884 if(length!=maxLength) {
1885 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1886 length, maxLength);
1887 }
1888
1889 /* compare the sets. Where is my uset_equals?!! */
1890 ok=TRUE;
1891 for(i=0; i<256; ++i) {
1892 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1893 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1894 /* ignore lowercase a-z that are in set but not in dumb */
1895 ok=TRUE;
1896 } else {
1897 ok=FALSE;
1898 break;
1899 }
1900 }
1901 }
1902
1903 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1904 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1905 if (U_FAILURE(ec)) {
1906 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1907 uset_close(set);
1908 uset_close(dumb);
1909 return;
1910 }
1911
1912 if (l1 >= BUFSIZE) {
1913 l1 = BUFSIZE-1;
1914 pat[l1] = 0;
1915 }
1916 if (l2 >= BUFSIZE) {
1917 l2 = BUFSIZE-1;
1918 dumbPat[l2] = 0;
1919 }
1920
1921 if (!ok) {
1922 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1923 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1924 } else if(getTestOption(VERBOSITY_OPTION)) {
1925 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1926 }
1927
1928 uset_close(set);
1929 uset_close(dumb);
1930 }
1931
1932 /* ### TODO: test error cases and other interesting things */
1933 }
1934
1935 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
1936
1937 static void
1938 TestMirroring() {
1939 USet *set;
1940 UErrorCode errorCode;
1941
1942 UChar32 start, end, c2, c3;
1943 int32_t i;
1944
1945 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1946
1947 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1948
1949 log_verbose("Testing u_isMirrored()\n");
1950 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1951 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1952 )
1953 ) {
1954 log_err("u_isMirrored() does not work correctly\n");
1955 }
1956
1957 log_verbose("Testing u_charMirror()\n");
1958 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
1959 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
1960 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
1961 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
1962 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
1963 )
1964 ) {
1965 log_err("u_charMirror() does not work correctly\n");
1966 }
1967
1968 /* verify that Bidi_Mirroring_Glyph roundtrips */
1969 errorCode=U_ZERO_ERROR;
1970 set=uset_openPattern(mirroredPattern, 17, &errorCode);
1971
1972 if (U_FAILURE(errorCode)) {
1973 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
1974 } else {
1975 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
1976 do {
1977 c2=u_charMirror(start);
1978 c3=u_charMirror(c2);
1979 if(c3!=start) {
1980 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
1981 }
1982 } while(++start<=end);
1983 }
1984 }
1985
1986 uset_close(set);
1987 }
1988
1989
1990 struct RunTestData
1991 {
1992 const char *runText;
1993 UScriptCode runCode;
1994 };
1995
1996 typedef struct RunTestData RunTestData;
1997
1998 static void
1999 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2000 const char *prefix)
2001 {
2002 int32_t run, runStart, runLimit;
2003 UScriptCode runCode;
2004
2005 /* iterate over all the runs */
2006 run = 0;
2007 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2008 if (runStart != runStarts[run]) {
2009 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2010 prefix, run, runStarts[run], runStart);
2011 }
2012
2013 if (runLimit != runStarts[run + 1]) {
2014 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2015 prefix, run, runStarts[run + 1], runLimit);
2016 }
2017
2018 if (runCode != testData[run].runCode) {
2019 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2020 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2021 }
2022
2023 run += 1;
2024
2025 /* stop when we've seen all the runs we expect to see */
2026 if (run >= nRuns) {
2027 break;
2028 }
2029 }
2030
2031 /* Complain if we didn't see then number of runs we expected */
2032 if (run != nRuns) {
2033 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2034 }
2035 }
2036
2037 static void
2038 TestUScriptRunAPI()
2039 {
2040 static const RunTestData testData1[] = {
2041 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2042 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2043 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2044 {"English (", USCRIPT_LATIN},
2045 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2046 {") ", USCRIPT_LATIN},
2047 {"\\u6F22\\u5B75", USCRIPT_HAN},
2048 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2049 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2050 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2051 };
2052
2053 static const RunTestData testData2[] = {
2054 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2055 };
2056
2057 static const struct {
2058 const RunTestData *testData;
2059 int32_t nRuns;
2060 } testDataEntries[] = {
2061 {testData1, LENGTHOF(testData1)},
2062 {testData2, LENGTHOF(testData2)}
2063 };
2064
2065 static const int32_t nTestEntries = LENGTHOF(testDataEntries);
2066 int32_t testEntry;
2067
2068 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2069 UChar testString[1024];
2070 int32_t runStarts[256];
2071 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2072 const RunTestData *testData = testDataEntries[testEntry].testData;
2073
2074 int32_t run, stringLimit;
2075 UScriptRun *scriptRun = NULL;
2076 UErrorCode err;
2077
2078 /*
2079 * Fill in the test string and the runStarts array.
2080 */
2081 stringLimit = 0;
2082 for (run = 0; run < nTestRuns; run += 1) {
2083 runStarts[run] = stringLimit;
2084 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2085 /*stringLimit -= 1;*/
2086 }
2087
2088 /* The limit of the last run */
2089 runStarts[nTestRuns] = stringLimit;
2090
2091 /*
2092 * Make sure that calling uscript_OpenRun with a NULL text pointer
2093 * and a non-zero text length returns the correct error.
2094 */
2095 err = U_ZERO_ERROR;
2096 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2097
2098 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2099 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2100 }
2101
2102 if (scriptRun != NULL) {
2103 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2104 uscript_closeRun(scriptRun);
2105 }
2106
2107 /*
2108 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2109 * and a zero text length returns the correct error.
2110 */
2111 err = U_ZERO_ERROR;
2112 scriptRun = uscript_openRun(testString, 0, &err);
2113
2114 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2115 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2116 }
2117
2118 if (scriptRun != NULL) {
2119 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2120 uscript_closeRun(scriptRun);
2121 }
2122
2123 /*
2124 * Make sure that calling uscript_openRun with a NULL text pointer
2125 * and a zero text length doesn't return an error.
2126 */
2127 err = U_ZERO_ERROR;
2128 scriptRun = uscript_openRun(NULL, 0, &err);
2129
2130 if (U_FAILURE(err)) {
2131 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2132 }
2133
2134 /* Make sure that the empty iterator doesn't find any runs */
2135 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2136 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2137 }
2138
2139 /*
2140 * Make sure that calling uscript_setRunText with a NULL text pointer
2141 * and a non-zero text length returns the correct error.
2142 */
2143 err = U_ZERO_ERROR;
2144 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2145
2146 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2147 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2148 }
2149
2150 /*
2151 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2152 * and a zero text length returns the correct error.
2153 */
2154 err = U_ZERO_ERROR;
2155 uscript_setRunText(scriptRun, testString, 0, &err);
2156
2157 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2158 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2159 }
2160
2161 /*
2162 * Now call uscript_setRunText on the empty iterator
2163 * and make sure that it works.
2164 */
2165 err = U_ZERO_ERROR;
2166 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2167
2168 if (U_FAILURE(err)) {
2169 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2170 } else {
2171 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2172 }
2173
2174 uscript_closeRun(scriptRun);
2175
2176 /*
2177 * Now open an interator over the testString
2178 * using uscript_openRun and make sure that it works
2179 */
2180 scriptRun = uscript_openRun(testString, stringLimit, &err);
2181
2182 if (U_FAILURE(err)) {
2183 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2184 } else {
2185 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2186 }
2187
2188 /* Now reset the iterator, and make sure
2189 * that it still works.
2190 */
2191 uscript_resetRun(scriptRun);
2192
2193 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2194
2195 /* Close the iterator */
2196 uscript_closeRun(scriptRun);
2197 }
2198 }
2199
2200 /* test additional, non-core properties */
2201 static void
2202 TestAdditionalProperties() {
2203 /* test data for u_charAge() */
2204 static const struct {
2205 UChar32 c;
2206 UVersionInfo version;
2207 } charAges[]={
2208 {0x41, { 1, 1, 0, 0 }},
2209 {0xffff, { 1, 1, 0, 0 }},
2210 {0x20ab, { 2, 0, 0, 0 }},
2211 {0x2fffe, { 2, 0, 0, 0 }},
2212 {0x20ac, { 2, 1, 0, 0 }},
2213 {0xfb1d, { 3, 0, 0, 0 }},
2214 {0x3f4, { 3, 1, 0, 0 }},
2215 {0x10300, { 3, 1, 0, 0 }},
2216 {0x220, { 3, 2, 0, 0 }},
2217 {0xff60, { 3, 2, 0, 0 }}
2218 };
2219
2220 /* test data for u_hasBinaryProperty() */
2221 static const int32_t
2222 props[][3]={ /* code point, property, value */
2223 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2224 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2225 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2226
2227 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2228 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2229
2230 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2231 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2232
2233 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2234 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2235
2236 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2237 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2238 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2239 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2240 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2241
2242 { 0x058a, UCHAR_DASH, TRUE },
2243 { 0x007e, UCHAR_DASH, FALSE },
2244
2245 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2246 { 0x3000, UCHAR_DIACRITIC, FALSE },
2247
2248 { 0x0e46, UCHAR_EXTENDER, TRUE },
2249 { 0x0020, UCHAR_EXTENDER, FALSE },
2250
2251 #if !UCONFIG_NO_NORMALIZATION
2252 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2253 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2254 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2255
2256 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2257 { 0x0308, UCHAR_NFD_INERT, FALSE },
2258
2259 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2260 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2261
2262 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2263 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2264 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2265 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2266 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2267 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2268
2269 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2270 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2271
2272 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2273 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2274 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2275 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2276 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2277 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2278 #endif
2279
2280 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2281 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2282 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2283
2284 { 0x30fb, UCHAR_HYPHEN, TRUE },
2285 { 0xfe58, UCHAR_HYPHEN, FALSE },
2286
2287 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2288 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2289 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2290
2291 { 0x2172, UCHAR_ID_START, TRUE },
2292 { 0x007a, UCHAR_ID_START, TRUE },
2293 { 0x0039, UCHAR_ID_START, FALSE },
2294
2295 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2296 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2297 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2298
2299 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2300 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2301
2302 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2303 { 0x0345, UCHAR_LOWERCASE, TRUE },
2304 { 0x0030, UCHAR_LOWERCASE, FALSE },
2305
2306 { 0x1d7a9, UCHAR_MATH, TRUE },
2307 { 0x2135, UCHAR_MATH, TRUE },
2308 { 0x0062, UCHAR_MATH, FALSE },
2309
2310 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2311 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2312 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2313
2314 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2315 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2316 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2317
2318 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2319 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2320
2321 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2322 { 0x2162, UCHAR_UPPERCASE, TRUE },
2323 { 0x0345, UCHAR_UPPERCASE, FALSE },
2324
2325 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2326 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2327 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2328
2329 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2330 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2331 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2332
2333 { 0x16ee, UCHAR_XID_START, TRUE },
2334 { 0x23456, UCHAR_XID_START, TRUE },
2335 { 0x1d1aa, UCHAR_XID_START, FALSE },
2336
2337 /*
2338 * Version break:
2339 * The following properties are only supported starting with the
2340 * Unicode version indicated in the second field.
2341 */
2342 { -1, 0x320, 0 },
2343
2344 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2345 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2346 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2347
2348 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2349 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2350 { 0xe0041, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2351 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2352
2353 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2354 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2355 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2356 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2357
2358 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2359 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2360 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2361 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2362
2363 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2364 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2365
2366 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2367 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2368
2369 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2370 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2371
2372 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2373 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2374
2375 { 0x2e9b, UCHAR_RADICAL, TRUE },
2376 { 0x4e00, UCHAR_RADICAL, FALSE },
2377
2378 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2379 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2380
2381 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2382 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2383
2384 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2385
2386 { 0x002e, UCHAR_S_TERM, TRUE },
2387 { 0x0061, UCHAR_S_TERM, FALSE },
2388
2389 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2390 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2391 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2392 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2393
2394 /* enum/integer type properties */
2395
2396 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2397 /* test default Bidi classes for unassigned code points */
2398 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2399 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2400 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2401 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2402 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2403 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2404 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2405 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2406 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2407 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2408 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2409
2410 { 0x0605, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2411 { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2412 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2413 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2414 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2415 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2416 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2417 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2418
2419 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2420 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2421 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2422 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2423 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2424 { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2425 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2426 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2427 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2428 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2429 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2430
2431 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2432 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2433
2434 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2435 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2436 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2437 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2438 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2439 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2440 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2441 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2442 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2443
2444 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2445 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2446 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2447 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2448 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2449 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2450 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2451 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2452 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2453 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2454 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2455 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2456 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2457 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2458 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2459 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2460 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2461
2462 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2463 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2464 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2465
2466 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2467 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2468 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2469 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2470 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2471
2472 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2473 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2474 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2475 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2476 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2477 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2478 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2479 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2480
2481 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2482 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2483 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2484 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2485 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2486 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2487 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2488 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2489 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2490 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2491 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2492 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2493 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2494 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2495 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2496 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2497
2498 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2499
2500 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2501
2502 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2503 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2504 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2505 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2506 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2507 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2508 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2509
2510 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2511 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2512 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2513 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2514
2515 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2516 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2517 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2518 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2519 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2520 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2521
2522 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2523 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2524 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2525 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2526
2527 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2528 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2529 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2530 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2531 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2532 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2533 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2534
2535 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2536 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2537 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2538 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2539
2540 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2541 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2542 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2543 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2544
2545 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2546 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2547 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2548 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2549 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2550
2551 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2552
2553 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2554
2555 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2556 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2557 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2558
2559 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2560 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2561 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2562 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2563 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2564
2565 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2566 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2567 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2568
2569 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2570 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2571 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2572 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2573
2574 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2575 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2576 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2577 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2578 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2579 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2580
2581 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2582 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2583 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2584 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2585
2586 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2587 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2588 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2589 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2590
2591 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2592 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2593 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2594 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2595
2596 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2597
2598 /* unassigned code points in new default Bidi R blocks */
2599 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2600 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2601
2602 /* test some script codes >127 */
2603 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2604 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2605 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2606
2607 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2608
2609 /* value changed in Unicode 6.0 */
2610 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2611
2612 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2613
2614 /* unassigned code points in new/changed default Bidi AL blocks */
2615 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2616 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2617
2618 /* undefined UProperty values */
2619 { 0x61, 0x4a7, 0 },
2620 { 0x234bc, 0x15ed, 0 }
2621 };
2622
2623 UVersionInfo version;
2624 UChar32 c;
2625 int32_t i, result, uVersion;
2626 UProperty which;
2627
2628 /* what is our Unicode version? */
2629 u_getUnicodeVersion(version);
2630 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2631
2632 u_charAge(0x20, version);
2633 if(version[0]==0) {
2634 /* no additional properties available */
2635 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2636 return;
2637 }
2638
2639 /* test u_charAge() */
2640 for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) {
2641 u_charAge(charAges[i].c, version);
2642 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2643 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2644 charAges[i].c,
2645 version[0], version[1], version[2], version[3],
2646 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2647 }
2648 }
2649
2650 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2651 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2652 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2653 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2654 u_getIntPropertyMinValue(0x2345)!=0
2655 ) {
2656 log_err("error: u_getIntPropertyMinValue() wrong\n");
2657 }
2658 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2659 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2660 }
2661 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2662 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2663 }
2664 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2665 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2666 }
2667 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2668 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2669 }
2670 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2671 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2672 }
2673 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2674 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2675 }
2676 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2677 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2678 }
2679 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2680 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2681 }
2682 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2683 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2684 }
2685 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2686 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2687 }
2688 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2689 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2690 }
2691 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2692 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2693 }
2694 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2695 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2696 }
2697 /*JB#2410*/
2698 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2699 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2700 }
2701 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2702 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2703 }
2704 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2705 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2706 }
2707 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2708 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2709 }
2710 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2711 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2712 }
2713
2714 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2715 for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) {
2716 const char *whichName;
2717
2718 if(props[i][0]<0) {
2719 /* Unicode version break */
2720 if(uVersion<props[i][1]) {
2721 break; /* do not test properties that are not yet supported */
2722 } else {
2723 continue; /* skip this row */
2724 }
2725 }
2726
2727 c=(UChar32)props[i][0];
2728 which=(UProperty)props[i][1];
2729 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2730
2731 if(which<UCHAR_INT_START) {
2732 result=u_hasBinaryProperty(c, which);
2733 if(result!=props[i][2]) {
2734 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2735 c, whichName, result, i);
2736 }
2737 }
2738
2739 result=u_getIntPropertyValue(c, which);
2740 if(result!=props[i][2]) {
2741 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2742 c, whichName, result, props[i][2], i);
2743 }
2744
2745 /* test separate functions, too */
2746 switch((UProperty)props[i][1]) {
2747 case UCHAR_ALPHABETIC:
2748 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2749 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2750 props[i][0], result, i);
2751 }
2752 break;
2753 case UCHAR_LOWERCASE:
2754 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2755 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2756 props[i][0], result, i);
2757 }
2758 break;
2759 case UCHAR_UPPERCASE:
2760 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2761 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2762 props[i][0], result, i);
2763 }
2764 break;
2765 case UCHAR_WHITE_SPACE:
2766 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2767 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2768 props[i][0], result, i);
2769 }
2770 break;
2771 default:
2772 break;
2773 }
2774 }
2775 }
2776
2777 static void
2778 TestNumericProperties(void) {
2779 /* see UnicodeData.txt, DerivedNumericValues.txt */
2780 static const struct {
2781 UChar32 c;
2782 int32_t type;
2783 double numValue;
2784 } values[]={
2785 { 0x12456, U_NT_NUMERIC, -1. },
2786 { 0x12457, U_NT_NUMERIC, -1. },
2787 { 0x0F33, U_NT_NUMERIC, -1./2. },
2788 { 0x0C66, U_NT_DECIMAL, 0 },
2789 { 0x96f6, U_NT_NUMERIC, 0 },
2790 { 0xa833, U_NT_NUMERIC, 1./16. },
2791 { 0x2152, U_NT_NUMERIC, 1./10. },
2792 { 0x2151, U_NT_NUMERIC, 1./9. },
2793 { 0x1245f, U_NT_NUMERIC, 1./8. },
2794 { 0x2150, U_NT_NUMERIC, 1./7. },
2795 { 0x2159, U_NT_NUMERIC, 1./6. },
2796 { 0x09f6, U_NT_NUMERIC, 3./16. },
2797 { 0x2155, U_NT_NUMERIC, 1./5. },
2798 { 0x00BD, U_NT_NUMERIC, 1./2. },
2799 { 0x0031, U_NT_DECIMAL, 1. },
2800 { 0x4e00, U_NT_NUMERIC, 1. },
2801 { 0x58f1, U_NT_NUMERIC, 1. },
2802 { 0x10320, U_NT_NUMERIC, 1. },
2803 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2804 { 0x00B2, U_NT_DIGIT, 2. },
2805 { 0x5f10, U_NT_NUMERIC, 2. },
2806 { 0x1813, U_NT_DECIMAL, 3. },
2807 { 0x5f0e, U_NT_NUMERIC, 3. },
2808 { 0x2173, U_NT_NUMERIC, 4. },
2809 { 0x8086, U_NT_NUMERIC, 4. },
2810 { 0x278E, U_NT_DIGIT, 5. },
2811 { 0x1D7F2, U_NT_DECIMAL, 6. },
2812 { 0x247A, U_NT_DIGIT, 7. },
2813 { 0x7396, U_NT_NUMERIC, 9. },
2814 { 0x1372, U_NT_NUMERIC, 10. },
2815 { 0x216B, U_NT_NUMERIC, 12. },
2816 { 0x16EE, U_NT_NUMERIC, 17. },
2817 { 0x249A, U_NT_NUMERIC, 19. },
2818 { 0x303A, U_NT_NUMERIC, 30. },
2819 { 0x5345, U_NT_NUMERIC, 30. },
2820 { 0x32B2, U_NT_NUMERIC, 37. },
2821 { 0x1375, U_NT_NUMERIC, 40. },
2822 { 0x10323, U_NT_NUMERIC, 50. },
2823 { 0x0BF1, U_NT_NUMERIC, 100. },
2824 { 0x964c, U_NT_NUMERIC, 100. },
2825 { 0x217E, U_NT_NUMERIC, 500. },
2826 { 0x2180, U_NT_NUMERIC, 1000. },
2827 { 0x4edf, U_NT_NUMERIC, 1000. },
2828 { 0x2181, U_NT_NUMERIC, 5000. },
2829 { 0x137C, U_NT_NUMERIC, 10000. },
2830 { 0x4e07, U_NT_NUMERIC, 10000. },
2831 { 0x12432, U_NT_NUMERIC, 216000. },
2832 { 0x12433, U_NT_NUMERIC, 432000. },
2833 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2834 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2835 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2836 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2837 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2838 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2839 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2840 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2841 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2842 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2843 };
2844
2845 double nv;
2846 UChar32 c;
2847 int32_t i, type;
2848
2849 for(i=0; i<LENGTHOF(values); ++i) {
2850 c=values[i].c;
2851 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2852 nv=u_getNumericValue(c);
2853
2854 if(type!=values[i].type) {
2855 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2856 }
2857 if(0.000001 <= fabs(nv - values[i].numValue)) {
2858 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2859 }
2860 }
2861 }
2862
2863 /**
2864 * Test the property names and property value names API.
2865 */
2866 static void
2867 TestPropertyNames(void) {
2868 int32_t p, v, choice=0, rev;
2869 UBool atLeastSomething = FALSE;
2870
2871 for (p=0; ; ++p) {
2872 UProperty propEnum = (UProperty)p;
2873 UBool sawProp = FALSE;
2874 if(p > 10 && !atLeastSomething) {
2875 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2876 return;
2877 }
2878
2879 for (choice=0; ; ++choice) {
2880 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
2881 if (name) {
2882 if (!sawProp)
2883 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
2884 log_verbose("%d=\"%s\"", choice, name);
2885 sawProp = TRUE;
2886 atLeastSomething = TRUE;
2887
2888 /* test reverse mapping */
2889 rev = u_getPropertyEnum(name);
2890 if (rev != p) {
2891 log_err("Property round-trip failure: %d -> %s -> %d\n",
2892 p, name, rev);
2893 }
2894 }
2895 if (!name && choice>0) break;
2896 }
2897 if (sawProp) {
2898 /* looks like a valid property; check the values */
2899 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2900 int32_t max = 0;
2901 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2902 max = 255;
2903 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2904 /* it's far too slow to iterate all the way up to
2905 the real max, U_GC_P_MASK */
2906 max = U_GC_NL_MASK;
2907 } else if (p == UCHAR_BLOCK) {
2908 /* UBlockCodes, unlike other values, start at 1 */
2909 max = 1;
2910 }
2911 log_verbose("\n");
2912 for (v=-1; ; ++v) {
2913 UBool sawValue = FALSE;
2914 for (choice=0; ; ++choice) {
2915 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
2916 if (vname) {
2917 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2918 log_verbose("%d=\"%s\"", choice, vname);
2919 sawValue = TRUE;
2920
2921 /* test reverse mapping */
2922 rev = u_getPropertyValueEnum(propEnum, vname);
2923 if (rev != v) {
2924 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2925 pname, v, vname, rev);
2926 }
2927 }
2928 if (!vname && choice>0) break;
2929 }
2930 if (sawValue) {
2931 log_verbose("\n");
2932 }
2933 if (!sawValue && v>=max) break;
2934 }
2935 }
2936 if (!sawProp) {
2937 if (p>=UCHAR_STRING_LIMIT) {
2938 break;
2939 } else if (p>=UCHAR_DOUBLE_LIMIT) {
2940 p = UCHAR_STRING_START - 1;
2941 } else if (p>=UCHAR_MASK_LIMIT) {
2942 p = UCHAR_DOUBLE_START - 1;
2943 } else if (p>=UCHAR_INT_LIMIT) {
2944 p = UCHAR_MASK_START - 1;
2945 } else if (p>=UCHAR_BINARY_LIMIT) {
2946 p = UCHAR_INT_START - 1;
2947 }
2948 }
2949 }
2950 }
2951
2952 /**
2953 * Test the property values API. See JB#2410.
2954 */
2955 static void
2956 TestPropertyValues(void) {
2957 int32_t i, p, min, max;
2958 UErrorCode ec;
2959
2960 /* Min should be 0 for everything. */
2961 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
2962 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
2963 UProperty propEnum = (UProperty)p;
2964 min = u_getIntPropertyMinValue(propEnum);
2965 if (min != 0) {
2966 if (p == UCHAR_BLOCK) {
2967 /* This is okay...for now. See JB#2487.
2968 TODO Update this for JB#2487. */
2969 } else {
2970 const char* name;
2971 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2972 if (name == NULL)
2973 name = "<ERROR>";
2974 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
2975 name, min);
2976 }
2977 }
2978 }
2979
2980 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
2981 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
2982 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
2983 }
2984
2985 /* Max should be -1 for invalid properties. */
2986 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
2987 if (max != -1) {
2988 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
2989 max);
2990 }
2991
2992 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
2993 for (i=0; i<2; ++i) {
2994 int32_t script;
2995 const char* desc;
2996 ec = U_ZERO_ERROR;
2997 switch (i) {
2998 case 0:
2999 script = uscript_getScript(-1, &ec);
3000 desc = "uscript_getScript(-1)";
3001 break;
3002 case 1:
3003 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3004 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3005 break;
3006 default:
3007 log_err("Internal test error. Too many scripts\n");
3008 return;
3009 }
3010 /* We don't explicitly test ec. It should be U_FAILURE but it
3011 isn't documented as such. */
3012 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3013 log_err("FAIL: %s = %d, exp. 0\n",
3014 desc, script);
3015 }
3016 }
3017 }
3018
3019 /* various tests for consistency of UCD data and API behavior */
3020 static void
3021 TestConsistency() {
3022 char buffer[300];
3023 USet *set1, *set2, *set3, *set4;
3024 UErrorCode errorCode;
3025
3026 UChar32 start, end;
3027 int32_t i, length;
3028
3029 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3030 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3031 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3032 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3033 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3034
3035 U_STRING_DECL(mathBlocksPattern,
3036 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3037 1+32+46+46+45+43+1+1); /* +1 for NUL */
3038 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3039 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3040 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3041 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3042
3043 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3044 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3045 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3046 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3047 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3048
3049 U_STRING_INIT(mathBlocksPattern,
3050 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3051 1+32+46+46+45+43+1+1); /* +1 for NUL */
3052 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3053 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3054 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3055 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3056
3057 /*
3058 * It used to be that UCD.html and its precursors said
3059 * "Those dashes used to mark connections between pieces of words,
3060 * plus the Katakana middle dot."
3061 *
3062 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3063 * but not from Hyphen.
3064 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3065 * Therefore, do not show errors when testing the Hyphen property.
3066 */
3067 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3068 "known to the UTC and not considered errors.\n");
3069
3070 errorCode=U_ZERO_ERROR;
3071 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3072 set2=uset_openPattern(dashPattern, 8, &errorCode);
3073 if(U_SUCCESS(errorCode)) {
3074 /* remove the Katakana middle dot(s) from set1 */
3075 uset_remove(set1, 0x30fb);
3076 uset_remove(set1, 0xff65); /* halfwidth variant */
3077 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3078 } else {
3079 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3080 }
3081
3082 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3083 set3=uset_openPattern(formatPattern, 6, &errorCode);
3084 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3085 if(U_SUCCESS(errorCode)) {
3086 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3087 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3088 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3089 } else {
3090 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3091 }
3092
3093 uset_close(set1);
3094 uset_close(set2);
3095 uset_close(set3);
3096 uset_close(set4);
3097
3098 /*
3099 * Check that each lowercase character has "small" in its name
3100 * and not "capital".
3101 * There are some such characters, some of which seem odd.
3102 * Use the verbose flag to see these notices.
3103 */
3104 errorCode=U_ZERO_ERROR;
3105 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3106 if(U_SUCCESS(errorCode)) {
3107 for(i=0;; ++i) {
3108 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3109 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3110 break; /* done */
3111 }
3112 if(U_FAILURE(errorCode)) {
3113 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3114 i, u_errorName(errorCode));
3115 break;
3116 }
3117 if(length!=0) {
3118 break; /* done with code points, got a string or -1 */
3119 }
3120
3121 while(start<=end) {
3122 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3123 if(U_FAILURE(errorCode)) {
3124 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3125 errorCode=U_ZERO_ERROR;
3126 }
3127 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3128 strstr(buffer, "SMALL CAPITAL")==NULL
3129 ) {
3130 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3131 }
3132 ++start;
3133 }
3134 }
3135 } else {
3136 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3137 }
3138 uset_close(set1);
3139
3140 /* verify that all assigned characters in Math blocks are exactly Math characters */
3141 errorCode=U_ZERO_ERROR;
3142 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3143 set2=uset_openPattern(mathPattern, 8, &errorCode);
3144 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3145 if(U_SUCCESS(errorCode)) {
3146 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3147 uset_complement(set3); /* assigned characters */
3148 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3149 compareUSets(set1, set2,
3150 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3151 TRUE);
3152 } else {
3153 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3154 }
3155 uset_close(set1);
3156 uset_close(set2);
3157 uset_close(set3);
3158
3159 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3160 errorCode=U_ZERO_ERROR;
3161 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3162 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3163 if(U_SUCCESS(errorCode)) {
3164 compareUSets(set1, set2,
3165 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3166 TRUE);
3167 } else {
3168 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3169 }
3170 uset_close(set1);
3171 uset_close(set2);
3172 }
3173
3174 /*
3175 * Starting with ICU4C 3.4, the core Unicode properties files
3176 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
3177 * are hardcoded in the common DLL and therefore not included
3178 * in the data package any more.
3179 * Test requiring these files are disabled so that
3180 * we need not jump through hoops (like adding snapshots of these files
3181 * to testdata).
3182 * See Jitterbug 4497.
3183 */
3184 #define HARDCODED_DATA_4497 1
3185
3186 /* API coverage for ucase.c */
3187 static void TestUCase() {
3188 #if !HARDCODED_DATA_4497
3189 UDataMemory *pData;
3190 UCaseProps *csp;
3191 const UCaseProps *ccsp;
3192 UErrorCode errorCode;
3193
3194 /* coverage for ucase_openBinary() */
3195 errorCode=U_ZERO_ERROR;
3196 pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode);
3197 if(U_FAILURE(errorCode)) {
3198 log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3199 u_errorName(errorCode));
3200 return;
3201 }
3202
3203 csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3204 if(U_FAILURE(errorCode)) {
3205 log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3206 u_errorName(errorCode));
3207 udata_close(pData);
3208 return;
3209 }
3210
3211 if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */
3212 log_err("ucase_openBinary() does not seem to return working UCaseProps\n");
3213 }
3214
3215 ucase_close(csp);
3216 udata_close(pData);
3217
3218 /* coverage for ucase_getDummy() */
3219 errorCode=U_ZERO_ERROR;
3220 ccsp=ucase_getDummy(&errorCode);
3221 if(ucase_tolower(ccsp, 0x41)!=0x41) {
3222 log_err("ucase_tolower(dummy, A)!=A\n");
3223 }
3224 #endif
3225 }
3226
3227 /* API coverage for ubidi_props.c */
3228 static void TestUBiDiProps() {
3229 #if !HARDCODED_DATA_4497
3230 UDataMemory *pData;
3231 UBiDiProps *bdp;
3232 const UBiDiProps *cbdp;
3233 UErrorCode errorCode;
3234
3235 /* coverage for ubidi_openBinary() */
3236 errorCode=U_ZERO_ERROR;
3237 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
3238 if(U_FAILURE(errorCode)) {
3239 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3240 u_errorName(errorCode));
3241 return;
3242 }
3243
3244 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3245 if(U_FAILURE(errorCode)) {
3246 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3247 u_errorName(errorCode));
3248 udata_close(pData);
3249 return;
3250 }
3251
3252 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
3253 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
3254 }
3255
3256 ubidi_closeProps(bdp);
3257 udata_close(pData);
3258
3259 /* coverage for ubidi_getDummy() */
3260 errorCode=U_ZERO_ERROR;
3261 cbdp=ubidi_getDummy(&errorCode);
3262 if(ubidi_getClass(cbdp, 0x20)!=0) {
3263 log_err("ubidi_getClass(dummy, space)!=0\n");
3264 }
3265 #endif
3266 }
3267
3268 /* test case folding, compare return values with CaseFolding.txt ------------ */
3269
/*
 * Bit flags, one per kind of case folding, used to record which foldings
 * of a character have been tested already. CF_ALL combines all of them.
 */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3277
3278 static void
3279 testFold(UChar32 c, int which,
3280 UChar32 simple, UChar32 turkic,
3281 const UChar *full, int32_t fullLength,
3282 const UChar *turkicFull, int32_t turkicFullLength) {
3283 UChar s[2], t[32];
3284 UChar32 c2;
3285 int32_t length, length2;
3286
3287 UErrorCode errorCode=U_ZERO_ERROR;
3288
3289 length=0;
3290 U16_APPEND_UNSAFE(s, length, c);
3291
3292 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3293 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3294 }
3295 if((which&CF_FULL)!=0) {
3296 length2=u_strFoldCase(t, LENGTHOF(t), s, length, 0, &errorCode);
3297 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3298 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3299 }
3300 }
3301 if((which&CF_TURKIC)!=0) {
3302 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3303 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3304 }
3305
3306 length2=u_strFoldCase(t, LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3307 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3308 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3309 }
3310 }
3311 }
3312
3313 /* test that c case-folds to itself */
3314 static void
3315 testFoldToSelf(UChar32 c, int which) {
3316 UChar s[2];
3317 int32_t length;
3318
3319 length=0;
3320 U16_APPEND_UNSAFE(s, length, c);
3321 testFold(c, which, c, c, s, length, s, length);
3322 }
3323
3324 struct CaseFoldingData {
3325 USet *notSeen;
3326 UChar32 prev, prevSimple;
3327 UChar prevFull[32];
3328 int32_t prevFullLength;
3329 int which;
3330 };
3331 typedef struct CaseFoldingData CaseFoldingData;
3332
3333 static void U_CALLCONV
3334 caseFoldingLineFn(void *context,
3335 char *fields[][2], int32_t fieldCount,
3336 UErrorCode *pErrorCode) {
3337 CaseFoldingData *pData=(CaseFoldingData *)context;
3338 char *end;
3339 UChar full[32];
3340 UChar32 c, prev, simple;
3341 int32_t count;
3342 int which;
3343 char status;
3344
3345 /* get code point */
3346 const char *s=u_skipWhitespace(fields[0][0]);
3347 if(0==strncmp(s, "0000..10FFFF", 12)) {
3348 /*
3349 * Ignore the line
3350 * # @missing: 0000..10FFFF; C; <code point>
3351 * because maps-to-self is already our default, and this line breaks this parser.
3352 */
3353 return;
3354 }
3355 c=(UChar32)strtoul(s, &end, 16);
3356 end=(char *)u_skipWhitespace(end);
3357 if(end<=fields[0][0] || end!=fields[0][1]) {
3358 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3359 *pErrorCode=U_PARSE_ERROR;
3360 return;
3361 }
3362
3363 /* get the status of this mapping */
3364 status=*u_skipWhitespace(fields[1][0]);
3365 if(status!='C' && status!='S' && status!='F' && status!='T') {
3366 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3367 *pErrorCode=U_PARSE_ERROR;
3368 return;
3369 }
3370
3371 /* get the mapping */
3372 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3373 if(U_FAILURE(*pErrorCode)) {
3374 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3375 return;
3376 }
3377
3378 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3379 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3380 simple=c;
3381 }
3382
3383 if(c!=(prev=pData->prev)) {
3384 /*
3385 * Test remaining mappings for the previous code point.
3386 * If a turkic folding was not mentioned, then it should fold the same
3387 * as the regular simple case folding.
3388 */
3389 UChar prevString[2];
3390 int32_t length;
3391
3392 length=0;
3393 U16_APPEND_UNSAFE(prevString, length, prev);
3394 testFold(prev, (~pData->which)&CF_ALL,
3395 prev, pData->prevSimple,
3396 prevString, length,
3397 pData->prevFull, pData->prevFullLength);
3398 pData->prev=pData->prevSimple=c;
3399 length=0;
3400 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3401 pData->prevFullLength=length;
3402 pData->which=0;
3403 }
3404
3405 /*
3406 * Turn the status into a bit set of case foldings to test.
3407 * Remember non-Turkic case foldings as defaults for Turkic mode.
3408 */
3409 switch(status) {
3410 case 'C':
3411 which=CF_SIMPLE|CF_FULL;
3412 pData->prevSimple=simple;
3413 u_memcpy(pData->prevFull, full, count);
3414 pData->prevFullLength=count;
3415 break;
3416 case 'S':
3417 which=CF_SIMPLE;
3418 pData->prevSimple=simple;
3419 break;
3420 case 'F':
3421 which=CF_FULL;
3422 u_memcpy(pData->prevFull, full, count);
3423 pData->prevFullLength=count;
3424 break;
3425 case 'T':
3426 which=CF_TURKIC;
3427 break;
3428 default:
3429 which=0;
3430 break; /* won't happen because of test above */
3431 }
3432
3433 testFold(c, which, simple, simple, full, count, full, count);
3434
3435 /* remember which case foldings of c have been tested */
3436 pData->which|=which;
3437
3438 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3439 uset_remove(pData->notSeen, c);
3440 }
3441
3442 static void
3443 TestCaseFolding() {
3444 CaseFoldingData data={ NULL };
3445 char *fields[3][2];
3446 UErrorCode errorCode;
3447
3448 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3449
3450 errorCode=U_ZERO_ERROR;
3451 /* test BMP & plane 1 - nothing interesting above */
3452 data.notSeen=uset_open(0, 0x1ffff);
3453 data.prevFullLength=1; /* length of full case folding of U+0000 */
3454
3455 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3456 if(U_SUCCESS(errorCode)) {
3457 int32_t i, start, end;
3458
3459 /* add a pseudo-last line to finish testing of the actual last one */
3460 fields[0][0]=lastLine;
3461 fields[0][1]=lastLine+6;
3462 fields[1][0]=lastLine+7;
3463 fields[1][1]=lastLine+9;
3464 fields[2][0]=lastLine+10;
3465 fields[2][1]=lastLine+17;
3466 caseFoldingLineFn(&data, fields, 3, &errorCode);
3467
3468 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3469 for(i=0;
3470 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3471 U_SUCCESS(errorCode);
3472 ++i
3473 ) {
3474 do {
3475 testFoldToSelf(start, CF_ALL);
3476 } while(++start<=end);
3477 }
3478 }
3479
3480 uset_close(data.notSeen);
3481 }