]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/cucdtst.c
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
CommitLineData
b75a7d8f
A
1/********************************************************************
2 * COPYRIGHT:
73c04bcf 3 * Copyright (c) 1997-2006, International Business Machines Corporation and
b75a7d8f
A
4 * others. All Rights Reserved.
5 ********************************************************************/
6/********************************************************************************
7*
8* File CUCDTST.C
9*
10* Modification History:
11* Name Description
12* Madhu Katragadda Ported for C API, added tests for string functions
13*********************************************************************************
14*/
15
16#include <string.h>
17#include <math.h>
18#include <stdlib.h>
19
20#include "unicode/utypes.h"
21#include "unicode/uchar.h"
22#include "unicode/putil.h"
23#include "unicode/ustring.h"
24#include "unicode/uloc.h"
25
26#include "cintltst.h"
374ca955 27#include "putilimp.h"
b75a7d8f 28#include "uparse.h"
374ca955 29#include "ucase.h"
73c04bcf 30#include "ubidi_props.h"
b75a7d8f 31#include "uprops.h"
374ca955 32#include "uset_imp.h"
b75a7d8f
A
33#include "usc_impl.h"
34#include "unormimp.h"
374ca955
A
35#include "udatamem.h" /* for testing ucase_openBinary() */
36#include "cucdapi.h"
b75a7d8f 37
374ca955 38#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
b75a7d8f
A
39
40/* prototypes --------------------------------------------------------------- */
41
/* character classification and case mapping tests */
static void TestUpperLower(void);
static void TestLetterNumber(void);
static void TestMisc(void);
static void TestPOSIX(void);
static void TestControlPrint(void);
static void TestIdentifier(void);

/* UnicodeData.txt, code unit/code point, names and mirroring tests */
static void TestUnicodeData(void);
static void TestCodeUnit(void);
static void TestCodePoint(void);
static void TestCharLength(void);
static void TestCharNames(void);
static void TestMirroring(void);

/* TestUScriptCodeAPI() is not declared here: its prototype comes from cucdapi.h */
static void TestUScriptRunAPI(void);

/* property API tests */
static void TestAdditionalProperties(void);
static void TestNumericProperties(void);
static void TestPropertyNames(void);
static void TestPropertyValues(void);
static void TestConsistency(void);
static void TestUCase(void);
static void TestUBiDiProps(void);
static void TestCaseFolding(void);

/* internal helper functions */
static int32_t MakeProp(char *str);
static int32_t MakeDir(char *str);
68
73c04bcf
A
69/* helpers ------------------------------------------------------------------ */
70
71static void
72parseUCDFile(const char *filename,
73 char *fields[][2], int32_t fieldCount,
74 UParseLineFn *lineFn, void *context,
75 UErrorCode *pErrorCode) {
76 char path[256];
77 char backupPath[256];
78
79 if(U_FAILURE(*pErrorCode)) {
80 return;
81 }
82
83 /* Look inside ICU_DATA first */
84 strcpy(path, u_getDataDirectory());
85 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
86 strcat(path, filename);
87
88 /* As a fallback, try to guess where the source data was located
89 * at the time ICU was built, and look there.
90 */
91 strcpy(backupPath, ctest_dataSrcDir());
92 strcat(backupPath, U_FILE_SEP_STRING);
93 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
94 strcat(backupPath, filename);
95
96 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
97 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
98 *pErrorCode=U_ZERO_ERROR;
99 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
100 }
101 if(U_FAILURE(*pErrorCode)) {
102 log_err("error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
103 }
104}
105
b75a7d8f
A
106/* test data ---------------------------------------------------------------- */
107
108static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;
109static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
110static const int32_t tagValues[] =
111 {
112 /* Mn */ U_NON_SPACING_MARK,
113 /* Mc */ U_COMBINING_SPACING_MARK,
114 /* Me */ U_ENCLOSING_MARK,
115 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
116 /* Nl */ U_LETTER_NUMBER,
117 /* No */ U_OTHER_NUMBER,
118 /* Zs */ U_SPACE_SEPARATOR,
119 /* Zl */ U_LINE_SEPARATOR,
120 /* Zp */ U_PARAGRAPH_SEPARATOR,
121 /* Cc */ U_CONTROL_CHAR,
122 /* Cf */ U_FORMAT_CHAR,
123 /* Cs */ U_SURROGATE,
124 /* Co */ U_PRIVATE_USE_CHAR,
125 /* Cn */ U_UNASSIGNED,
126 /* Lu */ U_UPPERCASE_LETTER,
127 /* Ll */ U_LOWERCASE_LETTER,
128 /* Lt */ U_TITLECASE_LETTER,
129 /* Lm */ U_MODIFIER_LETTER,
130 /* Lo */ U_OTHER_LETTER,
131 /* Pc */ U_CONNECTOR_PUNCTUATION,
132 /* Pd */ U_DASH_PUNCTUATION,
133 /* Ps */ U_START_PUNCTUATION,
134 /* Pe */ U_END_PUNCTUATION,
135 /* Po */ U_OTHER_PUNCTUATION,
136 /* Sm */ U_MATH_SYMBOL,
137 /* Sc */ U_CURRENCY_SYMBOL,
138 /* Sk */ U_MODIFIER_SYMBOL,
139 /* So */ U_OTHER_SYMBOL,
140 /* Pi */ U_INITIAL_PUNCTUATION,
141 /* Pf */ U_FINAL_PUNCTUATION
142 };
143
144static const char dirStrings[][5] = {
145 "L",
146 "R",
147 "EN",
148 "ES",
149 "ET",
150 "AN",
151 "CS",
152 "B",
153 "S",
154 "WS",
155 "ON",
156 "LRE",
157 "LRO",
158 "AL",
159 "RLE",
160 "RLO",
161 "PDF",
162 "NSM",
163 "BN"
164};
165
166void addUnicodeTest(TestNode** root);
167
168void addUnicodeTest(TestNode** root)
169{
170 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
171 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
172 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
173 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
174 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
175 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
176 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
177 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
178 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
179 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
180 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
181 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
182 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
183 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
184 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
185 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
186 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
187 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
188 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
374ca955 189 addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase");
73c04bcf
A
190 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps");
191 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
b75a7d8f
A
192}
193
194/*==================================================== */
195/* test u_toupper() and u_tolower() */
196/*==================================================== */
197static void TestUpperLower()
198{
199 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
200 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
201 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
202 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
203 int32_t i;
204
205 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
206 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
207
208/*
209Checks LetterLike Symbols which were previously a source of confusion
210[Bertrand A. D. 02/04/98]
211*/
212 for (i=0x2100;i<0x2138;i++)
213 {
73c04bcf
A
214 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
215 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
b75a7d8f
A
216 {
217 if (i != (int)u_tolower(i)) /* itself */
218 log_err("Failed case conversion with itself: U+%04x\n", i);
219 if (i != (int)u_toupper(i))
220 log_err("Failed case conversion with itself: U+%04x\n", i);
221 }
222 }
223
224 for(i=0; i < u_strlen(upper); i++){
225 if(u_tolower(upper[i]) != lower[i]){
226 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
227 }
228 }
229
230 log_verbose("testing upper lower\n");
231 for (i = 0; i < 21; i++) {
232
233 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
234 {
235 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
236 }
237 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
238 {
239 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
240 }
241 else if (upperTest[i] != u_tolower(lowerTest[i]))
242 {
243 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
244 }
245 else if (lowerTest[i] != u_toupper(upperTest[i]))
246 {
247 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
248 }
249 else if (upperTest[i] != u_tolower(upperTest[i]))
250 {
251 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
252 }
253 else if (lowerTest[i] != u_toupper(lowerTest[i]))
254 {
255 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
256 }
257 }
258 log_verbose("done testing upper lower\n");
259
260 log_verbose("testing u_istitle\n");
261 {
262 static const UChar expected[] = {
263 0x1F88,
264 0x1F89,
265 0x1F8A,
266 0x1F8B,
267 0x1F8C,
268 0x1F8D,
269 0x1F8E,
270 0x1F8F,
271 0x1F88,
272 0x1F89,
273 0x1F8A,
274 0x1F8B,
275 0x1F8C,
276 0x1F8D,
277 0x1F8E,
278 0x1F8F,
279 0x1F98,
280 0x1F99,
281 0x1F9A,
282 0x1F9B,
283 0x1F9C,
284 0x1F9D,
285 0x1F9E,
286 0x1F9F,
287 0x1F98,
288 0x1F99,
289 0x1F9A,
290 0x1F9B,
291 0x1F9C,
292 0x1F9D,
293 0x1F9E,
294 0x1F9F,
295 0x1FA8,
296 0x1FA9,
297 0x1FAA,
298 0x1FAB,
299 0x1FAC,
300 0x1FAD,
301 0x1FAE,
302 0x1FAF,
303 0x1FA8,
304 0x1FA9,
305 0x1FAA,
306 0x1FAB,
307 0x1FAC,
308 0x1FAD,
309 0x1FAE,
310 0x1FAF,
311 0x1FBC,
312 0x1FBC,
313 0x1FCC,
314 0x1FCC,
315 0x1FFC,
316 0x1FFC,
317 };
318 int32_t num = sizeof(expected)/sizeof(expected[0]);
319 for(i=0; i<num; i++){
320 if(!u_istitle(expected[i])){
321 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
322 }
323 }
324
325 }
326}
327
73c04bcf 328/* compare two sets and verify that their difference or intersection is empty */
b75a7d8f
A
329static UBool
330showADiffB(const USet *a, const USet *b,
331 const char *a_name, const char *b_name,
332 UBool expect, UBool diffIsError) {
73c04bcf 333 USet *aa;
b75a7d8f 334 int32_t i, start, end, length;
b75a7d8f
A
335 UErrorCode errorCode;
336
73c04bcf
A
337 /*
338 * expect:
339 * TRUE -> a-b should be empty, that is, b should contain all of a
340 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
341 */
342 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
343 return TRUE;
344 }
345
346 /* clone a to aa because a is const */
347 aa=uset_open(1, 0);
348 if(aa==NULL) {
349 /* unusual problem - out of memory? */
350 return FALSE;
351 }
352 uset_addAll(aa, a);
353
354 /* compute the set in question */
355 if(expect) {
356 /* a-b */
357 uset_removeAll(aa, b);
358 } else {
359 /* a&b */
360 uset_retainAll(aa, b);
361 }
362
363 /* aa is not empty because of the initial tests above; show its contents */
b75a7d8f 364 errorCode=U_ZERO_ERROR;
b75a7d8f
A
365 i=0;
366 for(;;) {
73c04bcf 367 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
b75a7d8f 368 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
73c04bcf 369 break; /* done */
b75a7d8f
A
370 }
371 if(U_FAILURE(errorCode)) {
73c04bcf 372 log_err("error comparing %s with %s at difference item %d: %s\n",
b75a7d8f 373 a_name, b_name, i, u_errorName(errorCode));
73c04bcf 374 break;
b75a7d8f
A
375 }
376 if(length!=0) {
73c04bcf 377 break; /* done with code points, got a string or -1 */
b75a7d8f
A
378 }
379
73c04bcf
A
380 if(diffIsError) {
381 if(expect) {
382 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
383 } else {
384 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
385 }
386 } else {
387 if(expect) {
388 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
389 } else {
390 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
b75a7d8f
A
391 }
392 }
393
394 ++i;
395 }
73c04bcf
A
396
397 uset_close(aa);
398 return FALSE;
b75a7d8f
A
399}
400
401static UBool
402showAMinusB(const USet *a, const USet *b,
403 const char *a_name, const char *b_name,
404 UBool diffIsError) {
405 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
406}
407
408static UBool
409showAIntersectB(const USet *a, const USet *b,
410 const char *a_name, const char *b_name,
411 UBool diffIsError) {
412 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
413}
414
415static UBool
416compareUSets(const USet *a, const USet *b,
417 const char *a_name, const char *b_name,
418 UBool diffIsError) {
73c04bcf
A
419 /*
420 * Use an arithmetic & not a logical && so that both branches
421 * are always taken and all differences are shown.
422 */
b75a7d8f 423 return
73c04bcf 424 showAMinusB(a, b, a_name, b_name, diffIsError) &
b75a7d8f
A
425 showAMinusB(b, a, b_name, a_name, diffIsError);
426}
427
428/* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
429static void TestLetterNumber()
430{
431 UChar i = 0x0000;
432
433 log_verbose("Testing for isalpha\n");
434 for (i = 0x0041; i < 0x005B; i++) {
435 if (!u_isalpha(i))
436 {
437 log_err("Failed isLetter test at %.4X\n", i);
438 }
439 }
440 for (i = 0x0660; i < 0x066A; i++) {
441 if (u_isalpha(i))
442 {
443 log_err("Failed isLetter test with numbers at %.4X\n", i);
444 }
445 }
446
447 log_verbose("Testing for isdigit\n");
448 for (i = 0x0660; i < 0x066A; i++) {
449 if (!u_isdigit(i))
450 {
451 log_verbose("Failed isNumber test at %.4X\n", i);
452 }
453 }
454
455 log_verbose("Testing for isalnum\n");
456 for (i = 0x0041; i < 0x005B; i++) {
457 if (!u_isalnum(i))
458 {
459 log_err("Failed isAlNum test at %.4X\n", i);
460 }
461 }
462 for (i = 0x0660; i < 0x066A; i++) {
463 if (!u_isalnum(i))
464 {
465 log_err("Failed isAlNum test at %.4X\n", i);
466 }
467 }
468
469 {
470 /*
471 * The following checks work only starting from Unicode 4.0.
472 * Check the version number here.
473 */
374ca955 474 static UVersionInfo u401={ 4, 0, 1, 0 };
b75a7d8f
A
475 UVersionInfo version;
476 u_getUnicodeVersion(version);
374ca955 477 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
b75a7d8f
A
478 return;
479 }
480 }
481
482 {
483 /*
484 * Sanity check:
485 * Verify that exactly the digit characters have decimal digit values.
486 * This assumption is used in the implementation of u_digit()
487 * (which checks nt=de)
488 * compared with the parallel java.lang.Character.digit()
489 * (which checks Nd).
490 *
491 * This was not true in Unicode 3.2 and earlier.
374ca955
A
492 * Unicode 4.0 fixed discrepancies.
493 * Unicode 4.0.1 re-introduced problems in this area due to an
494 * unintentionally incomplete last-minute change.
b75a7d8f
A
495 */
496 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
497 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
498
499 USet *digits, *decimalValues;
500 UErrorCode errorCode;
501
502 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
503 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
504 errorCode=U_ZERO_ERROR;
505 digits=uset_openPattern(digitsPattern, 6, &errorCode);
506 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
507
508 if(U_SUCCESS(errorCode)) {
509 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
510 }
511
512 uset_close(digits);
513 uset_close(decimalValues);
514 }
515}
516
517/* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
518static void TestMisc()
519{
520 static const UChar sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
521 static const UChar sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
522 static const UChar sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6b };
523 static const UChar sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
524 static const UChar sampleBase[] = {0x0061, 0x0031, 0x03d2};
525 static const UChar sampleNonBase[] = {0x002B, 0x0020, 0x203B};
526/* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
527 static const UChar sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
528 static const UChar sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
529 static const UChar sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
530 static const UChar sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f};
531
532
533 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
534
535 uint32_t mask;
536
537 int32_t i;
538 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
539 UVersionInfo realVersion;
540
541 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
542
543 log_verbose("Testing for isspace and nonspaces\n");
544 for (i = 0; i < 5; i++) {
545 if (!(u_isspace(sampleSpaces[i])) ||
546 (u_isspace(sampleNonSpaces[i])))
547 {
548 log_err("Space char test error : %d or %d \n", (int32_t)sampleSpaces[i], (int32_t)sampleNonSpaces[i]);
549 }
550 if (!(u_isJavaSpaceChar(sampleSpaces[i])) ||
551 (u_isJavaSpaceChar(sampleNonSpaces[i])))
552 {
553 log_err("u_isJavaSpaceChar() test error : %d or %d \n", (int32_t)sampleSpaces[i], (int32_t)sampleNonSpaces[i]);
554 }
555 }
556
557 log_verbose("Testing for isspace and nonspaces\n");
558 for (i = 0; i < 5; i++) {
559 if (!(u_isWhitespace(sampleWhiteSpaces[i])) ||
560 (u_isWhitespace(sampleNonWhiteSpaces[i])))
561 {
562 log_err("White Space char test error : %lx or %lx \n", sampleWhiteSpaces[i], sampleNonWhiteSpaces[i]);
563 }
564 }
565
566 log_verbose("Testing for isdefined\n");
567 for (i = 0; i < 3; i++) {
568 if ((u_isdefined(sampleUndefined[i])) ||
569 !(u_isdefined(sampleDefined[i])))
570 {
571 log_err("Undefined char test error : U+%04x or U+%04x\n", (int32_t)sampleUndefined[i], (int32_t)sampleDefined[i]);
572 }
573 }
574
575 log_verbose("Testing for isbase\n");
576 for (i = 0; i < 3; i++) {
577 if ((u_isbase(sampleNonBase[i])) ||
578 !(u_isbase(sampleBase[i])))
579 {
580 log_err("Non-baseform char test error : U+%04x or U+%04x",(int32_t)sampleNonBase[i], (int32_t)sampleBase[i]);
581 }
582 }
583
584 log_verbose("Testing for isdigit \n");
585 for (i = 0; i < 4; i++) {
586 if ((u_isdigit(sampleDigits[i]) &&
587 (u_charDigitValue(sampleDigits[i])!= sampleDigitValues[i])) ||
588 (u_isdigit(sampleNonDigits[i]))) {
589 log_err("Digit char test error : %lx or %lx\n", sampleDigits[i], sampleNonDigits[i]);
590 }
591 }
592
593 /* Tests the ICU version #*/
594 u_getVersion(realVersion);
595 u_versionToString(realVersion, icuVersion);
374ca955 596 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
b75a7d8f
A
597 {
598 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
599 }
600#if defined(ICU_VERSION)
601 /* test only happens where we have configure.in with VERSION - sanity check. */
602 if(strcmp(U_ICU_VERSION, ICU_VERSION))
603 {
604 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
605 }
606#endif
607
608 /* test U_GC_... */
609 if(
610 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
611 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
612 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
613 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
614 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
615 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
616 ) {
617 log_err("error: U_GET_GC_MASK does not work properly\n");
618 }
619
620 mask=0;
621 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
622
623 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
624 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
625 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
626 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
627 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
628
629 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
630 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
631 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
632
633 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
634 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
635 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
636
637 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
638 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
639 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
640
641 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
642 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
643 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
644 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
645
646 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
647 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
648 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
649 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
650 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
651
652 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
653 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
654 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
655 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
656
657 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
658 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
659
660 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
661 log_err("error: problems with U_GC_XX_MASK constants\n");
662 }
663
664 mask=0;
665 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
666 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
667 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
668 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
669 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
670 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
671 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
672
673 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
674 log_err("error: problems with U_GC_Y_MASK constants\n");
675 }
676 {
677 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
678 for(i=0; i<10; i++){
679 if(digit[i]!=u_forDigit(i,10)){
680 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
681 }
682 }
683 }
684
685 /* test u_digit() */
686 {
687 static const struct {
688 UChar32 c;
689 int8_t radix, value;
690 } data[]={
691 /* base 16 */
692 { 0x0031, 16, 1 },
693 { 0x0038, 16, 8 },
694 { 0x0043, 16, 12 },
695 { 0x0066, 16, 15 },
696 { 0x00e4, 16, -1 },
697 { 0x0662, 16, 2 },
698 { 0x06f5, 16, 5 },
699 { 0xff13, 16, 3 },
700 { 0xff41, 16, 10 },
701
702 /* base 8 */
703 { 0x0031, 8, 1 },
704 { 0x0038, 8, -1 },
705 { 0x0043, 8, -1 },
706 { 0x0066, 8, -1 },
707 { 0x00e4, 8, -1 },
708 { 0x0662, 8, 2 },
709 { 0x06f5, 8, 5 },
710 { 0xff13, 8, 3 },
711 { 0xff41, 8, -1 },
712
713 /* base 36 */
714 { 0x5a, 36, 35 },
715 { 0x7a, 36, 35 },
716 { 0xff3a, 36, 35 },
717 { 0xff5a, 36, 35 },
718
719 /* wrong radix values */
720 { 0x0031, 1, -1 },
721 { 0xff3a, 37, -1 }
722 };
723
724 for(i=0; i<LENGTHOF(data); ++i) {
725 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
726 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
727 data[i].c,
728 data[i].radix,
729 u_digit(data[i].c, data[i].radix),
730 data[i].value);
731 }
732 }
733 }
734}
735
736/* test C/POSIX-style functions --------------------------------------------- */
737
738/* bit flags */
739#define ISAL 1
740#define ISLO 2
741#define ISUP 4
742
743#define ISDI 8
744#define ISXD 0x10
745
746#define ISAN 0x20
747
748#define ISPU 0x40
749#define ISGR 0x80
750#define ISPR 0x100
751
752#define ISSP 0x200
753#define ISBL 0x400
754#define ISCN 0x800
755
756/* C/POSIX-style functions, in the same order as the bit flags */
374ca955 757typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
b75a7d8f
A
758
759static const struct {
760 IsPOSIXClass *fn;
761 const char *name;
762} posixClasses[]={
763 { u_isalpha, "isalpha" },
764 { u_islower, "islower" },
765 { u_isupper, "isupper" },
766 { u_isdigit, "isdigit" },
767 { u_isxdigit, "isxdigit" },
768 { u_isalnum, "isalnum" },
769 { u_ispunct, "ispunct" },
770 { u_isgraph, "isgraph" },
771 { u_isprint, "isprint" },
772 { u_isspace, "isspace" },
773 { u_isblank, "isblank" },
774 { u_iscntrl, "iscntrl" }
775};
776
777static const struct {
778 UChar32 c;
779 uint32_t posixResults;
780} posixData[]={
781 { 0x0008, ISCN }, /* backspace */
782 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
783 { 0x000a, ISSP| ISCN }, /* LF */
784 { 0x000c, ISSP| ISCN }, /* FF */
785 { 0x000d, ISSP| ISCN }, /* CR */
786 { 0x0020, ISPR|ISSP|ISBL }, /* space */
787 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
788 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
789 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
790 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
791 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
792 { 0x007b, ISPU|ISGR|ISPR }, /* { */
793 { 0x0085, ISSP| ISCN }, /* NEL */
794 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
795 { 0x00a4, ISGR|ISPR }, /* currency sign */
796 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
797 { 0x0300, ISGR|ISPR }, /* combining grave */
798 { 0x0600, ISCN }, /* arabic number sign */
799 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
800 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
801 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
802 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
803 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
374ca955
A
804 { 0x200b, ISCN }, /* ZWSP */
805 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
b75a7d8f
A
806 { 0x200e, ISCN }, /* LRM */
807 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
808 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
809 { 0x20ac, ISGR|ISPR }, /* Euro */
810 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
811 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
812 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
813 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
814 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
815};
816
817static void
818TestPOSIX() {
819 uint32_t mask;
820 int32_t cl, i;
821 UBool expect;
822
823 mask=1;
824 for(cl=0; cl<12; ++cl) {
825 for(i=0; i<LENGTHOF(posixData); ++i) {
826 expect=(UBool)((posixData[i].posixResults&mask)!=0);
827 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
828 log_err("u_%s(U+%04x)=%s is wrong\n",
829 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
830 }
831 }
832 mask<<=1;
833 }
834}
835
836/* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
837static void TestControlPrint()
838{
839 const UChar sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
840 const UChar sampleNonControl[] = {0x61, 0x0031, 0x00e2};
841 const UChar samplePrintable[] = {0x0042, 0x005f, 0x2014};
842 const UChar sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
843 UChar32 c;
844 int i;
845
846 log_verbose("Testing for iscontrol\n");
847 for (i = 0; i < LENGTHOF(sampleControl); i++) {
848 if (!u_iscntrl(sampleControl[i]))
849 {
850 log_err("Control char test error : U+%04x should be control but is not\n", (int32_t)sampleControl[i]);
851 }
852 }
853
854 log_verbose("Testing for !iscontrol\n");
855 for (i = 0; i < LENGTHOF(sampleNonControl); i++) {
856 if (u_iscntrl(sampleNonControl[i]))
857 {
858 log_err("Control char test error : U+%04x should not be control but is\n", (int32_t)sampleNonControl[i]);
859 }
860 }
861
862 log_verbose("testing for isprintable\n");
863 for (i = 0; i < 3; i++) {
864 if (!u_isprint(samplePrintable[i]))
865 {
866 log_err("Printable char test error : U+%04x should be printable but is not\n", (int32_t)samplePrintable[i]);
867 }
868 if (u_isprint(sampleNonPrintable[i]))
869 {
870 log_err("Printable char test error : U+%04x should not be printable but is\n", (int32_t)sampleNonPrintable[i]);
871 }
872 }
873
874 /* test all ISO 8 controls */
875 for(c=0; c<=0x9f; ++c) {
876 if(c==0x20) {
877 /* skip ASCII graphic characters and continue with DEL */
878 c=0x7f;
879 }
880 if(!u_iscntrl(c)) {
881 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
882 }
883 if(!u_isISOControl(c)) {
884 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
885 }
886 if(u_isprint(c)) {
887 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
888 }
889 }
890
891 /* test all Latin-1 graphic characters */
892 for(c=0x20; c<=0xff; ++c) {
893 if(c==0x7f) {
894 c=0xa0;
895 } else if(c==0xad) {
896 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
897 ++c;
898 }
899 if(!u_isprint(c)) {
900 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
901 }
902 }
903}
904
905/* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
906static void TestIdentifier()
907{
908 const UChar sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
909 const UChar sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
910 const UChar sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
911 const UChar sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
912 const UChar sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
913 const UChar sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
914 const UChar sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
915 const UChar sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
916 const UChar sampleIDIgnore[] = {0x0006, 0x0010, 0x206b};
917 const UChar sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
918
919 int i;
920
921 log_verbose("Testing sampleJavaID start \n");
922 for (i = 0; i < 3; i++) {
923 if (!(u_isJavaIDStart(sampleJavaIDStart[i])) ||
924 (u_isJavaIDStart(sampleNonJavaIDStart[i])))
925 log_err("Java ID Start char test error : %lx or %lx\n",
926 sampleJavaIDStart[i], sampleNonJavaIDStart[i]);
927 }
928
929 log_verbose("Testing sampleJavaID part \n");
930 for (i = 0; i < 3; i++) {
931 if (!(u_isJavaIDPart(sampleJavaIDPart[i])) ||
932 (u_isJavaIDPart(sampleNonJavaIDPart[i])))
933 log_err("Java ID Part char test error : %lx or %lx\n",
934 sampleJavaIDPart[i], sampleNonJavaIDPart[i]);
935 }
936
937 log_verbose("Testing sampleUnicodeID start \n");
938 for (i = 0; i < 3; i++) {
939 /* T_test_logln_ustr((int32_t)i); */
940 if (!(u_isIDStart(sampleUnicodeIDStart[i])) ||
941 (u_isIDStart(sampleNonUnicodeIDStart[i])))
942 {
943 log_err("Unicode ID Start char test error : %lx or %lx\n", sampleUnicodeIDStart[i],
944 sampleNonUnicodeIDStart[i]);
945 }
946 }
947
948 log_verbose("Testing sample unicode ID part \n");
949 for (i = 2; i < 3; i++) { /* nos *** starts with 2 instead of 0, until clarified */
950 /* T_test_logln_ustr((int32_t)i); */
951 if (!(u_isIDPart(sampleUnicodeIDPart[i])) ||
952 (u_isIDPart(sampleNonUnicodeIDPart[i])))
953 {
954 log_err("Unicode ID Part char test error : %lx or %lx", sampleUnicodeIDPart[i], sampleNonUnicodeIDPart[i]);
955 }
956 }
957
958 log_verbose("Testing sampleId ignore\n");
959 for (i = 0; i < 3; i++) {
960 /*T_test_logln_ustr((int32_t)i); */
961 if (!(u_isIDIgnorable(sampleIDIgnore[i])) ||
962 (u_isIDIgnorable(sampleNonIDIgnore[i])))
963 {
964 log_err("ID ignorable char test error : U+%04x or U+%04x\n", sampleIDIgnore[i], sampleNonIDIgnore[i]);
965 }
966 }
967}
968
969/* for each line of UnicodeData.txt, check some of the properties */
970/*
971 * ### TODO
972 * This test fails incorrectly if the First or Last code point of a repetitive area
973 * is overridden, which is allowed and is encouraged for the PUAs.
974 * Currently, this means that both area First/Last and override lines are
975 * tested against the properties from the API,
976 * and the area boundary will not match and cause an error.
977 *
978 * This function should detect area boundaries and skip them for the test of individual
979 * code points' properties.
980 * Then it should check that the areas contain all the same properties except where overridden.
981 * For this, it would have had to set a flag for which code points were listed explicitly.
982 */
983static void U_CALLCONV
984unicodeDataLineFn(void *context,
985 char *fields[][2], int32_t fieldCount,
986 UErrorCode *pErrorCode)
987{
988 char buffer[100];
989 char *end;
990 uint32_t value;
991 UChar32 c;
992 int32_t i;
993 int8_t type;
994
995 /* get the character code, field 0 */
996 c=strtoul(fields[0][0], &end, 16);
997 if(end<=fields[0][0] || end!=fields[0][1]) {
998 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
999 return;
1000 }
1001 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
1002 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
1003 return;
1004 }
1005
1006 /* get general category, field 2 */
1007 *fields[2][1]=0;
1008 type = (int8_t)tagValues[MakeProp(fields[2][0])];
1009 if(u_charType(c)!=type) {
1010 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
1011 }
1012 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1013 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1014 }
1015
1016 /* get canonical combining class, field 3 */
1017 value=strtoul(fields[3][0], &end, 10);
1018 if(end<=fields[3][0] || end!=fields[3][1]) {
1019 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1020 return;
1021 }
1022 if(value>255) {
1023 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1024 return;
1025 }
1026#if !UCONFIG_NO_NORMALIZATION
1027 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1028 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1029 }
1030#endif
1031
1032 /* get BiDi category, field 4 */
1033 *fields[4][1]=0;
1034 i=MakeDir(fields[4][0]);
1035 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1036 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1037 }
1038
1039 /* get ISO Comment, field 11 */
1040 *fields[11][1]=0;
1041 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1042 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1043 log_err("error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1044 c, u_errorName(*pErrorCode),
1045 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1046 fields[11][0]);
1047 }
1048
1049 /* get uppercase mapping, field 12 */
1050 if(fields[12][0]!=fields[12][1]) {
1051 value=strtoul(fields[12][0], &end, 16);
1052 if(end!=fields[12][1]) {
1053 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1054 return;
1055 }
1056 if((UChar32)value!=u_toupper(c)) {
1057 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1058 }
1059 } else {
1060 /* no case mapping: the API must map the code point to itself */
1061 if(c!=u_toupper(c)) {
1062 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1063 }
1064 }
1065
1066 /* get lowercase mapping, field 13 */
1067 if(fields[13][0]!=fields[13][1]) {
1068 value=strtoul(fields[13][0], &end, 16);
1069 if(end!=fields[13][1]) {
1070 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1071 return;
1072 }
1073 if((UChar32)value!=u_tolower(c)) {
1074 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1075 }
1076 } else {
1077 /* no case mapping: the API must map the code point to itself */
1078 if(c!=u_tolower(c)) {
1079 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1080 }
1081 }
1082
1083 /* get titlecase mapping, field 14 */
1084 if(fields[14][0]!=fields[14][1]) {
1085 value=strtoul(fields[14][0], &end, 16);
1086 if(end!=fields[14][1]) {
1087 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1088 return;
1089 }
1090 if((UChar32)value!=u_totitle(c)) {
1091 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1092 }
1093 } else {
1094 /* no case mapping: the API must map the code point to itself */
1095 if(c!=u_totitle(c)) {
1096 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1097 }
1098 }
1099}
1100
1101static UBool U_CALLCONV
1102enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1103 static const UChar32 test[][2]={
1104 {0x41, U_UPPERCASE_LETTER},
1105 {0x308, U_NON_SPACING_MARK},
1106 {0xfffe, U_GENERAL_OTHER_TYPES},
1107 {0xe0041, U_FORMAT_CHAR},
1108 {0xeffff, U_UNASSIGNED}
1109 };
1110
374ca955 1111 int32_t i, count;
b75a7d8f
A
1112
1113 if(0!=strcmp((const char *)context, "a1")) {
1114 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1115 return FALSE;
1116 }
1117
374ca955 1118 count=LENGTHOF(test);
b75a7d8f
A
1119 for(i=0; i<count; ++i) {
1120 if(start<=test[i][0] && test[i][0]<limit) {
1121 if(type!=(UCharCategory)test[i][1]) {
1122 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1123 start, limit, (long)type, test[i][0], test[i][1]);
1124 }
374ca955 1125 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
b75a7d8f
A
1126 return i==(count-1) ? FALSE : TRUE;
1127 }
1128 }
1129
1130 if(start>test[count-1][0]) {
1131 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1132 start, limit, (long)type);
1133 return FALSE;
1134 }
1135
374ca955
A
1136 return TRUE;
1137}
1138
1139static UBool U_CALLCONV
1140enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1141 /* default Bidi classes for unassigned code points */
1142 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1143 { 0x0590, U_LEFT_TO_RIGHT },
1144 { 0x0600, U_RIGHT_TO_LEFT },
1145 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1146 { 0x0900, U_RIGHT_TO_LEFT },
1147 { 0xFB1D, U_LEFT_TO_RIGHT },
1148 { 0xFB50, U_RIGHT_TO_LEFT },
1149 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1150 { 0xFE70, U_LEFT_TO_RIGHT },
1151 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1152 { 0x10800, U_LEFT_TO_RIGHT },
1153 { 0x11000, U_RIGHT_TO_LEFT },
1154 { 0x110000, U_LEFT_TO_RIGHT }
1155 };
1156
1157 UChar32 c;
1158 int32_t i;
1159 UCharDirection shouldBeDir;
1160
b75a7d8f
A
1161 /*
1162 * LineBreak.txt specifies:
1163 * # - Assigned characters that are not listed explicitly are given the value
1164 * # "AL".
1165 * # - Unassigned characters are given the value "XX".
1166 *
1167 * PUA characters are listed explicitly with "XX".
1168 * Verify that no assigned character has "XX".
1169 */
1170 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1171 c=start;
1172 while(c<limit) {
1173 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1174 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1175 }
1176 ++c;
1177 }
1178 }
1179
1180 /*
1181 * Verify default Bidi classes.
374ca955
A
1182 * For recent Unicode versions, see UCD.html.
1183 *
1184 * For older Unicode versions:
b75a7d8f
A
1185 * See table 3-7 "Bidirectional Character Types" in UAX #9.
1186 * http://www.unicode.org/reports/tr9/
1187 *
1188 * See also DerivedBidiClass.txt for Cn code points!
374ca955
A
1189 *
1190 * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html)
1191 * changed some default values.
1192 * In particular, non-characters and unassigned Default Ignorable Code Points
1193 * change from L to BN.
1194 *
1195 * UCD.html version 4.0.1 does not yet reflect these changes.
b75a7d8f
A
1196 */
1197 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1198 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1199 c=start;
1200 for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) {
1201 if((int32_t)c<defaultBidi[i][0]) {
1202 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
374ca955
A
1203 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1204 shouldBeDir=U_BOUNDARY_NEUTRAL;
1205 } else {
1206 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1207 }
1208
1209 if( u_charDirection(c)!=shouldBeDir ||
1210 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
b75a7d8f
A
1211 ) {
1212 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
374ca955 1213 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
b75a7d8f
A
1214 }
1215 ++c;
1216 }
1217 }
1218 }
1219 }
1220
1221 return TRUE;
1222}
1223
1224/* tests for several properties */
1225static void TestUnicodeData()
1226{
b75a7d8f
A
1227 UVersionInfo expectVersionArray;
1228 UVersionInfo versionArray;
1229 char *fields[15][2];
1230 UErrorCode errorCode;
1231 UChar32 c;
1232 int8_t type;
1233
b75a7d8f
A
1234 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1235 u_getUnicodeVersion(versionArray);
1236 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1237 {
1238 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1239 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1240 }
1241
1242#if defined(ICU_UNICODE_VERSION)
1243 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1244 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1245 {
1246 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1247 }
1248#endif
1249
1250 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1251 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1252 }
1253
1254 errorCode=U_ZERO_ERROR;
73c04bcf 1255 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, NULL, &errorCode);
b75a7d8f 1256 if(U_FAILURE(errorCode)) {
b75a7d8f
A
1257 return; /* if we couldn't parse UnicodeData.txt, we should return */
1258 }
1259
1260 /* sanity check on repeated properties */
1261 for(c=0xfffe; c<=0x10ffff;) {
1262 type=u_charType(c);
1263 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1264 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1265 }
1266 if(type!=U_UNASSIGNED) {
1267 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1268 }
1269 if((c&0xffff)==0xfffe) {
1270 ++c;
1271 } else {
1272 c+=0xffff;
1273 }
1274 }
1275
1276 /* test that PUA is not "unassigned" */
1277 for(c=0xe000; c<=0x10fffd;) {
1278 type=u_charType(c);
1279 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1280 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1281 }
1282 if(type==U_UNASSIGNED) {
1283 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1284 } else if(type!=U_PRIVATE_USE_CHAR) {
1285 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1286 }
1287 if(c==0xf8ff) {
1288 c=0xf0000;
1289 } else if(c==0xffffd) {
1290 c=0x100000;
1291 } else {
1292 ++c;
1293 }
1294 }
1295
1296 /* test u_enumCharTypes() */
1297 u_enumCharTypes(enumTypeRange, "a1");
374ca955
A
1298
1299 /* check default properties */
1300 u_enumCharTypes(enumDefaultsRange, NULL);
b75a7d8f
A
1301}
1302
1303static void TestCodeUnit(){
1304 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1305
1306 int32_t i;
1307
1308 for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){
1309 UChar c=codeunit[i];
1310 if(i<4){
1311 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1312 log_err("ERROR: U+%04x is a single", c);
1313 }
1314
1315 }
1316 if(i >= 4 && i< 8){
1317 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1318 log_err("ERROR: U+%04x is a first surrogate", c);
1319 }
1320 }
1321 if(i >= 8 && i< 12){
1322 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1323 log_err("ERROR: U+%04x is a second surrogate", c);
1324 }
1325 }
1326 }
1327
1328}
1329
1330static void TestCodePoint(){
1331 const UChar32 codePoint[]={
1332 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1333 0xd800,
1334 0xdbff,
1335 0xdc00,
1336 0xdfff,
1337 0xdc04,
1338 0xd821,
1339 /*not a surrogate, valid, isUnicodeChar , not Error*/
1340 0x20ac,
1341 0xd7ff,
1342 0xe000,
1343 0xe123,
1344 0x0061,
1345 0xe065,
1346 0x20402,
1347 0x24506,
1348 0x23456,
1349 0x20402,
1350 0x10402,
1351 0x23456,
1352 /*not a surrogate, not valid, isUnicodeChar, isError */
1353 0x0015,
1354 0x009f,
1355 /*not a surrogate, not valid, not isUnicodeChar, isError */
1356 0xffff,
1357 0xfffe,
1358 };
1359 int32_t i;
1360 for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){
1361 UChar32 c=codePoint[i];
1362 if(i<6){
1363 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1364 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1365 }
1366 if(UTF_IS_VALID(c)){
1367 log_err("ERROR: isValid() failed for U+%04x\n", c);
1368 }
1369 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1370 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1371 }
1372 if(UTF_IS_ERROR(c)){
1373 log_err("ERROR: isError() failed for U+%04x\n", c);
1374 }
1375 }else if(i >=6 && i<18){
1376 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1377 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1378 }
1379 if(!UTF_IS_VALID(c)){
1380 log_err("ERROR: isValid() failed for U+%04x\n", c);
1381 }
1382 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1383 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1384 }
1385 if(UTF_IS_ERROR(c)){
1386 log_err("ERROR: isError() failed for U+%04x\n", c);
1387 }
1388 }else if(i >=18 && i<20){
1389 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1390 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1391 }
1392 if(UTF_IS_VALID(c)){
1393 log_err("ERROR: isValid() failed for U+%04x\n", c);
1394 }
1395 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1396 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1397 }
1398 if(!UTF_IS_ERROR(c)){
1399 log_err("ERROR: isError() failed for U+%04x\n", c);
1400 }
1401 }
1402 else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
1403 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1404 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1405 }
1406 if(UTF_IS_VALID(c)){
1407 log_err("ERROR: isValid() failed for U+%04x\n", c);
1408 }
1409 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1410 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1411 }
1412 if(!UTF_IS_ERROR(c)){
1413 log_err("ERROR: isError() failed for U+%04x\n", c);
1414 }
1415 }
1416 }
1417
374ca955
A
1418 if(
1419 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1420 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1421 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1422 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1423 ) {
1424 log_err("error with U_IS_BMP()\n");
1425 }
1426
1427 if(
1428 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1429 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1430 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1431 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1432 ) {
1433 log_err("error with U_IS_SUPPLEMENTARY()\n");
1434 }
b75a7d8f
A
1435}
1436
1437static void TestCharLength()
1438{
1439 const int32_t codepoint[]={
1440 1, 0x0061,
1441 1, 0xe065,
1442 1, 0x20ac,
1443 2, 0x20402,
1444 2, 0x23456,
1445 2, 0x24506,
1446 2, 0x20402,
1447 2, 0x10402,
1448 1, 0xd7ff,
1449 1, 0xe000
1450 };
1451
1452 int32_t i;
1453 UBool multiple;
1454 for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
1455 UChar32 c=codepoint[i+1];
1456 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1457 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], UTF_CHAR_LENGTH(c));
1458 }
1459 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1460 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1461 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1462 }
1463 }
1464}
1465
1466/*internal functions ----*/
1467static int32_t MakeProp(char* str)
1468{
1469 int32_t result = 0;
1470 char* matchPosition =0;
1471
1472 matchPosition = strstr(tagStrings, str);
1473 if (matchPosition == 0)
1474 {
1475 log_err("unrecognized type letter ");
1476 log_err(str);
1477 }
374ca955
A
1478 else
1479 result = (int32_t)((matchPosition - tagStrings) / 2);
b75a7d8f
A
1480 return result;
1481}
1482
1483static int32_t MakeDir(char* str)
1484{
1485 int32_t pos = 0;
1486 for (pos = 0; pos < 19; pos++) {
1487 if (strcmp(str, dirStrings[pos]) == 0) {
1488 return pos;
1489 }
1490 }
1491 return -1;
1492}
1493
1494/* test u_charName() -------------------------------------------------------- */
1495
/*
 * Sample code points with their expected names:
 * name is compared with U_UNICODE_CHAR_NAME results, oldName with
 * U_UNICODE_10_CHAR_NAME results and extName with U_EXTENDED_CHAR_NAME
 * results. An empty string means that no such name is expected.
 */
static const struct {
    uint32_t code;
    const char *name, *oldName, *extName;
} names[]={
    {0x0061,
     "LATIN SMALL LETTER A",
     "",
     "LATIN SMALL LETTER A"},
    {0x0284,
     "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
     "LATIN SMALL LETTER DOTLESS J BAR HOOK",
     "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" },
    {0x3401,
     "CJK UNIFIED IDEOGRAPH-3401",
     "",
     "CJK UNIFIED IDEOGRAPH-3401" },
    {0x7fed,
     "CJK UNIFIED IDEOGRAPH-7FED",
     "",
     "CJK UNIFIED IDEOGRAPH-7FED" },
    {0xac00,
     "HANGUL SYLLABLE GA",
     "",
     "HANGUL SYLLABLE GA" },
    {0xd7a3,
     "HANGUL SYLLABLE HIH",
     "",
     "HANGUL SYLLABLE HIH" },
    {0xd800,
     "",
     "",
     "<lead surrogate-D800>" },
    {0xdc00,
     "",
     "",
     "<trail surrogate-DC00>" },
    {0xff08,
     "FULLWIDTH LEFT PARENTHESIS",
     "FULLWIDTH OPENING PARENTHESIS",
     "FULLWIDTH LEFT PARENTHESIS" },
    {0xffe5,
     "FULLWIDTH YEN SIGN",
     "",
     "FULLWIDTH YEN SIGN" },
    {0xffff,
     "",
     "",
     "<noncharacter-FFFF>" },
    {0x23456,
     "CJK UNIFIED IDEOGRAPH-23456",
     "",
     "CJK UNIFIED IDEOGRAPH-23456" }
};
1513
1514static UBool
1515enumCharNamesFn(void *context,
1516 UChar32 code, UCharNameChoice nameChoice,
1517 const char *name, int32_t length) {
1518 int32_t *pCount=(int32_t *)context;
1519 int i;
1520
1521 if(length<=0 || length!=(int32_t)strlen(name)) {
1522 /* should not be called with an empty string or invalid length */
1523 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1524 return TRUE;
1525 }
1526
1527 ++*pCount;
1528 for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) {
1529 if(code==(UChar32)names[i].code) {
1530 switch (nameChoice) {
1531 case U_EXTENDED_CHAR_NAME:
1532 if(0!=strcmp(name, names[i].extName)) {
1533 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1534 }
1535 break;
1536 case U_UNICODE_CHAR_NAME:
1537 if(0!=strcmp(name, names[i].name)) {
1538 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1539 }
1540 break;
1541 case U_UNICODE_10_CHAR_NAME:
1542 if(names[i].oldName[0]==0 || 0!=strcmp(name, names[i].oldName)) {
1543 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, names[i].oldName);
1544 }
1545 break;
1546 case U_CHAR_NAME_CHOICE_COUNT:
1547 break;
1548 }
1549 break;
1550 }
1551 }
1552 return TRUE;
1553}
1554
1555struct enumExtCharNamesContext {
1556 uint32_t length;
1557 int32_t last;
1558};
1559
1560static UBool
1561enumExtCharNamesFn(void *context,
1562 UChar32 code, UCharNameChoice nameChoice,
1563 const char *name, int32_t length) {
1564 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1565
1566 if (ecncp->last != (int32_t) code - 1) {
1567 if (ecncp->last < 0) {
1568 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1569 } else {
1570 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1571 }
1572 }
1573 ecncp->last = (int32_t) code;
1574
1575 if (!*name) {
1576 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1577 }
1578
1579 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1580}
1581
1582/**
1583 * This can be made more efficient by moving it into putil.c and having
1584 * it directly access the ebcdic translation tables.
1585 * TODO: If we get this method in putil.c, then delete it from here.
1586 */
1587static UChar
1588u_charToUChar(char c) {
1589 UChar uc;
1590 u_charsToUChars(&c, &uc, 1);
1591 return uc;
1592}
1593
1594static void
1595TestCharNames() {
1596 static char name[80];
1597 UErrorCode errorCode=U_ZERO_ERROR;
1598 struct enumExtCharNamesContext extContext;
1599 int32_t length;
1600 UChar32 c;
1601 int32_t i;
1602
1603 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1604 length=uprv_getMaxCharNameLength();
1605 if(length==0) {
1606 /* no names data available */
1607 return;
1608 }
1609 if(length<83) { /* Unicode 3.2 max char name length */
1610 log_err("uprv_getMaxCharNameLength()=%d is too short");
1611 }
1612 /* ### TODO same tests for max ISO comment length as for max name length */
1613
1614 log_verbose("Testing u_charName()\n");
1615 for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) {
1616 /* modern Unicode character name */
1617 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1618 if(U_FAILURE(errorCode)) {
1619 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1620 return;
1621 }
1622 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1623 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1624 }
1625
1626 /* find the modern name */
1627 if (*names[i].name) {
1628 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1629 if(U_FAILURE(errorCode)) {
1630 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1631 return;
1632 }
1633 if(c!=(UChar32)names[i].code) {
1634 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1635 }
1636 }
1637
1638 /* Unicode 1.0 character name */
1639 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1640 if(U_FAILURE(errorCode)) {
1641 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1642 return;
1643 }
1644 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1645 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1646 }
1647
1648 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1649 if(names[i].oldName[0]!=0 /* && length>0 */) {
1650 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1651 if(U_FAILURE(errorCode)) {
1652 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1653 return;
1654 }
1655 if(c!=(UChar32)names[i].code) {
1656 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1657 }
1658 }
1659 }
1660
1661 /* test u_enumCharNames() */
1662 length=0;
1663 errorCode=U_ZERO_ERROR;
1664 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1665 if(U_FAILURE(errorCode) || length<94140) {
1666 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1667 }
1668
1669 extContext.length = 0;
1670 extContext.last = -1;
1671 errorCode=U_ZERO_ERROR;
1672 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1673 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1674 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1675 }
1676
1677 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1678 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1679 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1680 }
1681
1682 /* Test getCharNameCharacters */
1683 if(!QUICK) {
1684 enum { BUFSIZE = 256 };
1685 UErrorCode ec = U_ZERO_ERROR;
1686 char buf[BUFSIZE];
1687 int32_t maxLength;
1688 UChar32 cp;
1689 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1690 int32_t l1, l2;
1691 UBool map[256];
1692 UBool ok;
1693
1694 USet* set = uset_open(1, 0); /* empty set */
1695 USet* dumb = uset_open(1, 0); /* empty set */
1696
1697 /*
1698 * uprv_getCharNameCharacters() will likely return more lowercase
1699 * letters than actual character names contain because
1700 * it includes all the characters in lowercased names of
1701 * general categories, for the full possible set of extended names.
1702 */
374ca955
A
1703 {
1704 USetAdder sa={
1705 NULL,
1706 uset_add,
1707 uset_addRange,
73c04bcf
A
1708 uset_addString,
1709 NULL /* don't need remove() */
374ca955
A
1710 };
1711 sa.set=set;
1712 uprv_getCharNameCharacters(&sa);
1713 }
b75a7d8f
A
1714
1715 /* build set the dumb (but sure-fire) way */
374ca955 1716 for (i=0; i<256; ++i) {
b75a7d8f 1717 map[i] = FALSE;
374ca955 1718 }
b75a7d8f
A
1719
1720 maxLength=0;
1721 for (cp=0; cp<0x110000; ++cp) {
1722 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1723 buf, BUFSIZE, &ec);
1724 if (U_FAILURE(ec)) {
1725 log_err("FAIL: u_charName failed when it shouldn't\n");
1726 uset_close(set);
1727 uset_close(dumb);
1728 return;
1729 }
1730 if(len>maxLength) {
1731 maxLength=len;
1732 }
1733
1734 for (i=0; i<len; ++i) {
1735 if (!map[(uint8_t) buf[i]]) {
1736 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1737 map[(uint8_t) buf[i]] = TRUE;
1738 }
1739 }
374ca955
A
1740
1741 /* test for leading/trailing whitespace */
1742 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1743 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1744 }
1745 }
1746
1747 if(map[(uint8_t)'\t']) {
1748 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
b75a7d8f
A
1749 }
1750
1751 length=uprv_getMaxCharNameLength();
1752 if(length!=maxLength) {
1753 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1754 length, maxLength);
1755 }
1756
1757 /* compare the sets. Where is my uset_equals?!! */
1758 ok=TRUE;
1759 for(i=0; i<256; ++i) {
1760 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1761 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1762 /* ignore lowercase a-z that are in set but not in dumb */
1763 ok=TRUE;
1764 } else {
1765 ok=FALSE;
1766 break;
1767 }
1768 }
1769 }
1770
1771 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1772 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1773 if (U_FAILURE(ec)) {
1774 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1775 uset_close(set);
1776 uset_close(dumb);
1777 return;
1778 }
1779
1780 if (l1 >= BUFSIZE) {
1781 l1 = BUFSIZE-1;
1782 pat[l1] = 0;
1783 }
1784 if (l2 >= BUFSIZE) {
1785 l2 = BUFSIZE-1;
1786 dumbPat[l2] = 0;
1787 }
1788
1789 if (!ok) {
b75a7d8f 1790 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
374ca955
A
1791 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1792 } else if(VERBOSITY) {
1793 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
b75a7d8f
A
1794 }
1795
1796 uset_close(set);
1797 uset_close(dumb);
1798 }
1799
1800 /* ### TODO: test error cases and other interesting things */
1801}
1802
1803/* test u_isMirrored() and u_charMirror() ----------------------------------- */
1804
1805static void
1806TestMirroring() {
73c04bcf
A
1807 USet *set;
1808 UErrorCode errorCode;
1809
1810 UChar32 start, end, c2, c3;
1811 int32_t i;
1812
1813 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1814
1815 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1816
b75a7d8f
A
1817 log_verbose("Testing u_isMirrored()\n");
1818 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1819 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1820 )
1821 ) {
1822 log_err("u_isMirrored() does not work correctly\n");
1823 }
1824
1825 log_verbose("Testing u_charMirror()\n");
1826 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
73c04bcf 1827 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
b75a7d8f
A
1828 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab
1829 )
1830 ) {
1831 log_err("u_charMirror() does not work correctly\n");
1832 }
73c04bcf
A
1833
1834 /* verify that Bidi_Mirroring_Glyph roundtrips */
1835 errorCode=U_ZERO_ERROR;
1836 set=uset_openPattern(mirroredPattern, 17, &errorCode);
1837
1838 if (U_FAILURE(errorCode)) {
1839 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!");
1840 } else {
1841 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
1842 do {
1843 c2=u_charMirror(start);
1844 c3=u_charMirror(c2);
1845 if(c3!=start) {
1846 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
1847 }
1848 } while(++start<=end);
1849 }
1850 }
1851
1852 uset_close(set);
b75a7d8f
A
1853}
1854
1855
1856struct RunTestData
1857{
1858 const char *runText;
1859 UScriptCode runCode;
1860};
1861
1862typedef struct RunTestData RunTestData;
1863
1864static void
1865CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
1866 const char *prefix)
1867{
1868 int32_t run, runStart, runLimit;
1869 UScriptCode runCode;
1870
1871 /* iterate over all the runs */
1872 run = 0;
1873 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
1874 if (runStart != runStarts[run]) {
1875 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
1876 prefix, run, runStarts[run], runStart);
1877 }
1878
1879 if (runLimit != runStarts[run + 1]) {
1880 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
1881 prefix, run, runStarts[run + 1], runLimit);
1882 }
1883
1884 if (runCode != testData[run].runCode) {
1885 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
1886 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
1887 }
1888
1889 run += 1;
1890
1891 /* stop when we've seen all the runs we expect to see */
1892 if (run >= nRuns) {
1893 break;
1894 }
1895 }
1896
1897 /* Complain if we didn't see then number of runs we expected */
1898 if (run != nRuns) {
1899 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
1900 }
1901}
1902
1903static void
1904TestUScriptRunAPI()
1905{
374ca955 1906 static const RunTestData testData1[] = {
b75a7d8f
A
1907 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
1908 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
1909 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
1910 {"English (", USCRIPT_LATIN},
1911 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
1912 {") ", USCRIPT_LATIN},
1913 {"\\u6F22\\u5B75", USCRIPT_HAN},
1914 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
1915 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
1916 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
1917 };
374ca955
A
1918
1919 static const RunTestData testData2[] = {
1920 {"((((((((((abc))))))))))", USCRIPT_LATIN}
1921 };
1922
1923 static const struct {
1924 const RunTestData *testData;
1925 int32_t nRuns;
1926 } testDataEntries[] = {
1927 {testData1, LENGTHOF(testData1)},
1928 {testData2, LENGTHOF(testData2)}
1929 };
1930
1931 static const int32_t nTestEntries = LENGTHOF(testDataEntries);
1932 int32_t testEntry;
1933
1934 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
1935 UChar testString[1024];
1936 int32_t runStarts[256];
1937 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
1938 const RunTestData *testData = testDataEntries[testEntry].testData;
1939
1940 int32_t run, stringLimit;
1941 UScriptRun *scriptRun = NULL;
1942 UErrorCode err;
1943
1944 /*
1945 * Fill in the test string and the runStarts array.
1946 */
1947 stringLimit = 0;
1948 for (run = 0; run < nTestRuns; run += 1) {
1949 runStarts[run] = stringLimit;
1950 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
1951 /*stringLimit -= 1;*/
1952 }
1953
1954 /* The limit of the last run */
1955 runStarts[nTestRuns] = stringLimit;
1956
1957 /*
1958 * Make sure that calling uscript_OpenRun with a NULL text pointer
1959 * and a non-zero text length returns the correct error.
1960 */
1961 err = U_ZERO_ERROR;
1962 scriptRun = uscript_openRun(NULL, stringLimit, &err);
1963
1964 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1965 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1966 }
1967
1968 if (scriptRun != NULL) {
1969 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
1970 uscript_closeRun(scriptRun);
1971 }
1972
1973 /*
1974 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
1975 * and a zero text length returns the correct error.
1976 */
1977 err = U_ZERO_ERROR;
1978 scriptRun = uscript_openRun(testString, 0, &err);
1979
1980 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1981 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1982 }
1983
1984 if (scriptRun != NULL) {
1985 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
1986 uscript_closeRun(scriptRun);
1987 }
1988
1989 /*
1990 * Make sure that calling uscript_openRun with a NULL text pointer
1991 * and a zero text length doesn't return an error.
1992 */
1993 err = U_ZERO_ERROR;
1994 scriptRun = uscript_openRun(NULL, 0, &err);
1995
1996 if (U_FAILURE(err)) {
1997 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
1998 }
1999
2000 /* Make sure that the empty iterator doesn't find any runs */
2001 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2002 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2003 }
2004
2005 /*
2006 * Make sure that calling uscript_setRunText with a NULL text pointer
2007 * and a non-zero text length returns the correct error.
2008 */
2009 err = U_ZERO_ERROR;
2010 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2011
2012 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2013 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2014 }
2015
2016 /*
2017 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2018 * and a zero text length returns the correct error.
2019 */
2020 err = U_ZERO_ERROR;
2021 uscript_setRunText(scriptRun, testString, 0, &err);
2022
2023 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2024 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2025 }
2026
2027 /*
2028 * Now call uscript_setRunText on the empty iterator
2029 * and make sure that it works.
2030 */
2031 err = U_ZERO_ERROR;
2032 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2033
2034 if (U_FAILURE(err)) {
2035 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2036 } else {
2037 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2038 }
2039
b75a7d8f 2040 uscript_closeRun(scriptRun);
374ca955
A
2041
2042 /*
2043 * Now open an interator over the testString
2044 * using uscript_openRun and make sure that it works
2045 */
2046 scriptRun = uscript_openRun(testString, stringLimit, &err);
2047
2048 if (U_FAILURE(err)) {
2049 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2050 } else {
2051 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2052 }
2053
2054 /* Now reset the iterator, and make sure
2055 * that it still works.
2056 */
2057 uscript_resetRun(scriptRun);
2058
2059 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2060
2061 /* Close the iterator */
b75a7d8f
A
2062 uscript_closeRun(scriptRun);
2063 }
b75a7d8f
A
2064}
2065
2066/* test additional, non-core properties */
2067static void
2068TestAdditionalProperties() {
2069 /* test data for u_charAge() */
2070 static const struct {
2071 UChar32 c;
2072 UVersionInfo version;
2073 } charAges[]={
2074 {0x41, { 1, 1, 0, 0 }},
2075 {0xffff, { 1, 1, 0, 0 }},
2076 {0x20ab, { 2, 0, 0, 0 }},
2077 {0x2fffe, { 2, 0, 0, 0 }},
2078 {0x20ac, { 2, 1, 0, 0 }},
2079 {0xfb1d, { 3, 0, 0, 0 }},
2080 {0x3f4, { 3, 1, 0, 0 }},
2081 {0x10300, { 3, 1, 0, 0 }},
2082 {0x220, { 3, 2, 0, 0 }},
2083 {0xff60, { 3, 2, 0, 0 }}
2084 };
2085
2086 /* test data for u_hasBinaryProperty() */
2087 static int32_t
2088 props[][3]={ /* code point, property, value */
2089 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2090 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2091 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2092
2093 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2094 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2095
2096 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2097 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2098
2099 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2100 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2101
2102 { 0x058a, UCHAR_DASH, TRUE },
2103 { 0x007e, UCHAR_DASH, FALSE },
2104
2105 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2106 { 0x3000, UCHAR_DIACRITIC, FALSE },
2107
2108 { 0x0e46, UCHAR_EXTENDER, TRUE },
2109 { 0x0020, UCHAR_EXTENDER, FALSE },
2110
2111#if !UCONFIG_NO_NORMALIZATION
2112 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2113 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2114 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
374ca955
A
2115
2116 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2117 { 0x0308, UCHAR_NFD_INERT, FALSE },
2118
2119 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2120 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2121
2122 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2123 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2124 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2125 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2126 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2127 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2128
2129 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2130 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2131
2132 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2133 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2134 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2135 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2136 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2137 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
b75a7d8f
A
2138#endif
2139
2140 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2141 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2142 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2143
2144 { 0x30fb, UCHAR_HYPHEN, TRUE },
2145 { 0xfe58, UCHAR_HYPHEN, FALSE },
2146
2147 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2148 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2149 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2150
2151 { 0x2172, UCHAR_ID_START, TRUE },
2152 { 0x007a, UCHAR_ID_START, TRUE },
2153 { 0x0039, UCHAR_ID_START, FALSE },
2154
2155 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2156 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2157 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2158
2159 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2160 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2161
2162 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2163 { 0x0345, UCHAR_LOWERCASE, TRUE },
2164 { 0x0030, UCHAR_LOWERCASE, FALSE },
2165
2166 { 0x1d7a9, UCHAR_MATH, TRUE },
2167 { 0x2135, UCHAR_MATH, TRUE },
2168 { 0x0062, UCHAR_MATH, FALSE },
2169
2170 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2171 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2172 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2173
2174 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2175 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2176 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2177
2178 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2179 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2180
2181 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2182 { 0x2162, UCHAR_UPPERCASE, TRUE },
2183 { 0x0345, UCHAR_UPPERCASE, FALSE },
2184
2185 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2186 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2187 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2188
2189 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2190 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2191 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2192
2193 { 0x16ee, UCHAR_XID_START, TRUE },
2194 { 0x23456, UCHAR_XID_START, TRUE },
2195 { 0x1d1aa, UCHAR_XID_START, FALSE },
2196
2197 /*
2198 * Version break:
2199 * The following properties are only supported starting with the
2200 * Unicode version indicated in the second field.
2201 */
374ca955 2202 { -1, 0x320, 0 },
b75a7d8f
A
2203
2204 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2205 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2206 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2207
2208 { 0x0341, UCHAR_DEPRECATED, TRUE },
2209 { 0xe0041, UCHAR_DEPRECATED, FALSE },
2210
2211 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2212 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2213 { 0xff9f, UCHAR_GRAPHEME_BASE, TRUE }, /* changed from Unicode 3.2 to 4 */
2214
2215 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2216 { 0xff9f, UCHAR_GRAPHEME_EXTEND, FALSE }, /* changed from Unicode 3.2 to 4 */
2217 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2218
2219 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2220 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2221
2222 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2223 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2224
2225 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2226 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2227
2228 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2229 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2230
2231 { 0x2e9b, UCHAR_RADICAL, TRUE },
2232 { 0x4e00, UCHAR_RADICAL, FALSE },
2233
2234 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2235 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2236
2237 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2238 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2239
73c04bcf 2240 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
374ca955
A
2241
2242 { 0x002e, UCHAR_S_TERM, TRUE },
2243 { 0x0061, UCHAR_S_TERM, FALSE },
2244
2245 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2246 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2247 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2248 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2249
b75a7d8f
A
2250 /* enum/integer type properties */
2251
2252 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2253 /* test default Bidi classes for unassigned code points */
2254 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
73c04bcf 2255 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
b75a7d8f 2256 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
73c04bcf
A
2257 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2258 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
b75a7d8f
A
2259 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2260 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2261 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2262 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2263 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2264 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2265
2266 { 0x0606, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2267 { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2268 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2269 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2270 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2271 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2272 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2273 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2274
2275 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2276 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2277 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2278 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
374ca955 2279 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
b75a7d8f
A
2280 { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2281 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2282 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
374ca955 2283 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
b75a7d8f 2284 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
374ca955 2285 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
b75a7d8f
A
2286
2287 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2288 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2289
2290 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2291 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2292 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2293 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2294 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2295 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2296 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2297 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2298 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2299
2300 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2301 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2302 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2303 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2304 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2305 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2306 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2307 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2308 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2309 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2310 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2311 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2312 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2313 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2314 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2315 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2316 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2317
2318 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2319 { 0xd7d7, UCHAR_GENERAL_CATEGORY, 0 },
2320
2321 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2322 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2323 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2324 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2325 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2326 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_HAMZA_ON_HEH_GOAL },
2327
2328 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2329 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2330 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2331 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2332 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2333 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2334 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2335 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2336
2337 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2338 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2339 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2340 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2341 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2342 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2343 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
b75a7d8f
A
2344 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2345 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2346 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2347 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2348 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2349 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2350 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2351 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2352 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2353
2354 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2355
2356 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2357
2358 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2359 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2360 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2361 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2362
2363 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2364 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2365 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2366 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2367
2368 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2369 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2370 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2371 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2372
2373 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2374 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2375 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2376 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2377 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2378 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2379
2380 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2381 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2382 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2383 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2384
2385 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2386 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2387 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2388 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2389 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2390
2391 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2392
73c04bcf
A
2393 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2394
2395 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2396 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2397 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2398
2399 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2400 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2401 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2402 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2403 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2404
2405 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2406 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2407 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2408
2409 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2410 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2411 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2412 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2413
2414 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2415 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2416 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2417 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2418 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2419 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2420
2421 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2422 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2423 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2424 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2425
2426 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2427 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2428 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2429 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2430
2431 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2432 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2433 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2434 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2435
b75a7d8f
A
2436 /* undefined UProperty values */
2437 { 0x61, 0x4a7, 0 },
2438 { 0x234bc, 0x15ed, 0 }
2439 };
2440
2441 UVersionInfo version;
2442 UChar32 c;
2443 int32_t i, result, uVersion;
2444 UProperty which;
2445
2446 /* what is our Unicode version? */
2447 u_getUnicodeVersion(version);
374ca955 2448 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
b75a7d8f
A
2449
2450 u_charAge(0x20, version);
2451 if(version[0]==0) {
2452 /* no additional properties available */
2453 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2454 return;
2455 }
2456
2457 /* test u_charAge() */
2458 for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) {
2459 u_charAge(charAges[i].c, version);
2460 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2461 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2462 charAges[i].c,
2463 version[0], version[1], version[2], version[3],
2464 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2465 }
2466 }
2467
2468 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2469 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2470 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2471 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2472 u_getIntPropertyMinValue(0x2345)!=0
2473 ) {
2474 log_err("error: u_getIntPropertyMinValue() wrong\n");
2475 }
73c04bcf
A
2476 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2477 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2478 }
2479 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2480 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2481 }
2482 if( u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1)!=1) {
2483 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2484 }
2485 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2486 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2487 }
2488 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2489 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2490 }
2491 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2492 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2493 }
2494 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2495 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2496 }
2497 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2498 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2499 }
2500 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2501 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2502 }
2503 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2504 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2505 }
2506 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2507 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2508 }
2509 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2510 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2511 }
2512 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2513 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2514 }
2515 /*JB#2410*/
2516 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2517 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2518 }
2519 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2520 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2521 }
2522 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2523 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2524 }
2525 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2526 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2527 }
2528 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2529 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
b75a7d8f
A
2530 }
2531
2532 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2533 for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) {
2534 if(props[i][0]<0) {
2535 /* Unicode version break */
2536 if(uVersion<props[i][1]) {
2537 break; /* do not test properties that are not yet supported */
2538 } else {
2539 continue; /* skip this row */
2540 }
2541 }
2542
2543 c=(UChar32)props[i][0];
2544 which=(UProperty)props[i][1];
2545
2546 if(which<UCHAR_INT_START) {
2547 result=u_hasBinaryProperty(c, which);
2548 if(result!=props[i][2]) {
2549 log_err("error: u_hasBinaryProperty(U+%04lx, %d)=%d is wrong (props[%d])\n",
2550 c, which, result, i);
2551 }
2552 }
2553
2554 result=u_getIntPropertyValue(c, which);
2555 if(result!=props[i][2]) {
2556 log_err("error: u_getIntPropertyValue(U+%04lx, 0x1000+%d)=%d is wrong, should be %d (props[%d])\n",
2557 c, (int32_t)which-0x1000, result, props[i][2], i);
2558 }
2559
2560 /* test separate functions, too */
2561 switch((UProperty)props[i][1]) {
2562 case UCHAR_ALPHABETIC:
2563 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2564 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2565 props[i][0], result, i);
2566 }
2567 break;
2568 case UCHAR_LOWERCASE:
2569 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2570 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2571 props[i][0], result, i);
2572 }
2573 break;
2574 case UCHAR_UPPERCASE:
2575 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2576 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2577 props[i][0], result, i);
2578 }
2579 break;
2580 case UCHAR_WHITE_SPACE:
2581 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2582 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2583 props[i][0], result, i);
2584 }
2585 break;
2586 default:
2587 break;
2588 }
2589 }
2590}
2591
2592static void
2593TestNumericProperties(void) {
2594 /* see UnicodeData.txt, DerivedNumericValues.txt */
2595 static const struct {
2596 UChar32 c;
2597 int32_t type;
2598 double numValue;
2599 } values[]={
2600 { 0x0F33, U_NT_NUMERIC, -1./2. },
2601 { 0x0C66, U_NT_DECIMAL, 0 },
2602 { 0x96f6, U_NT_NUMERIC, 0 },
2603 { 0x2159, U_NT_NUMERIC, 1./6. },
2604 { 0x00BD, U_NT_NUMERIC, 1./2. },
2605 { 0x0031, U_NT_DECIMAL, 1. },
2606 { 0x4e00, U_NT_NUMERIC, 1. },
2607 { 0x58f1, U_NT_NUMERIC, 1. },
2608 { 0x10320, U_NT_NUMERIC, 1. },
2609 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2610 { 0x00B2, U_NT_DIGIT, 2. },
2611 { 0x5f10, U_NT_NUMERIC, 2. },
2612 { 0x1813, U_NT_DECIMAL, 3. },
2613 { 0x5f0e, U_NT_NUMERIC, 3. },
2614 { 0x2173, U_NT_NUMERIC, 4. },
2615 { 0x8086, U_NT_NUMERIC, 4. },
2616 { 0x278E, U_NT_DIGIT, 5. },
2617 { 0x1D7F2, U_NT_DECIMAL, 6. },
2618 { 0x247A, U_NT_DIGIT, 7. },
2619 { 0x7396, U_NT_NUMERIC, 9. },
2620 { 0x1372, U_NT_NUMERIC, 10. },
2621 { 0x216B, U_NT_NUMERIC, 12. },
2622 { 0x16EE, U_NT_NUMERIC, 17. },
2623 { 0x249A, U_NT_NUMERIC, 19. },
2624 { 0x303A, U_NT_NUMERIC, 30. },
2625 { 0x5345, U_NT_NUMERIC, 30. },
2626 { 0x32B2, U_NT_NUMERIC, 37. },
2627 { 0x1375, U_NT_NUMERIC, 40. },
2628 { 0x10323, U_NT_NUMERIC, 50. },
2629 { 0x0BF1, U_NT_NUMERIC, 100. },
2630 { 0x964c, U_NT_NUMERIC, 100. },
2631 { 0x217E, U_NT_NUMERIC, 500. },
2632 { 0x2180, U_NT_NUMERIC, 1000. },
2633 { 0x4edf, U_NT_NUMERIC, 1000. },
2634 { 0x2181, U_NT_NUMERIC, 5000. },
2635 { 0x137C, U_NT_NUMERIC, 10000. },
2636 { 0x4e07, U_NT_NUMERIC, 10000. },
2637 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2638 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2639 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2640 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2641 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2642 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2643 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2644 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE }
2645 };
2646
2647 double nv;
2648 UChar32 c;
2649 int32_t i, type;
2650
2651 for(i=0; i<LENGTHOF(values); ++i) {
2652 c=values[i].c;
2653 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2654 nv=u_getNumericValue(c);
2655
2656 if(type!=values[i].type) {
2657 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2658 }
2659 if(0.000001 <= fabs(nv - values[i].numValue)) {
2660 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2661 }
2662 }
2663}
2664
2665/**
2666 * Test the property names and property value names API.
2667 */
2668static void
2669TestPropertyNames(void) {
2670 int32_t p, v, choice=0, rev;
2671 UBool atLeastSomething = FALSE;
2672
2673 for (p=0; ; ++p) {
2674 UBool sawProp = FALSE;
2675 if(p > 10 && !atLeastSomething) {
2676 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2677 return;
2678 }
2679
2680 for (choice=0; ; ++choice) {
2681 const char* name = u_getPropertyName(p, choice);
2682 if (name) {
2683 if (!sawProp) log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
2684 log_verbose("%d=\"%s\"", choice, name);
2685 sawProp = TRUE;
2686 atLeastSomething = TRUE;
2687
2688 /* test reverse mapping */
2689 rev = u_getPropertyEnum(name);
2690 if (rev != p) {
2691 log_err("Property round-trip failure: %d -> %s -> %d\n",
2692 p, name, rev);
2693 }
2694 }
2695 if (!name && choice>0) break;
2696 }
2697 if (sawProp) {
2698 /* looks like a valid property; check the values */
2699 const char* pname = u_getPropertyName(p, U_LONG_PROPERTY_NAME);
2700 int32_t max = 0;
2701 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2702 max = 255;
2703 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2704 /* it's far too slow to iterate all the way up to
2705 the real max, U_GC_P_MASK */
2706 max = U_GC_NL_MASK;
2707 } else if (p == UCHAR_BLOCK) {
2708 /* UBlockCodes, unlike other values, start at 1 */
2709 max = 1;
2710 }
2711 log_verbose("\n");
2712 for (v=-1; ; ++v) {
2713 UBool sawValue = FALSE;
2714 for (choice=0; ; ++choice) {
2715 const char* vname = u_getPropertyValueName(p, v, choice);
2716 if (vname) {
2717 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2718 log_verbose("%d=\"%s\"", choice, vname);
2719 sawValue = TRUE;
2720
2721 /* test reverse mapping */
2722 rev = u_getPropertyValueEnum(p, vname);
2723 if (rev != v) {
2724 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2725 pname, v, vname, rev);
2726 }
2727 }
2728 if (!vname && choice>0) break;
2729 }
2730 if (sawValue) {
2731 log_verbose("\n");
2732 }
2733 if (!sawValue && v>=max) break;
2734 }
2735 }
2736 if (!sawProp) {
2737 if (p>=UCHAR_STRING_LIMIT) {
2738 break;
2739 } else if (p>=UCHAR_DOUBLE_LIMIT) {
2740 p = UCHAR_STRING_START - 1;
2741 } else if (p>=UCHAR_MASK_LIMIT) {
2742 p = UCHAR_DOUBLE_START - 1;
2743 } else if (p>=UCHAR_INT_LIMIT) {
2744 p = UCHAR_MASK_START - 1;
2745 } else if (p>=UCHAR_BINARY_LIMIT) {
2746 p = UCHAR_INT_START - 1;
2747 }
2748 }
2749 }
2750}
2751
2752/**
2753 * Test the property values API. See JB#2410.
2754 */
2755static void
2756TestPropertyValues(void) {
2757 int32_t i, p, min, max;
2758 UErrorCode ec;
2759
2760 /* Min should be 0 for everything. */
2761 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
2762 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
2763 min = u_getIntPropertyMinValue(p);
2764 if (min != 0) {
2765 if (p == UCHAR_BLOCK) {
2766 /* This is okay...for now. See JB#2487.
2767 TODO Update this for JB#2487. */
2768 } else {
2769 const char* name;
2770 name = u_getPropertyName(p, U_LONG_PROPERTY_NAME);
2771 if (name == NULL) name = "<ERROR>";
2772 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
2773 name, min);
2774 }
2775 }
2776 }
2777
2778 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
2779 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
2780 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
2781 }
2782
2783 /* Max should be -1 for invalid properties. */
2784 max = u_getIntPropertyMaxValue(-1);
2785 if (max != -1) {
2786 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
2787 max);
2788 }
2789
73c04bcf 2790 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
b75a7d8f
A
2791 for (i=0; i<2; ++i) {
2792 int32_t script;
2793 const char* desc;
2794 ec = U_ZERO_ERROR;
2795 switch (i) {
2796 case 0:
2797 script = uscript_getScript(-1, &ec);
2798 desc = "uscript_getScript(-1)";
2799 break;
2800 case 1:
2801 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
2802 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
2803 break;
2804 default:
2805 log_err("Internal test error. Too many scripts\n");
2806 return;
2807 }
2808 /* We don't explicitly test ec. It should be U_FAILURE but it
2809 isn't documented as such. */
73c04bcf 2810 if (script != (int32_t)USCRIPT_INVALID_CODE) {
b75a7d8f
A
2811 log_err("FAIL: %s = %d, exp. 0\n",
2812 desc, script);
2813 }
2814 }
2815}
2816
2817/* add characters from a serialized set to a normal one */
2818static void
2819_setAddSerialized(USet *set, const USerializedSet *sset) {
2820 UChar32 start, end;
2821 int32_t i, count;
2822
2823 count=uset_getSerializedRangeCount(sset);
2824 for(i=0; i<count; ++i) {
2825 uset_getSerializedRange(sset, i, &start, &end);
2826 uset_addRange(set, start, end);
2827 }
2828}
2829
2830/* various tests for consistency of UCD data and API behavior */
2831static void
2832TestConsistency() {
2833#if !UCONFIG_NO_NORMALIZATION
2834 UChar buffer16[300];
2835#endif
2836 char buffer[300];
2837 USet *set1, *set2, *set3, *set4;
2838 UErrorCode errorCode;
2839
2840#if !UCONFIG_NO_NORMALIZATION
2841 USerializedSet sset;
2842#endif
2843 UChar32 start, end;
2844 int32_t i, length;
2845
2846 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
2847 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
2848 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
2849 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
2850 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
2851
73c04bcf
A
2852 U_STRING_DECL(mathBlocksPattern,
2853 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
2854 1+32+46+46+45+43+1+1); /* +1 for NUL */
2855 U_STRING_DECL(mathPattern, "[:Math:]", 8);
2856 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
2857 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
2858 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
2859
b75a7d8f
A
2860 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
2861 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
2862 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
2863 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
2864 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
2865
73c04bcf
A
2866 U_STRING_INIT(mathBlocksPattern,
2867 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
2868 1+32+46+46+45+43+1+1); /* +1 for NUL */
2869 U_STRING_INIT(mathPattern, "[:Math:]", 8);
2870 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
2871 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
2872 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
2873
b75a7d8f
A
2874 /*
2875 * It used to be that UCD.html and its precursors said
2876 * "Those dashes used to mark connections between pieces of words,
2877 * plus the Katakana middle dot."
2878 *
2879 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
2880 * but not from Hyphen.
2881 * UTC 94 (2003mar) decided to leave it that way and to changed UCD.html.
2882 * Therefore, do not show errors when testing the Hyphen property.
2883 */
2884 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
2885 "known to the UTC and not considered errors.\n");
2886
2887 errorCode=U_ZERO_ERROR;
2888 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
2889 set2=uset_openPattern(dashPattern, 8, &errorCode);
2890 if(U_SUCCESS(errorCode)) {
2891 /* remove the Katakana middle dot(s) from set1 */
2892 uset_remove(set1, 0x30fb);
2893 uset_remove(set1, 0xff65); /* halfwidth variant */
2894 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
2895 } else {
2896 log_err("error opening [:Hyphen:] or [:Dash:] - %s\n", u_errorName(errorCode));
2897 }
2898
2899 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
2900 set3=uset_openPattern(formatPattern, 6, &errorCode);
2901 set4=uset_openPattern(alphaPattern, 14, &errorCode);
2902 if(U_SUCCESS(errorCode)) {
2903 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
2904 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
2905 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
2906 } else {
2907 log_err("error opening [:Cf:] or [:Alpbabetic:] - %s\n", u_errorName(errorCode));
2908 }
2909
2910 uset_close(set1);
2911 uset_close(set2);
2912 uset_close(set3);
2913 uset_close(set4);
2914
2915 /*
2916 * Check that each lowercase character has "small" in its name
2917 * and not "capital".
2918 * There are some such characters, some of which seem odd.
2919 * Use the verbose flag to see these notices.
2920 */
2921 errorCode=U_ZERO_ERROR;
2922 set1=uset_openPattern(lowerPattern, 13, &errorCode);
2923 if(U_SUCCESS(errorCode)) {
2924 for(i=0;; ++i) {
2925 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
2926 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
2927 break; /* done */
2928 }
2929 if(U_FAILURE(errorCode)) {
2930 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
2931 i, u_errorName(errorCode));
2932 break;
2933 }
2934 if(length!=0) {
2935 break; /* done with code points, got a string or -1 */
2936 }
2937
2938 while(start<=end) {
2939 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
2940 if(U_FAILURE(errorCode)) {
2941 log_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
2942 errorCode=U_ZERO_ERROR;
2943 continue;
2944 }
2945 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
2946 strstr(buffer, "SMALL CAPITAL")==NULL
2947 ) {
2948 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
2949 }
2950 ++start;
2951 }
2952 }
2953 } else {
2954 log_err("error opening [:Lowercase:] - %s\n", u_errorName(errorCode));
2955 }
2956 uset_close(set1);
2957
2958#if !UCONFIG_NO_NORMALIZATION
2959
2960 /*
2961 * Test for an example that unorm_getCanonStartSet() delivers
2962 * all characters that compose from the input one,
2963 * even in multiple steps.
2964 * For example, the set for "I" (0049) should contain both
2965 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
2966 * In general, the set for the middle such character should be a subset
2967 * of the set for the first.
2968 */
2969 set1=uset_open(1, 0);
2970 set2=uset_open(1, 0);
2971
374ca955
A
2972 if (unorm_getCanonStartSet(0x49, &sset)) {
2973 _setAddSerialized(set1, &sset);
b75a7d8f 2974
374ca955
A
2975 /* enumerate all characters that are plausible to be latin letters */
2976 for(start=0xa0; start<0x2000; ++start) {
2977 if(unorm_getDecomposition(start, FALSE, buffer16, LENGTHOF(buffer16))>1 && buffer16[0]==0x49) {
2978 uset_add(set2, start);
2979 }
b75a7d8f 2980 }
374ca955
A
2981
2982 compareUSets(set1, set2,
2983 "[canon start set of 0049]", "[all c with canon decomp with 0049]",
2984 TRUE);
2985 } else {
2986 log_err("error calling unorm_getCanonStartSet()\n");
b75a7d8f
A
2987 }
2988
b75a7d8f
A
2989 uset_close(set1);
2990 uset_close(set2);
2991
2992#endif
73c04bcf
A
2993
2994 /* verify that all assigned characters in Math blocks are exactly Math characters */
2995 errorCode=U_ZERO_ERROR;
2996 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
2997 set2=uset_openPattern(mathPattern, 8, &errorCode);
2998 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
2999 if(U_SUCCESS(errorCode)) {
3000 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3001 uset_complement(set3); /* assigned characters */
3002 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3003 compareUSets(set1, set2,
3004 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3005 TRUE);
3006 } else {
3007 log_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s\n", u_errorName(errorCode));
3008 }
3009 uset_close(set1);
3010 uset_close(set2);
3011 uset_close(set3);
3012
3013 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3014 errorCode=U_ZERO_ERROR;
3015 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3016 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3017 if(U_SUCCESS(errorCode)) {
3018 compareUSets(set1, set2,
3019 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3020 TRUE);
3021 } else {
3022 log_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s\n", u_errorName(errorCode));
3023 }
3024 uset_close(set1);
3025 uset_close(set2);
b75a7d8f 3026}
374ca955 3027
73c04bcf
A
3028/*
3029 * Starting with ICU4C 3.4, the core Unicode properties files
3030 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
3031 * are hardcoded in the common DLL and therefore not included
3032 * in the data package any more.
3033 * Tests requiring these files are disabled so that
3034 * we need not jump through hoops (like adding snapshots of these files
3035 * to testdata).
3036 * See Jitterbug 4497.
3037 */
3038#define HARDCODED_DATA_4497 1
3039
374ca955
A
3040/* API coverage for ucase.c */
3041static void TestUCase() {
73c04bcf 3042#if !HARDCODED_DATA_4497
374ca955
A
3043 UDataMemory *pData;
3044 UCaseProps *csp;
73c04bcf
A
3045#endif
3046 const UCaseProps *ccsp;
374ca955
A
3047 UErrorCode errorCode;
3048
73c04bcf 3049#if !HARDCODED_DATA_4497
374ca955
A
3050 /* coverage for ucase_openBinary() */
3051 errorCode=U_ZERO_ERROR;
3052 pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode);
3053 if(U_FAILURE(errorCode)) {
3054 log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3055 u_errorName(errorCode));
3056 return;
3057 }
3058
3059 csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3060 if(U_FAILURE(errorCode)) {
3061 log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3062 u_errorName(errorCode));
3063 udata_close(pData);
3064 return;
3065 }
3066
3067 if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */
3068 log_err("ucase_openBinary() does not seem to return working UCaseProps\n");
3069 }
3070
3071 ucase_close(csp);
3072 udata_close(pData);
73c04bcf
A
3073#endif
3074
3075 /* coverage for ucase_getDummy() */
3076 errorCode=U_ZERO_ERROR;
3077 ccsp=ucase_getDummy(&errorCode);
3078 if(ucase_tolower(ccsp, 0x41)!=0x41) {
3079 log_err("ucase_tolower(dummy, A)!=A\n");
3080 }
3081}
3082
3083/* API coverage for ubidi_props.c */
3084static void TestUBiDiProps() {
3085#if !HARDCODED_DATA_4497
3086 UDataMemory *pData;
3087 UBiDiProps *bdp;
3088#endif
3089 const UBiDiProps *cbdp;
3090 UErrorCode errorCode;
3091
3092#if !HARDCODED_DATA_4497
3093 /* coverage for ubidi_openBinary() */
3094 errorCode=U_ZERO_ERROR;
3095 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
3096 if(U_FAILURE(errorCode)) {
3097 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3098 u_errorName(errorCode));
3099 return;
3100 }
3101
3102 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3103 if(U_FAILURE(errorCode)) {
3104 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3105 u_errorName(errorCode));
3106 udata_close(pData);
3107 return;
3108 }
3109
3110 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
3111 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
3112 }
3113
3114 ubidi_closeProps(bdp);
3115 udata_close(pData);
3116#endif
3117
3118 /* coverage for ubidi_getDummy() */
3119 errorCode=U_ZERO_ERROR;
3120 cbdp=ubidi_getDummy(&errorCode);
3121 if(ubidi_getClass(cbdp, 0x20)!=0) {
3122 log_err("ubidi_getClass(dummy, space)!=0\n");
3123 }
3124}
3125
3126/* test case folding, compare return values with CaseFolding.txt ------------ */
3127
/*
 * Bit flags naming the kinds of case folding of a character;
 * combined into a bit set of the foldings that have been tested already.
 */
enum {
    CF_SIMPLE=1<<0,                         /* simple folding */
    CF_FULL=1<<1,                           /* full folding */
    CF_TURKIC=1<<2,                         /* Turkic folding */
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC      /* all of the above */
};
3135
3136static void
3137testFold(UChar32 c, int which,
3138 UChar32 simple, UChar32 turkic,
3139 const UChar *full, int32_t fullLength,
3140 const UChar *turkicFull, int32_t turkicFullLength) {
3141 UChar s[2], t[32];
3142 UChar32 c2;
3143 int32_t length, length2;
3144
3145 UErrorCode errorCode=U_ZERO_ERROR;
3146
3147 length=0;
3148 U16_APPEND_UNSAFE(s, length, c);
3149
3150 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3151 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3152 }
3153 if((which&CF_FULL)!=0) {
3154 length2=u_strFoldCase(t, LENGTHOF(t), s, length, 0, &errorCode);
3155 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3156 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3157 }
3158 }
3159 if((which&CF_TURKIC)!=0) {
3160 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3161 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3162 }
3163
3164 length2=u_strFoldCase(t, LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3165 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3166 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3167 }
3168 }
3169}
3170
3171/* test that c case-folds to itself */
3172static void
3173testFoldToSelf(UChar32 c, int which) {
3174 UChar s[2];
3175 int32_t length;
3176
3177 length=0;
3178 U16_APPEND_UNSAFE(s, length, c);
3179 testFold(c, which, c, c, s, length, s, length);
3180}
3181
3182struct CaseFoldingData {
3183 USet *notSeen;
3184 UChar32 prev, prevSimple;
3185 UChar prevFull[32];
3186 int32_t prevFullLength;
3187 int which;
3188};
3189typedef struct CaseFoldingData CaseFoldingData;
3190
3191static void U_CALLCONV
3192caseFoldingLineFn(void *context,
3193 char *fields[][2], int32_t fieldCount,
3194 UErrorCode *pErrorCode) {
3195 CaseFoldingData *pData=(CaseFoldingData *)context;
3196 char *end;
3197 UChar full[32];
3198 UChar32 c, prev, simple;
3199 int32_t count;
3200 int which;
3201 char status;
3202
3203 /* get code point */
3204 c=(UChar32)strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
3205 end=(char *)u_skipWhitespace(end);
3206 if(end<=fields[0][0] || end!=fields[0][1]) {
3207 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3208 *pErrorCode=U_PARSE_ERROR;
3209 return;
3210 }
3211
3212 /* get the status of this mapping */
3213 status=*u_skipWhitespace(fields[1][0]);
3214 if(status!='C' && status!='S' && status!='F' && status!='T') {
3215 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3216 *pErrorCode=U_PARSE_ERROR;
3217 return;
3218 }
3219
3220 /* get the mapping */
3221 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3222 if(U_FAILURE(*pErrorCode)) {
3223 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3224 return;
3225 }
3226
3227 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3228 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3229 simple=c;
3230 }
3231
3232 if(c!=(prev=pData->prev)) {
3233 /*
3234 * Test remaining mappings for the previous code point.
3235 * If a turkic folding was not mentioned, then it should fold the same
3236 * as the regular simple case folding.
3237 */
3238 UChar s[2];
3239 int32_t length;
3240
3241 length=0;
3242 U16_APPEND_UNSAFE(s, length, prev);
3243 testFold(prev, (~pData->which)&CF_ALL,
3244 prev, pData->prevSimple,
3245 s, length,
3246 pData->prevFull, pData->prevFullLength);
3247 pData->prev=pData->prevSimple=c;
3248 length=0;
3249 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3250 pData->prevFullLength=length;
3251 pData->which=0;
3252 }
3253
3254 /*
3255 * Turn the status into a bit set of case foldings to test.
3256 * Remember non-Turkic case foldings as defaults for Turkic mode.
3257 */
3258 switch(status) {
3259 case 'C':
3260 which=CF_SIMPLE|CF_FULL;
3261 pData->prevSimple=simple;
3262 u_memcpy(pData->prevFull, full, count);
3263 pData->prevFullLength=count;
3264 break;
3265 case 'S':
3266 which=CF_SIMPLE;
3267 pData->prevSimple=simple;
3268 break;
3269 case 'F':
3270 which=CF_FULL;
3271 u_memcpy(pData->prevFull, full, count);
3272 pData->prevFullLength=count;
3273 break;
3274 case 'T':
3275 which=CF_TURKIC;
3276 break;
3277 default:
3278 which=0;
3279 break; /* won't happen because of test above */
3280 }
3281
3282 testFold(c, which, simple, simple, full, count, full, count);
3283
3284 /* remember which case foldings of c have been tested */
3285 pData->which|=which;
3286
3287 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3288 uset_remove(pData->notSeen, c);
3289}
3290
3291static void
3292TestCaseFolding() {
3293 CaseFoldingData data={ NULL };
3294 char *fields[3][2];
3295 UErrorCode errorCode;
3296
3297 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3298
3299 errorCode=U_ZERO_ERROR;
3300 /* test BMP & plane 1 - nothing interesting above */
3301 data.notSeen=uset_open(0, 0x1ffff);
3302 data.prevFullLength=1; /* length of full case folding of U+0000 */
3303
3304 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3305 if(U_SUCCESS(errorCode)) {
3306 int32_t i, start, end;
3307
3308 /* add a pseudo-last line to finish testing of the actual last one */
3309 fields[0][0]=lastLine;
3310 fields[0][1]=lastLine+6;
3311 fields[1][0]=lastLine+7;
3312 fields[1][1]=lastLine+9;
3313 fields[2][0]=lastLine+10;
3314 fields[2][1]=lastLine+17;
3315 caseFoldingLineFn(&data, fields, 3, &errorCode);
3316
3317 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3318 for(i=0;
3319 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3320 U_SUCCESS(errorCode);
3321 ++i
3322 ) {
3323 do {
3324 testFoldToSelf(start, CF_ALL);
3325 } while(++start<=end);
3326 }
3327 }
3328
3329 uset_close(data.notSeen);
374ca955 3330}