]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/cucdtst.c
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
CommitLineData
b75a7d8f
A
1/********************************************************************
2 * COPYRIGHT:
46f4442e 3 * Copyright (c) 1997-2008, International Business Machines Corporation and
b75a7d8f
A
4 * others. All Rights Reserved.
5 ********************************************************************/
46f4442e 6/*******************************************************************************
b75a7d8f
A
7*
8* File CUCDTST.C
9*
10* Modification History:
11* Name Description
12* Madhu Katragadda Ported for C API, added tests for string functions
46f4442e 13********************************************************************************
b75a7d8f
A
14*/
15
16#include <string.h>
17#include <math.h>
18#include <stdlib.h>
19
20#include "unicode/utypes.h"
21#include "unicode/uchar.h"
22#include "unicode/putil.h"
23#include "unicode/ustring.h"
24#include "unicode/uloc.h"
25
26#include "cintltst.h"
374ca955 27#include "putilimp.h"
b75a7d8f 28#include "uparse.h"
374ca955 29#include "ucase.h"
73c04bcf 30#include "ubidi_props.h"
b75a7d8f 31#include "uprops.h"
374ca955 32#include "uset_imp.h"
b75a7d8f
A
33#include "usc_impl.h"
34#include "unormimp.h"
374ca955
A
35#include "udatamem.h" /* for testing ucase_openBinary() */
36#include "cucdapi.h"
b75a7d8f 37
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
b75a7d8f
A
39
40/* prototypes --------------------------------------------------------------- */
41
42static void TestUpperLower(void);
43static void TestLetterNumber(void);
44static void TestMisc(void);
45static void TestPOSIX(void);
46static void TestControlPrint(void);
47static void TestIdentifier(void);
48static void TestUnicodeData(void);
49static void TestCodeUnit(void);
50static void TestCodePoint(void);
51static void TestCharLength(void);
52static void TestCharNames(void);
53static void TestMirroring(void);
b75a7d8f
A
54static void TestUScriptRunAPI(void);
55static void TestAdditionalProperties(void);
56static void TestNumericProperties(void);
57static void TestPropertyNames(void);
58static void TestPropertyValues(void);
59static void TestConsistency(void);
374ca955 60static void TestUCase(void);
73c04bcf
A
61static void TestUBiDiProps(void);
62static void TestCaseFolding(void);
b75a7d8f
A
63
64/* internal methods used */
65static int32_t MakeProp(char* str);
66static int32_t MakeDir(char* str);
67
73c04bcf
A
68/* helpers ------------------------------------------------------------------ */
69
70static void
71parseUCDFile(const char *filename,
72 char *fields[][2], int32_t fieldCount,
73 UParseLineFn *lineFn, void *context,
74 UErrorCode *pErrorCode) {
75 char path[256];
76 char backupPath[256];
77
78 if(U_FAILURE(*pErrorCode)) {
79 return;
80 }
81
82 /* Look inside ICU_DATA first */
83 strcpy(path, u_getDataDirectory());
84 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
85 strcat(path, filename);
86
87 /* As a fallback, try to guess where the source data was located
88 * at the time ICU was built, and look there.
89 */
90 strcpy(backupPath, ctest_dataSrcDir());
91 strcat(backupPath, U_FILE_SEP_STRING);
92 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
93 strcat(backupPath, filename);
94
95 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
96 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
97 *pErrorCode=U_ZERO_ERROR;
98 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
99 }
100 if(U_FAILURE(*pErrorCode)) {
101 log_err("error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
102 }
103}
104
b75a7d8f
A
105/* test data ---------------------------------------------------------------- */
106
107static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;
108static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
109static const int32_t tagValues[] =
110 {
111 /* Mn */ U_NON_SPACING_MARK,
112 /* Mc */ U_COMBINING_SPACING_MARK,
113 /* Me */ U_ENCLOSING_MARK,
114 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
115 /* Nl */ U_LETTER_NUMBER,
116 /* No */ U_OTHER_NUMBER,
117 /* Zs */ U_SPACE_SEPARATOR,
118 /* Zl */ U_LINE_SEPARATOR,
119 /* Zp */ U_PARAGRAPH_SEPARATOR,
120 /* Cc */ U_CONTROL_CHAR,
121 /* Cf */ U_FORMAT_CHAR,
122 /* Cs */ U_SURROGATE,
123 /* Co */ U_PRIVATE_USE_CHAR,
124 /* Cn */ U_UNASSIGNED,
125 /* Lu */ U_UPPERCASE_LETTER,
126 /* Ll */ U_LOWERCASE_LETTER,
127 /* Lt */ U_TITLECASE_LETTER,
128 /* Lm */ U_MODIFIER_LETTER,
129 /* Lo */ U_OTHER_LETTER,
130 /* Pc */ U_CONNECTOR_PUNCTUATION,
131 /* Pd */ U_DASH_PUNCTUATION,
132 /* Ps */ U_START_PUNCTUATION,
133 /* Pe */ U_END_PUNCTUATION,
134 /* Po */ U_OTHER_PUNCTUATION,
135 /* Sm */ U_MATH_SYMBOL,
136 /* Sc */ U_CURRENCY_SYMBOL,
137 /* Sk */ U_MODIFIER_SYMBOL,
138 /* So */ U_OTHER_SYMBOL,
139 /* Pi */ U_INITIAL_PUNCTUATION,
140 /* Pf */ U_FINAL_PUNCTUATION
141 };
142
/*
 * Bidi class abbreviations; each entry is a NUL-terminated string of at most
 * four characters.
 * NOTE(review): the index is presumably the matching UCharDirection value - confirm against MakeDir().
 */
static const char dirStrings[][5] = {
    "L",   "R",
    "EN",  "ES",  "ET",
    "AN",  "CS",
    "B",   "S",   "WS",  "ON",
    "LRE", "LRO",
    "AL",
    "RLE", "RLO",
    "PDF", "NSM", "BN"
};
164
165void addUnicodeTest(TestNode** root);
166
167void addUnicodeTest(TestNode** root)
168{
b75a7d8f
A
169 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
170 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
171 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
46f4442e
A
172 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
173 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
b75a7d8f
A
174 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
175 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
176 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
177 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
178 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
179 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
180 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
181 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
182 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
183 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
184 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
185 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
186 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
187 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
188 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
374ca955 189 addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase");
73c04bcf
A
190 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps");
191 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
b75a7d8f
A
192}
193
194/*==================================================== */
195/* test u_toupper() and u_tolower() */
196/*==================================================== */
197static void TestUpperLower()
198{
199 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
200 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
201 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
202 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
203 int32_t i;
204
205 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
206 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
207
208/*
209Checks LetterLike Symbols which were previously a source of confusion
210[Bertrand A. D. 02/04/98]
211*/
212 for (i=0x2100;i<0x2138;i++)
213 {
73c04bcf
A
214 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
215 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
b75a7d8f
A
216 {
217 if (i != (int)u_tolower(i)) /* itself */
218 log_err("Failed case conversion with itself: U+%04x\n", i);
219 if (i != (int)u_toupper(i))
220 log_err("Failed case conversion with itself: U+%04x\n", i);
221 }
222 }
223
224 for(i=0; i < u_strlen(upper); i++){
225 if(u_tolower(upper[i]) != lower[i]){
226 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
227 }
228 }
229
230 log_verbose("testing upper lower\n");
231 for (i = 0; i < 21; i++) {
232
233 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
234 {
235 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
236 }
237 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
238 {
239 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
240 }
241 else if (upperTest[i] != u_tolower(lowerTest[i]))
242 {
243 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
244 }
245 else if (lowerTest[i] != u_toupper(upperTest[i]))
246 {
247 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
248 }
249 else if (upperTest[i] != u_tolower(upperTest[i]))
250 {
251 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
252 }
253 else if (lowerTest[i] != u_toupper(lowerTest[i]))
254 {
255 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
256 }
257 }
258 log_verbose("done testing upper lower\n");
259
260 log_verbose("testing u_istitle\n");
261 {
262 static const UChar expected[] = {
263 0x1F88,
264 0x1F89,
265 0x1F8A,
266 0x1F8B,
267 0x1F8C,
268 0x1F8D,
269 0x1F8E,
270 0x1F8F,
271 0x1F88,
272 0x1F89,
273 0x1F8A,
274 0x1F8B,
275 0x1F8C,
276 0x1F8D,
277 0x1F8E,
278 0x1F8F,
279 0x1F98,
280 0x1F99,
281 0x1F9A,
282 0x1F9B,
283 0x1F9C,
284 0x1F9D,
285 0x1F9E,
286 0x1F9F,
287 0x1F98,
288 0x1F99,
289 0x1F9A,
290 0x1F9B,
291 0x1F9C,
292 0x1F9D,
293 0x1F9E,
294 0x1F9F,
295 0x1FA8,
296 0x1FA9,
297 0x1FAA,
298 0x1FAB,
299 0x1FAC,
300 0x1FAD,
301 0x1FAE,
302 0x1FAF,
303 0x1FA8,
304 0x1FA9,
305 0x1FAA,
306 0x1FAB,
307 0x1FAC,
308 0x1FAD,
309 0x1FAE,
310 0x1FAF,
311 0x1FBC,
312 0x1FBC,
313 0x1FCC,
314 0x1FCC,
315 0x1FFC,
316 0x1FFC,
317 };
318 int32_t num = sizeof(expected)/sizeof(expected[0]);
319 for(i=0; i<num; i++){
320 if(!u_istitle(expected[i])){
321 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
322 }
323 }
324
325 }
326}
327
73c04bcf 328/* compare two sets and verify that their difference or intersection is empty */
b75a7d8f
A
329static UBool
330showADiffB(const USet *a, const USet *b,
331 const char *a_name, const char *b_name,
332 UBool expect, UBool diffIsError) {
73c04bcf 333 USet *aa;
b75a7d8f 334 int32_t i, start, end, length;
b75a7d8f
A
335 UErrorCode errorCode;
336
73c04bcf
A
337 /*
338 * expect:
339 * TRUE -> a-b should be empty, that is, b should contain all of a
340 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
341 */
342 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
343 return TRUE;
344 }
345
346 /* clone a to aa because a is const */
347 aa=uset_open(1, 0);
348 if(aa==NULL) {
349 /* unusual problem - out of memory? */
350 return FALSE;
351 }
352 uset_addAll(aa, a);
353
354 /* compute the set in question */
355 if(expect) {
356 /* a-b */
357 uset_removeAll(aa, b);
358 } else {
359 /* a&b */
360 uset_retainAll(aa, b);
361 }
362
363 /* aa is not empty because of the initial tests above; show its contents */
b75a7d8f 364 errorCode=U_ZERO_ERROR;
b75a7d8f
A
365 i=0;
366 for(;;) {
73c04bcf 367 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
b75a7d8f 368 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
73c04bcf 369 break; /* done */
b75a7d8f
A
370 }
371 if(U_FAILURE(errorCode)) {
73c04bcf 372 log_err("error comparing %s with %s at difference item %d: %s\n",
b75a7d8f 373 a_name, b_name, i, u_errorName(errorCode));
73c04bcf 374 break;
b75a7d8f
A
375 }
376 if(length!=0) {
73c04bcf 377 break; /* done with code points, got a string or -1 */
b75a7d8f
A
378 }
379
73c04bcf
A
380 if(diffIsError) {
381 if(expect) {
382 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
383 } else {
384 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
385 }
386 } else {
387 if(expect) {
388 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
389 } else {
390 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
b75a7d8f
A
391 }
392 }
393
394 ++i;
395 }
73c04bcf
A
396
397 uset_close(aa);
398 return FALSE;
b75a7d8f
A
399}
400
401static UBool
402showAMinusB(const USet *a, const USet *b,
403 const char *a_name, const char *b_name,
404 UBool diffIsError) {
405 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
406}
407
408static UBool
409showAIntersectB(const USet *a, const USet *b,
410 const char *a_name, const char *b_name,
411 UBool diffIsError) {
412 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
413}
414
415static UBool
416compareUSets(const USet *a, const USet *b,
417 const char *a_name, const char *b_name,
418 UBool diffIsError) {
73c04bcf
A
419 /*
420 * Use an arithmetic & not a logical && so that both branches
421 * are always taken and all differences are shown.
422 */
b75a7d8f 423 return
73c04bcf 424 showAMinusB(a, b, a_name, b_name, diffIsError) &
b75a7d8f
A
425 showAMinusB(b, a, b_name, a_name, diffIsError);
426}
427
428/* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
429static void TestLetterNumber()
430{
431 UChar i = 0x0000;
432
433 log_verbose("Testing for isalpha\n");
434 for (i = 0x0041; i < 0x005B; i++) {
435 if (!u_isalpha(i))
436 {
437 log_err("Failed isLetter test at %.4X\n", i);
438 }
439 }
440 for (i = 0x0660; i < 0x066A; i++) {
441 if (u_isalpha(i))
442 {
443 log_err("Failed isLetter test with numbers at %.4X\n", i);
444 }
445 }
446
447 log_verbose("Testing for isdigit\n");
448 for (i = 0x0660; i < 0x066A; i++) {
449 if (!u_isdigit(i))
450 {
451 log_verbose("Failed isNumber test at %.4X\n", i);
452 }
453 }
454
455 log_verbose("Testing for isalnum\n");
456 for (i = 0x0041; i < 0x005B; i++) {
457 if (!u_isalnum(i))
458 {
459 log_err("Failed isAlNum test at %.4X\n", i);
460 }
461 }
462 for (i = 0x0660; i < 0x066A; i++) {
463 if (!u_isalnum(i))
464 {
465 log_err("Failed isAlNum test at %.4X\n", i);
466 }
467 }
468
469 {
470 /*
471 * The following checks work only starting from Unicode 4.0.
472 * Check the version number here.
473 */
374ca955 474 static UVersionInfo u401={ 4, 0, 1, 0 };
b75a7d8f
A
475 UVersionInfo version;
476 u_getUnicodeVersion(version);
374ca955 477 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
b75a7d8f
A
478 return;
479 }
480 }
481
482 {
483 /*
484 * Sanity check:
485 * Verify that exactly the digit characters have decimal digit values.
486 * This assumption is used in the implementation of u_digit()
487 * (which checks nt=de)
488 * compared with the parallel java.lang.Character.digit()
489 * (which checks Nd).
490 *
491 * This was not true in Unicode 3.2 and earlier.
374ca955
A
492 * Unicode 4.0 fixed discrepancies.
493 * Unicode 4.0.1 re-introduced problems in this area due to an
494 * unintentionally incomplete last-minute change.
b75a7d8f
A
495 */
496 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
497 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
498
499 USet *digits, *decimalValues;
500 UErrorCode errorCode;
501
502 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
503 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
504 errorCode=U_ZERO_ERROR;
505 digits=uset_openPattern(digitsPattern, 6, &errorCode);
506 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
507
508 if(U_SUCCESS(errorCode)) {
509 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
510 }
511
512 uset_close(digits);
513 uset_close(decimalValues);
514 }
515}
516
517/* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
518static void TestMisc()
519{
520 static const UChar sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
521 static const UChar sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
522 static const UChar sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6b };
523 static const UChar sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
524 static const UChar sampleBase[] = {0x0061, 0x0031, 0x03d2};
525 static const UChar sampleNonBase[] = {0x002B, 0x0020, 0x203B};
526/* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
527 static const UChar sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
528 static const UChar sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
529 static const UChar sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
530 static const UChar sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f};
531
532
533 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
534
535 uint32_t mask;
536
537 int32_t i;
538 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
539 UVersionInfo realVersion;
540
541 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
542
543 log_verbose("Testing for isspace and nonspaces\n");
544 for (i = 0; i < 5; i++) {
545 if (!(u_isspace(sampleSpaces[i])) ||
546 (u_isspace(sampleNonSpaces[i])))
547 {
548 log_err("Space char test error : %d or %d \n", (int32_t)sampleSpaces[i], (int32_t)sampleNonSpaces[i]);
549 }
550 if (!(u_isJavaSpaceChar(sampleSpaces[i])) ||
551 (u_isJavaSpaceChar(sampleNonSpaces[i])))
552 {
553 log_err("u_isJavaSpaceChar() test error : %d or %d \n", (int32_t)sampleSpaces[i], (int32_t)sampleNonSpaces[i]);
554 }
555 }
556
557 log_verbose("Testing for isspace and nonspaces\n");
558 for (i = 0; i < 5; i++) {
559 if (!(u_isWhitespace(sampleWhiteSpaces[i])) ||
560 (u_isWhitespace(sampleNonWhiteSpaces[i])))
561 {
562 log_err("White Space char test error : %lx or %lx \n", sampleWhiteSpaces[i], sampleNonWhiteSpaces[i]);
563 }
564 }
565
566 log_verbose("Testing for isdefined\n");
567 for (i = 0; i < 3; i++) {
568 if ((u_isdefined(sampleUndefined[i])) ||
569 !(u_isdefined(sampleDefined[i])))
570 {
571 log_err("Undefined char test error : U+%04x or U+%04x\n", (int32_t)sampleUndefined[i], (int32_t)sampleDefined[i]);
572 }
573 }
574
575 log_verbose("Testing for isbase\n");
576 for (i = 0; i < 3; i++) {
577 if ((u_isbase(sampleNonBase[i])) ||
578 !(u_isbase(sampleBase[i])))
579 {
580 log_err("Non-baseform char test error : U+%04x or U+%04x",(int32_t)sampleNonBase[i], (int32_t)sampleBase[i]);
581 }
582 }
583
584 log_verbose("Testing for isdigit \n");
585 for (i = 0; i < 4; i++) {
586 if ((u_isdigit(sampleDigits[i]) &&
587 (u_charDigitValue(sampleDigits[i])!= sampleDigitValues[i])) ||
588 (u_isdigit(sampleNonDigits[i]))) {
589 log_err("Digit char test error : %lx or %lx\n", sampleDigits[i], sampleNonDigits[i]);
590 }
591 }
592
593 /* Tests the ICU version #*/
594 u_getVersion(realVersion);
595 u_versionToString(realVersion, icuVersion);
374ca955 596 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
b75a7d8f
A
597 {
598 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
599 }
600#if defined(ICU_VERSION)
601 /* test only happens where we have configure.in with VERSION - sanity check. */
602 if(strcmp(U_ICU_VERSION, ICU_VERSION))
603 {
604 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
605 }
606#endif
607
608 /* test U_GC_... */
609 if(
610 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
611 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
612 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
613 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
614 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
615 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
616 ) {
617 log_err("error: U_GET_GC_MASK does not work properly\n");
618 }
619
620 mask=0;
621 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
622
623 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
624 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
625 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
626 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
627 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
628
629 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
630 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
631 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
632
633 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
634 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
635 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
636
637 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
638 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
639 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
640
641 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
642 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
643 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
644 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
645
646 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
647 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
648 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
649 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
650 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
651
652 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
653 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
654 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
655 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
656
657 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
658 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
659
660 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
661 log_err("error: problems with U_GC_XX_MASK constants\n");
662 }
663
664 mask=0;
665 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
666 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
667 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
668 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
669 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
670 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
671 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
672
673 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
674 log_err("error: problems with U_GC_Y_MASK constants\n");
675 }
676 {
677 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
678 for(i=0; i<10; i++){
679 if(digit[i]!=u_forDigit(i,10)){
680 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
681 }
682 }
683 }
684
685 /* test u_digit() */
686 {
687 static const struct {
688 UChar32 c;
689 int8_t radix, value;
690 } data[]={
691 /* base 16 */
692 { 0x0031, 16, 1 },
693 { 0x0038, 16, 8 },
694 { 0x0043, 16, 12 },
695 { 0x0066, 16, 15 },
696 { 0x00e4, 16, -1 },
697 { 0x0662, 16, 2 },
698 { 0x06f5, 16, 5 },
699 { 0xff13, 16, 3 },
700 { 0xff41, 16, 10 },
701
702 /* base 8 */
703 { 0x0031, 8, 1 },
704 { 0x0038, 8, -1 },
705 { 0x0043, 8, -1 },
706 { 0x0066, 8, -1 },
707 { 0x00e4, 8, -1 },
708 { 0x0662, 8, 2 },
709 { 0x06f5, 8, 5 },
710 { 0xff13, 8, 3 },
711 { 0xff41, 8, -1 },
712
713 /* base 36 */
714 { 0x5a, 36, 35 },
715 { 0x7a, 36, 35 },
716 { 0xff3a, 36, 35 },
717 { 0xff5a, 36, 35 },
718
719 /* wrong radix values */
720 { 0x0031, 1, -1 },
721 { 0xff3a, 37, -1 }
722 };
723
724 for(i=0; i<LENGTHOF(data); ++i) {
725 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
726 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
727 data[i].c,
728 data[i].radix,
729 u_digit(data[i].c, data[i].radix),
730 data[i].value);
731 }
732 }
733 }
734}
735
736/* test C/POSIX-style functions --------------------------------------------- */
737
738/* bit flags */
739#define ISAL 1
740#define ISLO 2
741#define ISUP 4
742
743#define ISDI 8
744#define ISXD 0x10
745
746#define ISAN 0x20
747
748#define ISPU 0x40
749#define ISGR 0x80
750#define ISPR 0x100
751
752#define ISSP 0x200
753#define ISBL 0x400
754#define ISCN 0x800
755
756/* C/POSIX-style functions, in the same order as the bit flags */
374ca955 757typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
b75a7d8f
A
758
759static const struct {
760 IsPOSIXClass *fn;
761 const char *name;
762} posixClasses[]={
763 { u_isalpha, "isalpha" },
764 { u_islower, "islower" },
765 { u_isupper, "isupper" },
766 { u_isdigit, "isdigit" },
767 { u_isxdigit, "isxdigit" },
768 { u_isalnum, "isalnum" },
769 { u_ispunct, "ispunct" },
770 { u_isgraph, "isgraph" },
771 { u_isprint, "isprint" },
772 { u_isspace, "isspace" },
773 { u_isblank, "isblank" },
774 { u_iscntrl, "iscntrl" }
775};
776
777static const struct {
778 UChar32 c;
779 uint32_t posixResults;
780} posixData[]={
781 { 0x0008, ISCN }, /* backspace */
782 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
783 { 0x000a, ISSP| ISCN }, /* LF */
784 { 0x000c, ISSP| ISCN }, /* FF */
785 { 0x000d, ISSP| ISCN }, /* CR */
786 { 0x0020, ISPR|ISSP|ISBL }, /* space */
787 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
788 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
789 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
790 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
791 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
792 { 0x007b, ISPU|ISGR|ISPR }, /* { */
793 { 0x0085, ISSP| ISCN }, /* NEL */
794 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
795 { 0x00a4, ISGR|ISPR }, /* currency sign */
796 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
797 { 0x0300, ISGR|ISPR }, /* combining grave */
798 { 0x0600, ISCN }, /* arabic number sign */
799 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
800 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
801 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
802 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
803 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
374ca955
A
804 { 0x200b, ISCN }, /* ZWSP */
805 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
b75a7d8f
A
806 { 0x200e, ISCN }, /* LRM */
807 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
808 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
809 { 0x20ac, ISGR|ISPR }, /* Euro */
810 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
811 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
812 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
813 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
814 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
815};
816
817static void
818TestPOSIX() {
819 uint32_t mask;
820 int32_t cl, i;
821 UBool expect;
822
823 mask=1;
824 for(cl=0; cl<12; ++cl) {
825 for(i=0; i<LENGTHOF(posixData); ++i) {
826 expect=(UBool)((posixData[i].posixResults&mask)!=0);
827 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
828 log_err("u_%s(U+%04x)=%s is wrong\n",
829 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
830 }
831 }
832 mask<<=1;
833 }
834}
835
836/* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
837static void TestControlPrint()
838{
839 const UChar sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
840 const UChar sampleNonControl[] = {0x61, 0x0031, 0x00e2};
841 const UChar samplePrintable[] = {0x0042, 0x005f, 0x2014};
842 const UChar sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
843 UChar32 c;
844 int i;
845
846 log_verbose("Testing for iscontrol\n");
847 for (i = 0; i < LENGTHOF(sampleControl); i++) {
848 if (!u_iscntrl(sampleControl[i]))
849 {
850 log_err("Control char test error : U+%04x should be control but is not\n", (int32_t)sampleControl[i]);
851 }
852 }
853
854 log_verbose("Testing for !iscontrol\n");
855 for (i = 0; i < LENGTHOF(sampleNonControl); i++) {
856 if (u_iscntrl(sampleNonControl[i]))
857 {
858 log_err("Control char test error : U+%04x should not be control but is\n", (int32_t)sampleNonControl[i]);
859 }
860 }
861
862 log_verbose("testing for isprintable\n");
863 for (i = 0; i < 3; i++) {
864 if (!u_isprint(samplePrintable[i]))
865 {
866 log_err("Printable char test error : U+%04x should be printable but is not\n", (int32_t)samplePrintable[i]);
867 }
868 if (u_isprint(sampleNonPrintable[i]))
869 {
870 log_err("Printable char test error : U+%04x should not be printable but is\n", (int32_t)sampleNonPrintable[i]);
871 }
872 }
873
874 /* test all ISO 8 controls */
875 for(c=0; c<=0x9f; ++c) {
876 if(c==0x20) {
877 /* skip ASCII graphic characters and continue with DEL */
878 c=0x7f;
879 }
880 if(!u_iscntrl(c)) {
881 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
882 }
883 if(!u_isISOControl(c)) {
884 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
885 }
886 if(u_isprint(c)) {
887 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
888 }
889 }
890
891 /* test all Latin-1 graphic characters */
892 for(c=0x20; c<=0xff; ++c) {
893 if(c==0x7f) {
894 c=0xa0;
895 } else if(c==0xad) {
896 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
897 ++c;
898 }
899 if(!u_isprint(c)) {
900 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
901 }
902 }
903}
904
905/* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
906static void TestIdentifier()
907{
908 const UChar sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
909 const UChar sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
910 const UChar sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
911 const UChar sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
912 const UChar sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
913 const UChar sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
914 const UChar sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
915 const UChar sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
916 const UChar sampleIDIgnore[] = {0x0006, 0x0010, 0x206b};
917 const UChar sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
918
919 int i;
920
921 log_verbose("Testing sampleJavaID start \n");
922 for (i = 0; i < 3; i++) {
923 if (!(u_isJavaIDStart(sampleJavaIDStart[i])) ||
924 (u_isJavaIDStart(sampleNonJavaIDStart[i])))
925 log_err("Java ID Start char test error : %lx or %lx\n",
926 sampleJavaIDStart[i], sampleNonJavaIDStart[i]);
927 }
928
929 log_verbose("Testing sampleJavaID part \n");
930 for (i = 0; i < 3; i++) {
931 if (!(u_isJavaIDPart(sampleJavaIDPart[i])) ||
932 (u_isJavaIDPart(sampleNonJavaIDPart[i])))
933 log_err("Java ID Part char test error : %lx or %lx\n",
934 sampleJavaIDPart[i], sampleNonJavaIDPart[i]);
935 }
936
937 log_verbose("Testing sampleUnicodeID start \n");
938 for (i = 0; i < 3; i++) {
939 /* T_test_logln_ustr((int32_t)i); */
940 if (!(u_isIDStart(sampleUnicodeIDStart[i])) ||
941 (u_isIDStart(sampleNonUnicodeIDStart[i])))
942 {
943 log_err("Unicode ID Start char test error : %lx or %lx\n", sampleUnicodeIDStart[i],
944 sampleNonUnicodeIDStart[i]);
945 }
946 }
947
948 log_verbose("Testing sample unicode ID part \n");
949 for (i = 2; i < 3; i++) { /* nos *** starts with 2 instead of 0, until clarified */
950 /* T_test_logln_ustr((int32_t)i); */
951 if (!(u_isIDPart(sampleUnicodeIDPart[i])) ||
952 (u_isIDPart(sampleNonUnicodeIDPart[i])))
953 {
954 log_err("Unicode ID Part char test error : %lx or %lx", sampleUnicodeIDPart[i], sampleNonUnicodeIDPart[i]);
955 }
956 }
957
958 log_verbose("Testing sampleId ignore\n");
959 for (i = 0; i < 3; i++) {
960 /*T_test_logln_ustr((int32_t)i); */
961 if (!(u_isIDIgnorable(sampleIDIgnore[i])) ||
962 (u_isIDIgnorable(sampleNonIDIgnore[i])))
963 {
964 log_err("ID ignorable char test error : U+%04x or U+%04x\n", sampleIDIgnore[i], sampleNonIDIgnore[i]);
965 }
966 }
967}
968
969/* for each line of UnicodeData.txt, check some of the properties */
970/*
971 * ### TODO
972 * This test fails incorrectly if the First or Last code point of a repetitive area
973 * is overridden, which is allowed and is encouraged for the PUAs.
974 * Currently, this means that both area First/Last and override lines are
975 * tested against the properties from the API,
976 * and the area boundary will not match and cause an error.
977 *
978 * This function should detect area boundaries and skip them for the test of individual
979 * code points' properties.
980 * Then it should check that the areas contain all the same properties except where overridden.
981 * For this, it would have had to set a flag for which code points were listed explicitly.
982 */
983static void U_CALLCONV
984unicodeDataLineFn(void *context,
985 char *fields[][2], int32_t fieldCount,
986 UErrorCode *pErrorCode)
987{
988 char buffer[100];
989 char *end;
990 uint32_t value;
991 UChar32 c;
992 int32_t i;
993 int8_t type;
994
995 /* get the character code, field 0 */
996 c=strtoul(fields[0][0], &end, 16);
997 if(end<=fields[0][0] || end!=fields[0][1]) {
998 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
999 return;
1000 }
1001 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
1002 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
1003 return;
1004 }
1005
1006 /* get general category, field 2 */
1007 *fields[2][1]=0;
1008 type = (int8_t)tagValues[MakeProp(fields[2][0])];
1009 if(u_charType(c)!=type) {
1010 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
1011 }
1012 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1013 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1014 }
1015
1016 /* get canonical combining class, field 3 */
1017 value=strtoul(fields[3][0], &end, 10);
1018 if(end<=fields[3][0] || end!=fields[3][1]) {
1019 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1020 return;
1021 }
1022 if(value>255) {
1023 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1024 return;
1025 }
1026#if !UCONFIG_NO_NORMALIZATION
1027 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1028 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1029 }
1030#endif
1031
1032 /* get BiDi category, field 4 */
1033 *fields[4][1]=0;
1034 i=MakeDir(fields[4][0]);
1035 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1036 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1037 }
1038
1039 /* get ISO Comment, field 11 */
1040 *fields[11][1]=0;
1041 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1042 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1043 log_err("error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1044 c, u_errorName(*pErrorCode),
1045 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1046 fields[11][0]);
1047 }
1048
1049 /* get uppercase mapping, field 12 */
1050 if(fields[12][0]!=fields[12][1]) {
1051 value=strtoul(fields[12][0], &end, 16);
1052 if(end!=fields[12][1]) {
1053 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1054 return;
1055 }
1056 if((UChar32)value!=u_toupper(c)) {
1057 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1058 }
1059 } else {
1060 /* no case mapping: the API must map the code point to itself */
1061 if(c!=u_toupper(c)) {
1062 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1063 }
1064 }
1065
1066 /* get lowercase mapping, field 13 */
1067 if(fields[13][0]!=fields[13][1]) {
1068 value=strtoul(fields[13][0], &end, 16);
1069 if(end!=fields[13][1]) {
1070 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1071 return;
1072 }
1073 if((UChar32)value!=u_tolower(c)) {
1074 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1075 }
1076 } else {
1077 /* no case mapping: the API must map the code point to itself */
1078 if(c!=u_tolower(c)) {
1079 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1080 }
1081 }
1082
1083 /* get titlecase mapping, field 14 */
1084 if(fields[14][0]!=fields[14][1]) {
1085 value=strtoul(fields[14][0], &end, 16);
1086 if(end!=fields[14][1]) {
1087 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1088 return;
1089 }
1090 if((UChar32)value!=u_totitle(c)) {
1091 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1092 }
1093 } else {
1094 /* no case mapping: the API must map the code point to itself */
1095 if(c!=u_totitle(c)) {
1096 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1097 }
1098 }
1099}
1100
1101static UBool U_CALLCONV
1102enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1103 static const UChar32 test[][2]={
1104 {0x41, U_UPPERCASE_LETTER},
1105 {0x308, U_NON_SPACING_MARK},
1106 {0xfffe, U_GENERAL_OTHER_TYPES},
1107 {0xe0041, U_FORMAT_CHAR},
1108 {0xeffff, U_UNASSIGNED}
1109 };
1110
374ca955 1111 int32_t i, count;
b75a7d8f
A
1112
1113 if(0!=strcmp((const char *)context, "a1")) {
1114 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1115 return FALSE;
1116 }
1117
374ca955 1118 count=LENGTHOF(test);
b75a7d8f
A
1119 for(i=0; i<count; ++i) {
1120 if(start<=test[i][0] && test[i][0]<limit) {
1121 if(type!=(UCharCategory)test[i][1]) {
1122 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1123 start, limit, (long)type, test[i][0], test[i][1]);
1124 }
374ca955 1125 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
b75a7d8f
A
1126 return i==(count-1) ? FALSE : TRUE;
1127 }
1128 }
1129
1130 if(start>test[count-1][0]) {
1131 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1132 start, limit, (long)type);
1133 return FALSE;
1134 }
1135
374ca955
A
1136 return TRUE;
1137}
1138
1139static UBool U_CALLCONV
1140enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1141 /* default Bidi classes for unassigned code points */
1142 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1143 { 0x0590, U_LEFT_TO_RIGHT },
1144 { 0x0600, U_RIGHT_TO_LEFT },
1145 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1146 { 0x0900, U_RIGHT_TO_LEFT },
1147 { 0xFB1D, U_LEFT_TO_RIGHT },
1148 { 0xFB50, U_RIGHT_TO_LEFT },
1149 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1150 { 0xFE70, U_LEFT_TO_RIGHT },
1151 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1152 { 0x10800, U_LEFT_TO_RIGHT },
1153 { 0x11000, U_RIGHT_TO_LEFT },
1154 { 0x110000, U_LEFT_TO_RIGHT }
1155 };
1156
1157 UChar32 c;
1158 int32_t i;
1159 UCharDirection shouldBeDir;
1160
b75a7d8f
A
1161 /*
1162 * LineBreak.txt specifies:
1163 * # - Assigned characters that are not listed explicitly are given the value
1164 * # "AL".
1165 * # - Unassigned characters are given the value "XX".
1166 *
1167 * PUA characters are listed explicitly with "XX".
1168 * Verify that no assigned character has "XX".
1169 */
1170 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1171 c=start;
1172 while(c<limit) {
1173 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1174 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1175 }
1176 ++c;
1177 }
1178 }
1179
1180 /*
1181 * Verify default Bidi classes.
374ca955
A
1182 * For recent Unicode versions, see UCD.html.
1183 *
1184 * For older Unicode versions:
b75a7d8f
A
1185 * See table 3-7 "Bidirectional Character Types" in UAX #9.
1186 * http://www.unicode.org/reports/tr9/
1187 *
1188 * See also DerivedBidiClass.txt for Cn code points!
374ca955
A
1189 *
1190 * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html)
1191 * changed some default values.
1192 * In particular, non-characters and unassigned Default Ignorable Code Points
1193 * change from L to BN.
1194 *
1195 * UCD.html version 4.0.1 does not yet reflect these changes.
b75a7d8f
A
1196 */
1197 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1198 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1199 c=start;
1200 for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) {
1201 if((int32_t)c<defaultBidi[i][0]) {
1202 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
374ca955
A
1203 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1204 shouldBeDir=U_BOUNDARY_NEUTRAL;
1205 } else {
1206 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1207 }
1208
1209 if( u_charDirection(c)!=shouldBeDir ||
1210 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
b75a7d8f
A
1211 ) {
1212 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
374ca955 1213 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
b75a7d8f
A
1214 }
1215 ++c;
1216 }
1217 }
1218 }
1219 }
1220
1221 return TRUE;
1222}
1223
1224/* tests for several properties */
1225static void TestUnicodeData()
1226{
b75a7d8f
A
1227 UVersionInfo expectVersionArray;
1228 UVersionInfo versionArray;
1229 char *fields[15][2];
1230 UErrorCode errorCode;
1231 UChar32 c;
1232 int8_t type;
1233
b75a7d8f
A
1234 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1235 u_getUnicodeVersion(versionArray);
1236 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1237 {
1238 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1239 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1240 }
1241
1242#if defined(ICU_UNICODE_VERSION)
1243 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1244 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1245 {
1246 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1247 }
1248#endif
1249
1250 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1251 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1252 }
1253
1254 errorCode=U_ZERO_ERROR;
73c04bcf 1255 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, NULL, &errorCode);
b75a7d8f 1256 if(U_FAILURE(errorCode)) {
b75a7d8f
A
1257 return; /* if we couldn't parse UnicodeData.txt, we should return */
1258 }
1259
1260 /* sanity check on repeated properties */
1261 for(c=0xfffe; c<=0x10ffff;) {
1262 type=u_charType(c);
1263 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1264 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1265 }
1266 if(type!=U_UNASSIGNED) {
1267 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1268 }
1269 if((c&0xffff)==0xfffe) {
1270 ++c;
1271 } else {
1272 c+=0xffff;
1273 }
1274 }
1275
1276 /* test that PUA is not "unassigned" */
1277 for(c=0xe000; c<=0x10fffd;) {
1278 type=u_charType(c);
1279 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1280 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1281 }
1282 if(type==U_UNASSIGNED) {
1283 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1284 } else if(type!=U_PRIVATE_USE_CHAR) {
1285 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1286 }
1287 if(c==0xf8ff) {
1288 c=0xf0000;
1289 } else if(c==0xffffd) {
1290 c=0x100000;
1291 } else {
1292 ++c;
1293 }
1294 }
1295
1296 /* test u_enumCharTypes() */
1297 u_enumCharTypes(enumTypeRange, "a1");
374ca955
A
1298
1299 /* check default properties */
1300 u_enumCharTypes(enumDefaultsRange, NULL);
b75a7d8f
A
1301}
1302
1303static void TestCodeUnit(){
1304 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1305
1306 int32_t i;
1307
1308 for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){
1309 UChar c=codeunit[i];
1310 if(i<4){
1311 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1312 log_err("ERROR: U+%04x is a single", c);
1313 }
1314
1315 }
1316 if(i >= 4 && i< 8){
1317 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1318 log_err("ERROR: U+%04x is a first surrogate", c);
1319 }
1320 }
1321 if(i >= 8 && i< 12){
1322 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1323 log_err("ERROR: U+%04x is a second surrogate", c);
1324 }
1325 }
1326 }
1327
1328}
1329
1330static void TestCodePoint(){
1331 const UChar32 codePoint[]={
1332 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1333 0xd800,
1334 0xdbff,
1335 0xdc00,
1336 0xdfff,
1337 0xdc04,
1338 0xd821,
1339 /*not a surrogate, valid, isUnicodeChar , not Error*/
1340 0x20ac,
1341 0xd7ff,
1342 0xe000,
1343 0xe123,
1344 0x0061,
1345 0xe065,
1346 0x20402,
1347 0x24506,
1348 0x23456,
1349 0x20402,
1350 0x10402,
1351 0x23456,
1352 /*not a surrogate, not valid, isUnicodeChar, isError */
1353 0x0015,
1354 0x009f,
1355 /*not a surrogate, not valid, not isUnicodeChar, isError */
1356 0xffff,
1357 0xfffe,
1358 };
1359 int32_t i;
1360 for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){
1361 UChar32 c=codePoint[i];
1362 if(i<6){
1363 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1364 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1365 }
1366 if(UTF_IS_VALID(c)){
1367 log_err("ERROR: isValid() failed for U+%04x\n", c);
1368 }
1369 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1370 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1371 }
1372 if(UTF_IS_ERROR(c)){
1373 log_err("ERROR: isError() failed for U+%04x\n", c);
1374 }
1375 }else if(i >=6 && i<18){
1376 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1377 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1378 }
1379 if(!UTF_IS_VALID(c)){
1380 log_err("ERROR: isValid() failed for U+%04x\n", c);
1381 }
1382 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1383 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1384 }
1385 if(UTF_IS_ERROR(c)){
1386 log_err("ERROR: isError() failed for U+%04x\n", c);
1387 }
1388 }else if(i >=18 && i<20){
1389 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1390 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1391 }
1392 if(UTF_IS_VALID(c)){
1393 log_err("ERROR: isValid() failed for U+%04x\n", c);
1394 }
1395 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1396 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1397 }
1398 if(!UTF_IS_ERROR(c)){
1399 log_err("ERROR: isError() failed for U+%04x\n", c);
1400 }
1401 }
1402 else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
1403 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1404 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1405 }
1406 if(UTF_IS_VALID(c)){
1407 log_err("ERROR: isValid() failed for U+%04x\n", c);
1408 }
1409 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1410 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1411 }
1412 if(!UTF_IS_ERROR(c)){
1413 log_err("ERROR: isError() failed for U+%04x\n", c);
1414 }
1415 }
1416 }
1417
374ca955
A
1418 if(
1419 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1420 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1421 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1422 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1423 ) {
1424 log_err("error with U_IS_BMP()\n");
1425 }
1426
1427 if(
1428 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1429 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1430 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1431 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1432 ) {
1433 log_err("error with U_IS_SUPPLEMENTARY()\n");
1434 }
b75a7d8f
A
1435}
1436
1437static void TestCharLength()
1438{
1439 const int32_t codepoint[]={
1440 1, 0x0061,
1441 1, 0xe065,
1442 1, 0x20ac,
1443 2, 0x20402,
1444 2, 0x23456,
1445 2, 0x24506,
1446 2, 0x20402,
1447 2, 0x10402,
1448 1, 0xd7ff,
1449 1, 0xe000
1450 };
1451
1452 int32_t i;
1453 UBool multiple;
1454 for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
1455 UChar32 c=codepoint[i+1];
1456 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1457 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], UTF_CHAR_LENGTH(c));
1458 }
1459 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1460 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1461 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1462 }
1463 }
1464}
1465
1466/*internal functions ----*/
1467static int32_t MakeProp(char* str)
1468{
1469 int32_t result = 0;
1470 char* matchPosition =0;
1471
1472 matchPosition = strstr(tagStrings, str);
1473 if (matchPosition == 0)
1474 {
1475 log_err("unrecognized type letter ");
1476 log_err(str);
1477 }
374ca955
A
1478 else
1479 result = (int32_t)((matchPosition - tagStrings) / 2);
b75a7d8f
A
1480 return result;
1481}
1482
1483static int32_t MakeDir(char* str)
1484{
1485 int32_t pos = 0;
1486 for (pos = 0; pos < 19; pos++) {
1487 if (strcmp(str, dirStrings[pos]) == 0) {
1488 return pos;
1489 }
1490 }
1491 return -1;
1492}
1493
1494/* test u_charName() -------------------------------------------------------- */
1495
/*
 * Sample code points with their expected names:
 * name    - modern Unicode character name ("" if none is expected)
 * oldName - Unicode 1.0 name ("" if none is expected)
 * extName - extended name
 */
static const struct {
    uint32_t code;
    const char *name;
    const char *oldName;
    const char *extName;
} names[]={
    {
        0x0061,
        "LATIN SMALL LETTER A",
        "",
        "LATIN SMALL LETTER A"
    },
    {
        0x0284,
        "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
        "LATIN SMALL LETTER DOTLESS J BAR HOOK",
        "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK"
    },
    {
        0x3401,
        "CJK UNIFIED IDEOGRAPH-3401",
        "",
        "CJK UNIFIED IDEOGRAPH-3401"
    },
    {
        0x7fed,
        "CJK UNIFIED IDEOGRAPH-7FED",
        "",
        "CJK UNIFIED IDEOGRAPH-7FED"
    },
    {
        0xac00,
        "HANGUL SYLLABLE GA",
        "",
        "HANGUL SYLLABLE GA"
    },
    {
        0xd7a3,
        "HANGUL SYLLABLE HIH",
        "",
        "HANGUL SYLLABLE HIH"
    },
    {
        0xd800,
        "",
        "",
        "<lead surrogate-D800>"
    },
    {
        0xdc00,
        "",
        "",
        "<trail surrogate-DC00>"
    },
    {
        0xff08,
        "FULLWIDTH LEFT PARENTHESIS",
        "FULLWIDTH OPENING PARENTHESIS",
        "FULLWIDTH LEFT PARENTHESIS"
    },
    {
        0xffe5,
        "FULLWIDTH YEN SIGN",
        "",
        "FULLWIDTH YEN SIGN"
    },
    {
        0xffff,
        "",
        "",
        "<noncharacter-FFFF>"
    },
    {
        0x23456,
        "CJK UNIFIED IDEOGRAPH-23456",
        "",
        "CJK UNIFIED IDEOGRAPH-23456"
    }
};
1513
1514static UBool
1515enumCharNamesFn(void *context,
1516 UChar32 code, UCharNameChoice nameChoice,
1517 const char *name, int32_t length) {
1518 int32_t *pCount=(int32_t *)context;
1519 int i;
1520
1521 if(length<=0 || length!=(int32_t)strlen(name)) {
1522 /* should not be called with an empty string or invalid length */
1523 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1524 return TRUE;
1525 }
1526
1527 ++*pCount;
1528 for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) {
1529 if(code==(UChar32)names[i].code) {
1530 switch (nameChoice) {
1531 case U_EXTENDED_CHAR_NAME:
1532 if(0!=strcmp(name, names[i].extName)) {
1533 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1534 }
1535 break;
1536 case U_UNICODE_CHAR_NAME:
1537 if(0!=strcmp(name, names[i].name)) {
1538 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1539 }
1540 break;
1541 case U_UNICODE_10_CHAR_NAME:
1542 if(names[i].oldName[0]==0 || 0!=strcmp(name, names[i].oldName)) {
1543 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, names[i].oldName);
1544 }
1545 break;
1546 case U_CHAR_NAME_CHOICE_COUNT:
1547 break;
1548 }
1549 break;
1550 }
1551 }
1552 return TRUE;
1553}
1554
/* State shared between successive enumExtCharNamesFn() callbacks. */
struct enumExtCharNamesContext {
    /* name counter; its address is handed to enumCharNamesFn() as the context */
    uint32_t length;

    /* code point seen by the previous callback; the caller starts it at -1 */
    int32_t last;
};
1559
1560static UBool
1561enumExtCharNamesFn(void *context,
1562 UChar32 code, UCharNameChoice nameChoice,
1563 const char *name, int32_t length) {
1564 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1565
1566 if (ecncp->last != (int32_t) code - 1) {
1567 if (ecncp->last < 0) {
1568 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1569 } else {
1570 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1571 }
1572 }
1573 ecncp->last = (int32_t) code;
1574
1575 if (!*name) {
1576 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1577 }
1578
1579 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1580}
1581
1582/**
1583 * This can be made more efficient by moving it into putil.c and having
1584 * it directly access the ebcdic translation tables.
1585 * TODO: If we get this method in putil.c, then delete it from here.
1586 */
1587static UChar
1588u_charToUChar(char c) {
1589 UChar uc;
1590 u_charsToUChars(&c, &uc, 1);
1591 return uc;
1592}
1593
1594static void
1595TestCharNames() {
1596 static char name[80];
1597 UErrorCode errorCode=U_ZERO_ERROR;
1598 struct enumExtCharNamesContext extContext;
1599 int32_t length;
1600 UChar32 c;
1601 int32_t i;
1602
1603 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1604 length=uprv_getMaxCharNameLength();
1605 if(length==0) {
1606 /* no names data available */
1607 return;
1608 }
1609 if(length<83) { /* Unicode 3.2 max char name length */
1610 log_err("uprv_getMaxCharNameLength()=%d is too short");
1611 }
1612 /* ### TODO same tests for max ISO comment length as for max name length */
1613
1614 log_verbose("Testing u_charName()\n");
1615 for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) {
1616 /* modern Unicode character name */
1617 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1618 if(U_FAILURE(errorCode)) {
1619 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1620 return;
1621 }
1622 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1623 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1624 }
1625
1626 /* find the modern name */
1627 if (*names[i].name) {
1628 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1629 if(U_FAILURE(errorCode)) {
1630 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1631 return;
1632 }
1633 if(c!=(UChar32)names[i].code) {
1634 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1635 }
1636 }
1637
1638 /* Unicode 1.0 character name */
1639 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1640 if(U_FAILURE(errorCode)) {
1641 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1642 return;
1643 }
1644 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1645 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1646 }
1647
1648 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1649 if(names[i].oldName[0]!=0 /* && length>0 */) {
1650 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1651 if(U_FAILURE(errorCode)) {
1652 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1653 return;
1654 }
1655 if(c!=(UChar32)names[i].code) {
1656 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1657 }
1658 }
1659 }
1660
1661 /* test u_enumCharNames() */
1662 length=0;
1663 errorCode=U_ZERO_ERROR;
1664 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1665 if(U_FAILURE(errorCode) || length<94140) {
1666 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1667 }
1668
1669 extContext.length = 0;
1670 extContext.last = -1;
1671 errorCode=U_ZERO_ERROR;
1672 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1673 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1674 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1675 }
1676
1677 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1678 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1679 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1680 }
1681
1682 /* Test getCharNameCharacters */
1683 if(!QUICK) {
1684 enum { BUFSIZE = 256 };
1685 UErrorCode ec = U_ZERO_ERROR;
1686 char buf[BUFSIZE];
1687 int32_t maxLength;
1688 UChar32 cp;
1689 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1690 int32_t l1, l2;
1691 UBool map[256];
1692 UBool ok;
1693
1694 USet* set = uset_open(1, 0); /* empty set */
1695 USet* dumb = uset_open(1, 0); /* empty set */
1696
1697 /*
1698 * uprv_getCharNameCharacters() will likely return more lowercase
1699 * letters than actual character names contain because
1700 * it includes all the characters in lowercased names of
1701 * general categories, for the full possible set of extended names.
1702 */
374ca955
A
1703 {
1704 USetAdder sa={
1705 NULL,
1706 uset_add,
1707 uset_addRange,
73c04bcf
A
1708 uset_addString,
1709 NULL /* don't need remove() */
374ca955
A
1710 };
1711 sa.set=set;
1712 uprv_getCharNameCharacters(&sa);
1713 }
b75a7d8f
A
1714
1715 /* build set the dumb (but sure-fire) way */
374ca955 1716 for (i=0; i<256; ++i) {
b75a7d8f 1717 map[i] = FALSE;
374ca955 1718 }
b75a7d8f
A
1719
1720 maxLength=0;
1721 for (cp=0; cp<0x110000; ++cp) {
1722 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1723 buf, BUFSIZE, &ec);
1724 if (U_FAILURE(ec)) {
1725 log_err("FAIL: u_charName failed when it shouldn't\n");
1726 uset_close(set);
1727 uset_close(dumb);
1728 return;
1729 }
1730 if(len>maxLength) {
1731 maxLength=len;
1732 }
1733
1734 for (i=0; i<len; ++i) {
1735 if (!map[(uint8_t) buf[i]]) {
1736 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1737 map[(uint8_t) buf[i]] = TRUE;
1738 }
1739 }
374ca955
A
1740
1741 /* test for leading/trailing whitespace */
1742 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1743 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1744 }
1745 }
1746
1747 if(map[(uint8_t)'\t']) {
1748 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
b75a7d8f
A
1749 }
1750
1751 length=uprv_getMaxCharNameLength();
1752 if(length!=maxLength) {
1753 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1754 length, maxLength);
1755 }
1756
1757 /* compare the sets. Where is my uset_equals?!! */
1758 ok=TRUE;
1759 for(i=0; i<256; ++i) {
1760 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1761 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1762 /* ignore lowercase a-z that are in set but not in dumb */
1763 ok=TRUE;
1764 } else {
1765 ok=FALSE;
1766 break;
1767 }
1768 }
1769 }
1770
1771 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1772 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1773 if (U_FAILURE(ec)) {
1774 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1775 uset_close(set);
1776 uset_close(dumb);
1777 return;
1778 }
1779
1780 if (l1 >= BUFSIZE) {
1781 l1 = BUFSIZE-1;
1782 pat[l1] = 0;
1783 }
1784 if (l2 >= BUFSIZE) {
1785 l2 = BUFSIZE-1;
1786 dumbPat[l2] = 0;
1787 }
1788
1789 if (!ok) {
b75a7d8f 1790 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
374ca955
A
1791 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1792 } else if(VERBOSITY) {
1793 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
b75a7d8f
A
1794 }
1795
1796 uset_close(set);
1797 uset_close(dumb);
1798 }
1799
1800 /* ### TODO: test error cases and other interesting things */
1801}
1802
1803/* test u_isMirrored() and u_charMirror() ----------------------------------- */
1804
1805static void
1806TestMirroring() {
73c04bcf
A
1807 USet *set;
1808 UErrorCode errorCode;
1809
1810 UChar32 start, end, c2, c3;
1811 int32_t i;
1812
1813 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1814
1815 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1816
b75a7d8f
A
1817 log_verbose("Testing u_isMirrored()\n");
1818 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1819 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1820 )
1821 ) {
1822 log_err("u_isMirrored() does not work correctly\n");
1823 }
1824
1825 log_verbose("Testing u_charMirror()\n");
1826 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
73c04bcf 1827 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
46f4442e
A
1828 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
1829 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
1830 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
b75a7d8f
A
1831 )
1832 ) {
1833 log_err("u_charMirror() does not work correctly\n");
1834 }
73c04bcf
A
1835
1836 /* verify that Bidi_Mirroring_Glyph roundtrips */
1837 errorCode=U_ZERO_ERROR;
1838 set=uset_openPattern(mirroredPattern, 17, &errorCode);
1839
1840 if (U_FAILURE(errorCode)) {
1841 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!");
1842 } else {
1843 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
1844 do {
1845 c2=u_charMirror(start);
1846 c3=u_charMirror(c2);
1847 if(c3!=start) {
1848 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
1849 }
1850 } while(++start<=end);
1851 }
1852 }
1853
1854 uset_close(set);
b75a7d8f
A
1855}
1856
1857
1858struct RunTestData
1859{
1860 const char *runText;
1861 UScriptCode runCode;
1862};
1863
1864typedef struct RunTestData RunTestData;
1865
1866static void
1867CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
1868 const char *prefix)
1869{
1870 int32_t run, runStart, runLimit;
1871 UScriptCode runCode;
1872
1873 /* iterate over all the runs */
1874 run = 0;
1875 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
1876 if (runStart != runStarts[run]) {
1877 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
1878 prefix, run, runStarts[run], runStart);
1879 }
1880
1881 if (runLimit != runStarts[run + 1]) {
1882 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
1883 prefix, run, runStarts[run + 1], runLimit);
1884 }
1885
1886 if (runCode != testData[run].runCode) {
1887 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
1888 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
1889 }
1890
1891 run += 1;
1892
1893 /* stop when we've seen all the runs we expect to see */
1894 if (run >= nRuns) {
1895 break;
1896 }
1897 }
1898
1899 /* Complain if we didn't see then number of runs we expected */
1900 if (run != nRuns) {
1901 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
1902 }
1903}
1904
1905static void
1906TestUScriptRunAPI()
1907{
374ca955 1908 static const RunTestData testData1[] = {
b75a7d8f
A
1909 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
1910 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
1911 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
1912 {"English (", USCRIPT_LATIN},
1913 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
1914 {") ", USCRIPT_LATIN},
1915 {"\\u6F22\\u5B75", USCRIPT_HAN},
1916 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
1917 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
1918 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
1919 };
374ca955
A
1920
1921 static const RunTestData testData2[] = {
1922 {"((((((((((abc))))))))))", USCRIPT_LATIN}
1923 };
1924
1925 static const struct {
1926 const RunTestData *testData;
1927 int32_t nRuns;
1928 } testDataEntries[] = {
1929 {testData1, LENGTHOF(testData1)},
1930 {testData2, LENGTHOF(testData2)}
1931 };
1932
1933 static const int32_t nTestEntries = LENGTHOF(testDataEntries);
1934 int32_t testEntry;
1935
1936 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
1937 UChar testString[1024];
1938 int32_t runStarts[256];
1939 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
1940 const RunTestData *testData = testDataEntries[testEntry].testData;
1941
1942 int32_t run, stringLimit;
1943 UScriptRun *scriptRun = NULL;
1944 UErrorCode err;
1945
1946 /*
1947 * Fill in the test string and the runStarts array.
1948 */
1949 stringLimit = 0;
1950 for (run = 0; run < nTestRuns; run += 1) {
1951 runStarts[run] = stringLimit;
1952 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
1953 /*stringLimit -= 1;*/
1954 }
1955
1956 /* The limit of the last run */
1957 runStarts[nTestRuns] = stringLimit;
1958
1959 /*
1960 * Make sure that calling uscript_OpenRun with a NULL text pointer
1961 * and a non-zero text length returns the correct error.
1962 */
1963 err = U_ZERO_ERROR;
1964 scriptRun = uscript_openRun(NULL, stringLimit, &err);
1965
1966 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1967 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1968 }
1969
1970 if (scriptRun != NULL) {
1971 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
1972 uscript_closeRun(scriptRun);
1973 }
1974
1975 /*
1976 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
1977 * and a zero text length returns the correct error.
1978 */
1979 err = U_ZERO_ERROR;
1980 scriptRun = uscript_openRun(testString, 0, &err);
1981
1982 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1983 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1984 }
1985
1986 if (scriptRun != NULL) {
1987 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
1988 uscript_closeRun(scriptRun);
1989 }
1990
1991 /*
1992 * Make sure that calling uscript_openRun with a NULL text pointer
1993 * and a zero text length doesn't return an error.
1994 */
1995 err = U_ZERO_ERROR;
1996 scriptRun = uscript_openRun(NULL, 0, &err);
1997
1998 if (U_FAILURE(err)) {
1999 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2000 }
2001
2002 /* Make sure that the empty iterator doesn't find any runs */
2003 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2004 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2005 }
2006
2007 /*
2008 * Make sure that calling uscript_setRunText with a NULL text pointer
2009 * and a non-zero text length returns the correct error.
2010 */
2011 err = U_ZERO_ERROR;
2012 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2013
2014 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2015 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2016 }
2017
2018 /*
2019 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2020 * and a zero text length returns the correct error.
2021 */
2022 err = U_ZERO_ERROR;
2023 uscript_setRunText(scriptRun, testString, 0, &err);
2024
2025 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2026 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2027 }
2028
2029 /*
2030 * Now call uscript_setRunText on the empty iterator
2031 * and make sure that it works.
2032 */
2033 err = U_ZERO_ERROR;
2034 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2035
2036 if (U_FAILURE(err)) {
2037 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2038 } else {
2039 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2040 }
2041
b75a7d8f 2042 uscript_closeRun(scriptRun);
374ca955
A
2043
2044 /*
2045 * Now open an interator over the testString
2046 * using uscript_openRun and make sure that it works
2047 */
2048 scriptRun = uscript_openRun(testString, stringLimit, &err);
2049
2050 if (U_FAILURE(err)) {
2051 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2052 } else {
2053 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2054 }
2055
2056 /* Now reset the iterator, and make sure
2057 * that it still works.
2058 */
2059 uscript_resetRun(scriptRun);
2060
2061 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2062
2063 /* Close the iterator */
b75a7d8f
A
2064 uscript_closeRun(scriptRun);
2065 }
b75a7d8f
A
2066}
2067
2068/* test additional, non-core properties */
2069static void
2070TestAdditionalProperties() {
2071 /* test data for u_charAge() */
2072 static const struct {
2073 UChar32 c;
2074 UVersionInfo version;
2075 } charAges[]={
2076 {0x41, { 1, 1, 0, 0 }},
2077 {0xffff, { 1, 1, 0, 0 }},
2078 {0x20ab, { 2, 0, 0, 0 }},
2079 {0x2fffe, { 2, 0, 0, 0 }},
2080 {0x20ac, { 2, 1, 0, 0 }},
2081 {0xfb1d, { 3, 0, 0, 0 }},
2082 {0x3f4, { 3, 1, 0, 0 }},
2083 {0x10300, { 3, 1, 0, 0 }},
2084 {0x220, { 3, 2, 0, 0 }},
2085 {0xff60, { 3, 2, 0, 0 }}
2086 };
2087
2088 /* test data for u_hasBinaryProperty() */
46f4442e 2089 static const int32_t
b75a7d8f
A
2090 props[][3]={ /* code point, property, value */
2091 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2092 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2093 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2094
2095 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2096 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2097
2098 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2099 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2100
2101 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2102 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2103
46f4442e
A
2104 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2105 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2106 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2107 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2108 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2109
b75a7d8f
A
2110 { 0x058a, UCHAR_DASH, TRUE },
2111 { 0x007e, UCHAR_DASH, FALSE },
2112
2113 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2114 { 0x3000, UCHAR_DIACRITIC, FALSE },
2115
2116 { 0x0e46, UCHAR_EXTENDER, TRUE },
2117 { 0x0020, UCHAR_EXTENDER, FALSE },
2118
2119#if !UCONFIG_NO_NORMALIZATION
2120 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2121 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2122 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
374ca955
A
2123
2124 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2125 { 0x0308, UCHAR_NFD_INERT, FALSE },
2126
2127 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2128 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2129
2130 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2131 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2132 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2133 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2134 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2135 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2136
2137 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2138 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2139
2140 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2141 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2142 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2143 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2144 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2145 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
b75a7d8f
A
2146#endif
2147
2148 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2149 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2150 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2151
2152 { 0x30fb, UCHAR_HYPHEN, TRUE },
2153 { 0xfe58, UCHAR_HYPHEN, FALSE },
2154
2155 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2156 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2157 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2158
2159 { 0x2172, UCHAR_ID_START, TRUE },
2160 { 0x007a, UCHAR_ID_START, TRUE },
2161 { 0x0039, UCHAR_ID_START, FALSE },
2162
2163 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2164 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2165 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2166
2167 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2168 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2169
2170 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2171 { 0x0345, UCHAR_LOWERCASE, TRUE },
2172 { 0x0030, UCHAR_LOWERCASE, FALSE },
2173
2174 { 0x1d7a9, UCHAR_MATH, TRUE },
2175 { 0x2135, UCHAR_MATH, TRUE },
2176 { 0x0062, UCHAR_MATH, FALSE },
2177
2178 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2179 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2180 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2181
2182 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2183 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2184 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2185
2186 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2187 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2188
2189 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2190 { 0x2162, UCHAR_UPPERCASE, TRUE },
2191 { 0x0345, UCHAR_UPPERCASE, FALSE },
2192
2193 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2194 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2195 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2196
2197 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2198 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2199 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2200
2201 { 0x16ee, UCHAR_XID_START, TRUE },
2202 { 0x23456, UCHAR_XID_START, TRUE },
2203 { 0x1d1aa, UCHAR_XID_START, FALSE },
2204
2205 /*
2206 * Version break:
2207 * The following properties are only supported starting with the
2208 * Unicode version indicated in the second field.
2209 */
374ca955 2210 { -1, 0x320, 0 },
b75a7d8f
A
2211
2212 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2213 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2214 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2215
2216 { 0x0341, UCHAR_DEPRECATED, TRUE },
46f4442e
A
2217 { 0xe0041, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2218 { 0xe0100, UCHAR_DEPRECATED, FALSE },
b75a7d8f
A
2219
2220 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2221 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
46f4442e
A
2222 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2223 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
b75a7d8f
A
2224
2225 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
46f4442e
A
2226 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2227 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
b75a7d8f
A
2228 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2229
2230 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2231 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2232
2233 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2234 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2235
2236 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2237 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2238
2239 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2240 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2241
2242 { 0x2e9b, UCHAR_RADICAL, TRUE },
2243 { 0x4e00, UCHAR_RADICAL, FALSE },
2244
2245 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2246 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2247
2248 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2249 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2250
73c04bcf 2251 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
374ca955
A
2252
2253 { 0x002e, UCHAR_S_TERM, TRUE },
2254 { 0x0061, UCHAR_S_TERM, FALSE },
2255
2256 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2257 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2258 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2259 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2260
b75a7d8f
A
2261 /* enum/integer type properties */
2262
2263 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2264 /* test default Bidi classes for unassigned code points */
2265 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
73c04bcf 2266 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
b75a7d8f 2267 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
73c04bcf
A
2268 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2269 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
b75a7d8f
A
2270 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2271 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2272 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2273 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2274 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2275 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2276
46f4442e 2277 { 0x0605, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
b75a7d8f
A
2278 { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2279 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2280 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2281 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2282 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2283 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2284 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2285
2286 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2287 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2288 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2289 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
374ca955 2290 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
b75a7d8f
A
2291 { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2292 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2293 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
374ca955 2294 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
b75a7d8f 2295 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
374ca955 2296 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
b75a7d8f
A
2297
2298 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2299 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2300
2301 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2302 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2303 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2304 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2305 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2306 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2307 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2308 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2309 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2310
2311 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2312 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2313 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2314 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2315 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2316 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2317 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2318 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2319 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2320 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2321 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2322 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2323 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2324 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2325 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2326 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2327 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2328
2329 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2330 { 0xd7d7, UCHAR_GENERAL_CATEGORY, 0 },
2331
2332 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2333 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2334 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2335 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2336 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2337 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_HAMZA_ON_HEH_GOAL },
2338
2339 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2340 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2341 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2342 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2343 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2344 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2345 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2346 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2347
2348 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2349 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2350 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2351 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2352 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2353 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2354 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
b75a7d8f
A
2355 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2356 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2357 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2358 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2359 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2360 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2361 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2362 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2363 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2364
2365 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2366
2367 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2368
2369 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2370 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2371 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2372 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2373
2374 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2375 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2376 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2377 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2378
2379 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2380 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2381 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2382 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2383
2384 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2385 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2386 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2387 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2388 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2389 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2390
2391 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2392 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2393 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2394 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2395
2396 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2397 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2398 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2399 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2400 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2401
2402 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2403
73c04bcf
A
2404 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2405
2406 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2407 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2408 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2409
2410 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2411 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2412 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2413 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2414 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2415
2416 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2417 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2418 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2419
2420 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2421 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2422 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2423 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2424
2425 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2426 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2427 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2428 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2429 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2430 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2431
2432 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2433 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2434 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2435 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2436
2437 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2438 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2439 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2440 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2441
2442 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2443 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2444 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2445 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2446
b75a7d8f
A
2447 /* undefined UProperty values */
2448 { 0x61, 0x4a7, 0 },
2449 { 0x234bc, 0x15ed, 0 }
2450 };
2451
2452 UVersionInfo version;
2453 UChar32 c;
2454 int32_t i, result, uVersion;
2455 UProperty which;
2456
2457 /* what is our Unicode version? */
2458 u_getUnicodeVersion(version);
374ca955 2459 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
b75a7d8f
A
2460
2461 u_charAge(0x20, version);
2462 if(version[0]==0) {
2463 /* no additional properties available */
2464 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2465 return;
2466 }
2467
2468 /* test u_charAge() */
2469 for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) {
2470 u_charAge(charAges[i].c, version);
2471 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2472 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2473 charAges[i].c,
2474 version[0], version[1], version[2], version[3],
2475 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2476 }
2477 }
2478
2479 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2480 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2481 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2482 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2483 u_getIntPropertyMinValue(0x2345)!=0
2484 ) {
2485 log_err("error: u_getIntPropertyMinValue() wrong\n");
2486 }
73c04bcf
A
2487 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2488 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2489 }
2490 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2491 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2492 }
46f4442e 2493 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
73c04bcf
A
2494 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2495 }
2496 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2497 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2498 }
2499 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2500 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2501 }
2502 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2503 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2504 }
2505 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2506 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2507 }
2508 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2509 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2510 }
2511 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2512 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2513 }
2514 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2515 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2516 }
2517 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2518 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2519 }
2520 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2521 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2522 }
2523 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2524 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2525 }
2526 /*JB#2410*/
2527 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2528 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2529 }
2530 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2531 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2532 }
2533 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2534 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2535 }
2536 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2537 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2538 }
2539 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2540 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
b75a7d8f
A
2541 }
2542
2543 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2544 for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) {
2545 if(props[i][0]<0) {
2546 /* Unicode version break */
2547 if(uVersion<props[i][1]) {
2548 break; /* do not test properties that are not yet supported */
2549 } else {
2550 continue; /* skip this row */
2551 }
2552 }
2553
2554 c=(UChar32)props[i][0];
2555 which=(UProperty)props[i][1];
2556
2557 if(which<UCHAR_INT_START) {
2558 result=u_hasBinaryProperty(c, which);
2559 if(result!=props[i][2]) {
2560 log_err("error: u_hasBinaryProperty(U+%04lx, %d)=%d is wrong (props[%d])\n",
2561 c, which, result, i);
2562 }
2563 }
2564
2565 result=u_getIntPropertyValue(c, which);
2566 if(result!=props[i][2]) {
2567 log_err("error: u_getIntPropertyValue(U+%04lx, 0x1000+%d)=%d is wrong, should be %d (props[%d])\n",
2568 c, (int32_t)which-0x1000, result, props[i][2], i);
2569 }
2570
2571 /* test separate functions, too */
2572 switch((UProperty)props[i][1]) {
2573 case UCHAR_ALPHABETIC:
2574 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2575 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2576 props[i][0], result, i);
2577 }
2578 break;
2579 case UCHAR_LOWERCASE:
2580 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2581 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2582 props[i][0], result, i);
2583 }
2584 break;
2585 case UCHAR_UPPERCASE:
2586 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2587 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2588 props[i][0], result, i);
2589 }
2590 break;
2591 case UCHAR_WHITE_SPACE:
2592 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2593 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2594 props[i][0], result, i);
2595 }
2596 break;
2597 default:
2598 break;
2599 }
2600 }
2601}
2602
2603static void
2604TestNumericProperties(void) {
2605 /* see UnicodeData.txt, DerivedNumericValues.txt */
2606 static const struct {
2607 UChar32 c;
2608 int32_t type;
2609 double numValue;
2610 } values[]={
2611 { 0x0F33, U_NT_NUMERIC, -1./2. },
2612 { 0x0C66, U_NT_DECIMAL, 0 },
2613 { 0x96f6, U_NT_NUMERIC, 0 },
2614 { 0x2159, U_NT_NUMERIC, 1./6. },
2615 { 0x00BD, U_NT_NUMERIC, 1./2. },
2616 { 0x0031, U_NT_DECIMAL, 1. },
2617 { 0x4e00, U_NT_NUMERIC, 1. },
2618 { 0x58f1, U_NT_NUMERIC, 1. },
2619 { 0x10320, U_NT_NUMERIC, 1. },
2620 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2621 { 0x00B2, U_NT_DIGIT, 2. },
2622 { 0x5f10, U_NT_NUMERIC, 2. },
2623 { 0x1813, U_NT_DECIMAL, 3. },
2624 { 0x5f0e, U_NT_NUMERIC, 3. },
2625 { 0x2173, U_NT_NUMERIC, 4. },
2626 { 0x8086, U_NT_NUMERIC, 4. },
2627 { 0x278E, U_NT_DIGIT, 5. },
2628 { 0x1D7F2, U_NT_DECIMAL, 6. },
2629 { 0x247A, U_NT_DIGIT, 7. },
2630 { 0x7396, U_NT_NUMERIC, 9. },
2631 { 0x1372, U_NT_NUMERIC, 10. },
2632 { 0x216B, U_NT_NUMERIC, 12. },
2633 { 0x16EE, U_NT_NUMERIC, 17. },
2634 { 0x249A, U_NT_NUMERIC, 19. },
2635 { 0x303A, U_NT_NUMERIC, 30. },
2636 { 0x5345, U_NT_NUMERIC, 30. },
2637 { 0x32B2, U_NT_NUMERIC, 37. },
2638 { 0x1375, U_NT_NUMERIC, 40. },
2639 { 0x10323, U_NT_NUMERIC, 50. },
2640 { 0x0BF1, U_NT_NUMERIC, 100. },
2641 { 0x964c, U_NT_NUMERIC, 100. },
2642 { 0x217E, U_NT_NUMERIC, 500. },
2643 { 0x2180, U_NT_NUMERIC, 1000. },
2644 { 0x4edf, U_NT_NUMERIC, 1000. },
2645 { 0x2181, U_NT_NUMERIC, 5000. },
2646 { 0x137C, U_NT_NUMERIC, 10000. },
2647 { 0x4e07, U_NT_NUMERIC, 10000. },
2648 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2649 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2650 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2651 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2652 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2653 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2654 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2655 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE }
2656 };
2657
2658 double nv;
2659 UChar32 c;
2660 int32_t i, type;
2661
2662 for(i=0; i<LENGTHOF(values); ++i) {
2663 c=values[i].c;
2664 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2665 nv=u_getNumericValue(c);
2666
2667 if(type!=values[i].type) {
2668 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2669 }
2670 if(0.000001 <= fabs(nv - values[i].numValue)) {
2671 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2672 }
2673 }
2674}
2675
2676/**
2677 * Test the property names and property value names API.
2678 */
2679static void
2680TestPropertyNames(void) {
2681 int32_t p, v, choice=0, rev;
2682 UBool atLeastSomething = FALSE;
2683
2684 for (p=0; ; ++p) {
46f4442e 2685 UProperty propEnum = (UProperty)p;
b75a7d8f
A
2686 UBool sawProp = FALSE;
2687 if(p > 10 && !atLeastSomething) {
2688 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2689 return;
2690 }
2691
2692 for (choice=0; ; ++choice) {
46f4442e 2693 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
b75a7d8f 2694 if (name) {
46f4442e
A
2695 if (!sawProp)
2696 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
b75a7d8f
A
2697 log_verbose("%d=\"%s\"", choice, name);
2698 sawProp = TRUE;
2699 atLeastSomething = TRUE;
2700
2701 /* test reverse mapping */
2702 rev = u_getPropertyEnum(name);
2703 if (rev != p) {
2704 log_err("Property round-trip failure: %d -> %s -> %d\n",
2705 p, name, rev);
2706 }
2707 }
2708 if (!name && choice>0) break;
2709 }
2710 if (sawProp) {
2711 /* looks like a valid property; check the values */
46f4442e 2712 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
b75a7d8f
A
2713 int32_t max = 0;
2714 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2715 max = 255;
2716 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2717 /* it's far too slow to iterate all the way up to
2718 the real max, U_GC_P_MASK */
2719 max = U_GC_NL_MASK;
2720 } else if (p == UCHAR_BLOCK) {
2721 /* UBlockCodes, unlike other values, start at 1 */
2722 max = 1;
2723 }
2724 log_verbose("\n");
2725 for (v=-1; ; ++v) {
2726 UBool sawValue = FALSE;
2727 for (choice=0; ; ++choice) {
46f4442e 2728 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
b75a7d8f
A
2729 if (vname) {
2730 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2731 log_verbose("%d=\"%s\"", choice, vname);
2732 sawValue = TRUE;
2733
2734 /* test reverse mapping */
46f4442e 2735 rev = u_getPropertyValueEnum(propEnum, vname);
b75a7d8f
A
2736 if (rev != v) {
2737 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2738 pname, v, vname, rev);
2739 }
2740 }
2741 if (!vname && choice>0) break;
2742 }
2743 if (sawValue) {
2744 log_verbose("\n");
2745 }
2746 if (!sawValue && v>=max) break;
2747 }
2748 }
2749 if (!sawProp) {
2750 if (p>=UCHAR_STRING_LIMIT) {
2751 break;
2752 } else if (p>=UCHAR_DOUBLE_LIMIT) {
2753 p = UCHAR_STRING_START - 1;
2754 } else if (p>=UCHAR_MASK_LIMIT) {
2755 p = UCHAR_DOUBLE_START - 1;
2756 } else if (p>=UCHAR_INT_LIMIT) {
2757 p = UCHAR_MASK_START - 1;
2758 } else if (p>=UCHAR_BINARY_LIMIT) {
2759 p = UCHAR_INT_START - 1;
2760 }
2761 }
2762 }
2763}
2764
2765/**
2766 * Test the property values API. See JB#2410.
2767 */
2768static void
2769TestPropertyValues(void) {
2770 int32_t i, p, min, max;
2771 UErrorCode ec;
2772
2773 /* Min should be 0 for everything. */
2774 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
2775 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
46f4442e
A
2776 UProperty propEnum = (UProperty)p;
2777 min = u_getIntPropertyMinValue(propEnum);
b75a7d8f
A
2778 if (min != 0) {
2779 if (p == UCHAR_BLOCK) {
2780 /* This is okay...for now. See JB#2487.
2781 TODO Update this for JB#2487. */
2782 } else {
2783 const char* name;
46f4442e
A
2784 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2785 if (name == NULL)
2786 name = "<ERROR>";
b75a7d8f
A
2787 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
2788 name, min);
2789 }
2790 }
2791 }
2792
2793 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
2794 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
2795 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
2796 }
2797
2798 /* Max should be -1 for invalid properties. */
46f4442e 2799 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
b75a7d8f
A
2800 if (max != -1) {
2801 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
2802 max);
2803 }
2804
73c04bcf 2805 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
b75a7d8f
A
2806 for (i=0; i<2; ++i) {
2807 int32_t script;
2808 const char* desc;
2809 ec = U_ZERO_ERROR;
2810 switch (i) {
2811 case 0:
2812 script = uscript_getScript(-1, &ec);
2813 desc = "uscript_getScript(-1)";
2814 break;
2815 case 1:
2816 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
2817 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
2818 break;
2819 default:
2820 log_err("Internal test error. Too many scripts\n");
2821 return;
2822 }
2823 /* We don't explicitly test ec. It should be U_FAILURE but it
2824 isn't documented as such. */
73c04bcf 2825 if (script != (int32_t)USCRIPT_INVALID_CODE) {
b75a7d8f
A
2826 log_err("FAIL: %s = %d, exp. 0\n",
2827 desc, script);
2828 }
2829 }
2830}
2831
2832/* add characters from a serialized set to a normal one */
2833static void
2834_setAddSerialized(USet *set, const USerializedSet *sset) {
2835 UChar32 start, end;
2836 int32_t i, count;
2837
2838 count=uset_getSerializedRangeCount(sset);
2839 for(i=0; i<count; ++i) {
2840 uset_getSerializedRange(sset, i, &start, &end);
2841 uset_addRange(set, start, end);
2842 }
2843}
2844
2845/* various tests for consistency of UCD data and API behavior */
2846static void
2847TestConsistency() {
2848#if !UCONFIG_NO_NORMALIZATION
2849 UChar buffer16[300];
2850#endif
2851 char buffer[300];
2852 USet *set1, *set2, *set3, *set4;
2853 UErrorCode errorCode;
2854
2855#if !UCONFIG_NO_NORMALIZATION
2856 USerializedSet sset;
2857#endif
2858 UChar32 start, end;
2859 int32_t i, length;
2860
2861 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
2862 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
2863 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
2864 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
2865 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
2866
73c04bcf
A
2867 U_STRING_DECL(mathBlocksPattern,
2868 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
2869 1+32+46+46+45+43+1+1); /* +1 for NUL */
2870 U_STRING_DECL(mathPattern, "[:Math:]", 8);
2871 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
2872 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
2873 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
2874
b75a7d8f
A
2875 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
2876 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
2877 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
2878 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
2879 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
2880
73c04bcf
A
2881 U_STRING_INIT(mathBlocksPattern,
2882 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
2883 1+32+46+46+45+43+1+1); /* +1 for NUL */
2884 U_STRING_INIT(mathPattern, "[:Math:]", 8);
2885 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
2886 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
2887 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
2888
b75a7d8f
A
2889 /*
2890 * It used to be that UCD.html and its precursors said
2891 * "Those dashes used to mark connections between pieces of words,
2892 * plus the Katakana middle dot."
2893 *
2894 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
2895 * but not from Hyphen.
2896 * UTC 94 (2003mar) decided to leave it that way and to changed UCD.html.
2897 * Therefore, do not show errors when testing the Hyphen property.
2898 */
2899 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
2900 "known to the UTC and not considered errors.\n");
2901
2902 errorCode=U_ZERO_ERROR;
2903 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
2904 set2=uset_openPattern(dashPattern, 8, &errorCode);
2905 if(U_SUCCESS(errorCode)) {
2906 /* remove the Katakana middle dot(s) from set1 */
2907 uset_remove(set1, 0x30fb);
2908 uset_remove(set1, 0xff65); /* halfwidth variant */
2909 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
2910 } else {
2911 log_err("error opening [:Hyphen:] or [:Dash:] - %s\n", u_errorName(errorCode));
2912 }
2913
2914 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
2915 set3=uset_openPattern(formatPattern, 6, &errorCode);
2916 set4=uset_openPattern(alphaPattern, 14, &errorCode);
2917 if(U_SUCCESS(errorCode)) {
2918 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
2919 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
2920 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
2921 } else {
2922 log_err("error opening [:Cf:] or [:Alpbabetic:] - %s\n", u_errorName(errorCode));
2923 }
2924
2925 uset_close(set1);
2926 uset_close(set2);
2927 uset_close(set3);
2928 uset_close(set4);
2929
2930 /*
2931 * Check that each lowercase character has "small" in its name
2932 * and not "capital".
2933 * There are some such characters, some of which seem odd.
2934 * Use the verbose flag to see these notices.
2935 */
2936 errorCode=U_ZERO_ERROR;
2937 set1=uset_openPattern(lowerPattern, 13, &errorCode);
2938 if(U_SUCCESS(errorCode)) {
2939 for(i=0;; ++i) {
2940 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
2941 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
2942 break; /* done */
2943 }
2944 if(U_FAILURE(errorCode)) {
2945 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
2946 i, u_errorName(errorCode));
2947 break;
2948 }
2949 if(length!=0) {
2950 break; /* done with code points, got a string or -1 */
2951 }
2952
2953 while(start<=end) {
2954 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
2955 if(U_FAILURE(errorCode)) {
2956 log_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
2957 errorCode=U_ZERO_ERROR;
2958 continue;
2959 }
2960 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
2961 strstr(buffer, "SMALL CAPITAL")==NULL
2962 ) {
2963 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
2964 }
2965 ++start;
2966 }
2967 }
2968 } else {
2969 log_err("error opening [:Lowercase:] - %s\n", u_errorName(errorCode));
2970 }
2971 uset_close(set1);
2972
2973#if !UCONFIG_NO_NORMALIZATION
2974
2975 /*
2976 * Test for an example that unorm_getCanonStartSet() delivers
2977 * all characters that compose from the input one,
2978 * even in multiple steps.
2979 * For example, the set for "I" (0049) should contain both
2980 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
2981 * In general, the set for the middle such character should be a subset
2982 * of the set for the first.
2983 */
2984 set1=uset_open(1, 0);
2985 set2=uset_open(1, 0);
2986
374ca955
A
2987 if (unorm_getCanonStartSet(0x49, &sset)) {
2988 _setAddSerialized(set1, &sset);
b75a7d8f 2989
374ca955
A
2990 /* enumerate all characters that are plausible to be latin letters */
2991 for(start=0xa0; start<0x2000; ++start) {
2992 if(unorm_getDecomposition(start, FALSE, buffer16, LENGTHOF(buffer16))>1 && buffer16[0]==0x49) {
2993 uset_add(set2, start);
2994 }
b75a7d8f 2995 }
374ca955
A
2996
2997 compareUSets(set1, set2,
2998 "[canon start set of 0049]", "[all c with canon decomp with 0049]",
2999 TRUE);
3000 } else {
3001 log_err("error calling unorm_getCanonStartSet()\n");
b75a7d8f
A
3002 }
3003
b75a7d8f
A
3004 uset_close(set1);
3005 uset_close(set2);
3006
3007#endif
73c04bcf
A
3008
3009 /* verify that all assigned characters in Math blocks are exactly Math characters */
3010 errorCode=U_ZERO_ERROR;
3011 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3012 set2=uset_openPattern(mathPattern, 8, &errorCode);
3013 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3014 if(U_SUCCESS(errorCode)) {
3015 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3016 uset_complement(set3); /* assigned characters */
3017 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3018 compareUSets(set1, set2,
3019 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3020 TRUE);
3021 } else {
3022 log_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s\n", u_errorName(errorCode));
3023 }
3024 uset_close(set1);
3025 uset_close(set2);
3026 uset_close(set3);
3027
3028 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3029 errorCode=U_ZERO_ERROR;
3030 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3031 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3032 if(U_SUCCESS(errorCode)) {
3033 compareUSets(set1, set2,
3034 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3035 TRUE);
3036 } else {
3037 log_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s\n", u_errorName(errorCode));
3038 }
3039 uset_close(set1);
3040 uset_close(set2);
b75a7d8f 3041}
374ca955 3042
73c04bcf
A
/*
 * Starting with ICU4C 3.4, the core Unicode properties files
 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
 * are hardcoded in the common DLL and therefore not included
 * in the data package any more.
 * Tests requiring these files are disabled so that
 * we need not jump through hoops (like adding snapshots of these files
 * to testdata).
 * See Jitterbug 4497.
 *
 * While this macro is nonzero, test bodies guarded by
 * "#if !HARDCODED_DATA_4497" are compiled out.
 */
#define HARDCODED_DATA_4497 1
3054
374ca955
A
3055/* API coverage for ucase.c */
3056static void TestUCase() {
73c04bcf 3057#if !HARDCODED_DATA_4497
374ca955
A
3058 UDataMemory *pData;
3059 UCaseProps *csp;
73c04bcf 3060 const UCaseProps *ccsp;
374ca955
A
3061 UErrorCode errorCode;
3062
3063 /* coverage for ucase_openBinary() */
3064 errorCode=U_ZERO_ERROR;
3065 pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode);
3066 if(U_FAILURE(errorCode)) {
3067 log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3068 u_errorName(errorCode));
3069 return;
3070 }
3071
3072 csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3073 if(U_FAILURE(errorCode)) {
3074 log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3075 u_errorName(errorCode));
3076 udata_close(pData);
3077 return;
3078 }
3079
3080 if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */
3081 log_err("ucase_openBinary() does not seem to return working UCaseProps\n");
3082 }
3083
3084 ucase_close(csp);
3085 udata_close(pData);
73c04bcf
A
3086
3087 /* coverage for ucase_getDummy() */
3088 errorCode=U_ZERO_ERROR;
3089 ccsp=ucase_getDummy(&errorCode);
3090 if(ucase_tolower(ccsp, 0x41)!=0x41) {
3091 log_err("ucase_tolower(dummy, A)!=A\n");
3092 }
46f4442e 3093#endif
73c04bcf
A
3094}
3095
3096/* API coverage for ubidi_props.c */
3097static void TestUBiDiProps() {
3098#if !HARDCODED_DATA_4497
3099 UDataMemory *pData;
3100 UBiDiProps *bdp;
73c04bcf
A
3101 const UBiDiProps *cbdp;
3102 UErrorCode errorCode;
3103
73c04bcf
A
3104 /* coverage for ubidi_openBinary() */
3105 errorCode=U_ZERO_ERROR;
3106 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
3107 if(U_FAILURE(errorCode)) {
3108 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3109 u_errorName(errorCode));
3110 return;
3111 }
3112
3113 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3114 if(U_FAILURE(errorCode)) {
3115 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3116 u_errorName(errorCode));
3117 udata_close(pData);
3118 return;
3119 }
3120
3121 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
3122 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
3123 }
3124
3125 ubidi_closeProps(bdp);
3126 udata_close(pData);
73c04bcf
A
3127
3128 /* coverage for ubidi_getDummy() */
3129 errorCode=U_ZERO_ERROR;
3130 cbdp=ubidi_getDummy(&errorCode);
3131 if(ubidi_getClass(cbdp, 0x20)!=0) {
3132 log_err("ubidi_getClass(dummy, space)!=0\n");
3133 }
46f4442e 3134#endif
73c04bcf
A
3135}
3136
3137/* test case folding, compare return values with CaseFolding.txt ------------ */
3138
/* bit flags recording which case foldings of a character have been tested already */
enum {
    CF_SIMPLE=0x1,
    CF_FULL=0x2,
    CF_TURKIC=0x4,
    /* all of the above */
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3146
3147static void
3148testFold(UChar32 c, int which,
3149 UChar32 simple, UChar32 turkic,
3150 const UChar *full, int32_t fullLength,
3151 const UChar *turkicFull, int32_t turkicFullLength) {
3152 UChar s[2], t[32];
3153 UChar32 c2;
3154 int32_t length, length2;
3155
3156 UErrorCode errorCode=U_ZERO_ERROR;
3157
3158 length=0;
3159 U16_APPEND_UNSAFE(s, length, c);
3160
3161 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3162 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3163 }
3164 if((which&CF_FULL)!=0) {
3165 length2=u_strFoldCase(t, LENGTHOF(t), s, length, 0, &errorCode);
3166 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3167 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3168 }
3169 }
3170 if((which&CF_TURKIC)!=0) {
3171 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3172 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3173 }
3174
3175 length2=u_strFoldCase(t, LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3176 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3177 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3178 }
3179 }
3180}
3181
3182/* test that c case-folds to itself */
3183static void
3184testFoldToSelf(UChar32 c, int which) {
3185 UChar s[2];
3186 int32_t length;
3187
3188 length=0;
3189 U16_APPEND_UNSAFE(s, length, c);
3190 testFold(c, which, c, c, s, length, s, length);
3191}
3192
3193struct CaseFoldingData {
3194 USet *notSeen;
3195 UChar32 prev, prevSimple;
3196 UChar prevFull[32];
3197 int32_t prevFullLength;
3198 int which;
3199};
3200typedef struct CaseFoldingData CaseFoldingData;
3201
3202static void U_CALLCONV
3203caseFoldingLineFn(void *context,
3204 char *fields[][2], int32_t fieldCount,
3205 UErrorCode *pErrorCode) {
3206 CaseFoldingData *pData=(CaseFoldingData *)context;
3207 char *end;
3208 UChar full[32];
3209 UChar32 c, prev, simple;
3210 int32_t count;
3211 int which;
3212 char status;
3213
3214 /* get code point */
3215 c=(UChar32)strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
3216 end=(char *)u_skipWhitespace(end);
3217 if(end<=fields[0][0] || end!=fields[0][1]) {
3218 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3219 *pErrorCode=U_PARSE_ERROR;
3220 return;
3221 }
3222
3223 /* get the status of this mapping */
3224 status=*u_skipWhitespace(fields[1][0]);
3225 if(status!='C' && status!='S' && status!='F' && status!='T') {
3226 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3227 *pErrorCode=U_PARSE_ERROR;
3228 return;
3229 }
3230
3231 /* get the mapping */
3232 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3233 if(U_FAILURE(*pErrorCode)) {
3234 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3235 return;
3236 }
3237
3238 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3239 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3240 simple=c;
3241 }
3242
3243 if(c!=(prev=pData->prev)) {
3244 /*
3245 * Test remaining mappings for the previous code point.
3246 * If a turkic folding was not mentioned, then it should fold the same
3247 * as the regular simple case folding.
3248 */
3249 UChar s[2];
3250 int32_t length;
3251
3252 length=0;
3253 U16_APPEND_UNSAFE(s, length, prev);
3254 testFold(prev, (~pData->which)&CF_ALL,
3255 prev, pData->prevSimple,
3256 s, length,
3257 pData->prevFull, pData->prevFullLength);
3258 pData->prev=pData->prevSimple=c;
3259 length=0;
3260 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3261 pData->prevFullLength=length;
3262 pData->which=0;
3263 }
3264
3265 /*
3266 * Turn the status into a bit set of case foldings to test.
3267 * Remember non-Turkic case foldings as defaults for Turkic mode.
3268 */
3269 switch(status) {
3270 case 'C':
3271 which=CF_SIMPLE|CF_FULL;
3272 pData->prevSimple=simple;
3273 u_memcpy(pData->prevFull, full, count);
3274 pData->prevFullLength=count;
3275 break;
3276 case 'S':
3277 which=CF_SIMPLE;
3278 pData->prevSimple=simple;
3279 break;
3280 case 'F':
3281 which=CF_FULL;
3282 u_memcpy(pData->prevFull, full, count);
3283 pData->prevFullLength=count;
3284 break;
3285 case 'T':
3286 which=CF_TURKIC;
3287 break;
3288 default:
3289 which=0;
3290 break; /* won't happen because of test above */
3291 }
3292
3293 testFold(c, which, simple, simple, full, count, full, count);
3294
3295 /* remember which case foldings of c have been tested */
3296 pData->which|=which;
3297
3298 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3299 uset_remove(pData->notSeen, c);
3300}
3301
3302static void
3303TestCaseFolding() {
3304 CaseFoldingData data={ NULL };
3305 char *fields[3][2];
3306 UErrorCode errorCode;
3307
3308 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3309
3310 errorCode=U_ZERO_ERROR;
3311 /* test BMP & plane 1 - nothing interesting above */
3312 data.notSeen=uset_open(0, 0x1ffff);
3313 data.prevFullLength=1; /* length of full case folding of U+0000 */
3314
3315 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3316 if(U_SUCCESS(errorCode)) {
3317 int32_t i, start, end;
3318
3319 /* add a pseudo-last line to finish testing of the actual last one */
3320 fields[0][0]=lastLine;
3321 fields[0][1]=lastLine+6;
3322 fields[1][0]=lastLine+7;
3323 fields[1][1]=lastLine+9;
3324 fields[2][0]=lastLine+10;
3325 fields[2][1]=lastLine+17;
3326 caseFoldingLineFn(&data, fields, 3, &errorCode);
3327
3328 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3329 for(i=0;
3330 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3331 U_SUCCESS(errorCode);
3332 ++i
3333 ) {
3334 do {
3335 testFoldToSelf(start, CF_ALL);
3336 } while(++start<=end);
3337 }
3338 }
3339
3340 uset_close(data.notSeen);
374ca955 3341}