]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/cucdtst.c
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/********************************************************************
4 * COPYRIGHT:
2ca993e8 5 * Copyright (c) 1997-2016, International Business Machines Corporation and
b75a7d8f
A
6 * others. All Rights Reserved.
7 ********************************************************************/
46f4442e 8/*******************************************************************************
b75a7d8f
A
9*
10* File CUCDTST.C
11*
12* Modification History:
13* Name Description
14* Madhu Katragadda Ported for C API, added tests for string functions
46f4442e 15********************************************************************************
b75a7d8f
A
16*/
17
18#include <string.h>
19#include <math.h>
20#include <stdlib.h>
21
22#include "unicode/utypes.h"
23#include "unicode/uchar.h"
24#include "unicode/putil.h"
25#include "unicode/ustring.h"
26#include "unicode/uloc.h"
729e4ab9 27#include "unicode/unorm2.h"
0f5d89e8
A
28#include "unicode/utf16.h"
29#include "unicode/utf_old.h"
b75a7d8f 30#include "cintltst.h"
374ca955 31#include "putilimp.h"
b75a7d8f 32#include "uparse.h"
374ca955 33#include "ucase.h"
73c04bcf 34#include "ubidi_props.h"
b75a7d8f 35#include "uprops.h"
374ca955 36#include "uset_imp.h"
b75a7d8f 37#include "usc_impl.h"
f3c0d7a5 38#include "udatamem.h"
374ca955 39#include "cucdapi.h"
b331163b 40#include "cmemory.h"
b75a7d8f
A
41
42/* prototypes --------------------------------------------------------------- */
43
44static void TestUpperLower(void);
45static void TestLetterNumber(void);
46static void TestMisc(void);
47static void TestPOSIX(void);
48static void TestControlPrint(void);
49static void TestIdentifier(void);
50static void TestUnicodeData(void);
51static void TestCodeUnit(void);
52static void TestCodePoint(void);
53static void TestCharLength(void);
54static void TestCharNames(void);
b331163b 55static void TestUCharFromNameUnderflow(void);
b75a7d8f 56static void TestMirroring(void);
b75a7d8f
A
57static void TestUScriptRunAPI(void);
58static void TestAdditionalProperties(void);
59static void TestNumericProperties(void);
60static void TestPropertyNames(void);
61static void TestPropertyValues(void);
62static void TestConsistency(void);
73c04bcf 63static void TestCaseFolding(void);
b75a7d8f
A
64
65/* internal methods used */
66static int32_t MakeProp(char* str);
67static int32_t MakeDir(char* str);
68
73c04bcf
A
69/* helpers ------------------------------------------------------------------ */
70
71static void
72parseUCDFile(const char *filename,
73 char *fields[][2], int32_t fieldCount,
74 UParseLineFn *lineFn, void *context,
75 UErrorCode *pErrorCode) {
76 char path[256];
77 char backupPath[256];
78
79 if(U_FAILURE(*pErrorCode)) {
80 return;
81 }
82
83 /* Look inside ICU_DATA first */
84 strcpy(path, u_getDataDirectory());
85 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
86 strcat(path, filename);
87
88 /* As a fallback, try to guess where the source data was located
89 * at the time ICU was built, and look there.
90 */
91 strcpy(backupPath, ctest_dataSrcDir());
92 strcat(backupPath, U_FILE_SEP_STRING);
93 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
94 strcat(backupPath, filename);
95
96 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
97 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
98 *pErrorCode=U_ZERO_ERROR;
99 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
100 }
101 if(U_FAILURE(*pErrorCode)) {
729e4ab9 102 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
73c04bcf
A
103 }
104}
105
b75a7d8f
A
106/* test data ---------------------------------------------------------------- */
107
b75a7d8f
A
108static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
109static const int32_t tagValues[] =
110 {
111 /* Mn */ U_NON_SPACING_MARK,
112 /* Mc */ U_COMBINING_SPACING_MARK,
113 /* Me */ U_ENCLOSING_MARK,
114 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
115 /* Nl */ U_LETTER_NUMBER,
116 /* No */ U_OTHER_NUMBER,
117 /* Zs */ U_SPACE_SEPARATOR,
118 /* Zl */ U_LINE_SEPARATOR,
119 /* Zp */ U_PARAGRAPH_SEPARATOR,
120 /* Cc */ U_CONTROL_CHAR,
121 /* Cf */ U_FORMAT_CHAR,
122 /* Cs */ U_SURROGATE,
123 /* Co */ U_PRIVATE_USE_CHAR,
124 /* Cn */ U_UNASSIGNED,
125 /* Lu */ U_UPPERCASE_LETTER,
126 /* Ll */ U_LOWERCASE_LETTER,
127 /* Lt */ U_TITLECASE_LETTER,
128 /* Lm */ U_MODIFIER_LETTER,
129 /* Lo */ U_OTHER_LETTER,
130 /* Pc */ U_CONNECTOR_PUNCTUATION,
131 /* Pd */ U_DASH_PUNCTUATION,
132 /* Ps */ U_START_PUNCTUATION,
133 /* Pe */ U_END_PUNCTUATION,
134 /* Po */ U_OTHER_PUNCTUATION,
135 /* Sm */ U_MATH_SYMBOL,
136 /* Sc */ U_CURRENCY_SYMBOL,
137 /* Sk */ U_MODIFIER_SYMBOL,
138 /* So */ U_OTHER_SYMBOL,
139 /* Pi */ U_INITIAL_PUNCTUATION,
140 /* Pf */ U_FINAL_PUNCTUATION
141 };
142
/*
 * Short names of the bidirectional classes, each stored in a 5-byte slot.
 * NOTE(review): presumably indexed by the bidi class value (see MakeDir()) - confirm.
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
169
170void addUnicodeTest(TestNode** root);
171
172void addUnicodeTest(TestNode** root)
173{
b75a7d8f
A
174 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
175 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
176 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
46f4442e
A
177 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
178 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
b75a7d8f
A
179 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
180 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
181 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
182 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
183 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
184 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
185 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
186 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
187 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
b331163b 188 addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
b75a7d8f
A
189 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
190 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
729e4ab9
A
191 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
192 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
51004dcb 193 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
b75a7d8f
A
194 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
195 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
196 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
197 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
73c04bcf 198 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
b75a7d8f
A
199}
200
201/*==================================================== */
202/* test u_toupper() and u_tolower() */
203/*==================================================== */
204static void TestUpperLower()
205{
206 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
207 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
208 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
209 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
210 int32_t i;
211
212 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
213 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
214
215/*
216Checks LetterLike Symbols which were previously a source of confusion
217[Bertrand A. D. 02/04/98]
218*/
219 for (i=0x2100;i<0x2138;i++)
220 {
73c04bcf
A
221 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
222 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
b75a7d8f
A
223 {
224 if (i != (int)u_tolower(i)) /* itself */
225 log_err("Failed case conversion with itself: U+%04x\n", i);
226 if (i != (int)u_toupper(i))
227 log_err("Failed case conversion with itself: U+%04x\n", i);
228 }
229 }
230
231 for(i=0; i < u_strlen(upper); i++){
232 if(u_tolower(upper[i]) != lower[i]){
233 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
234 }
235 }
236
237 log_verbose("testing upper lower\n");
238 for (i = 0; i < 21; i++) {
239
240 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
241 {
242 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
243 }
244 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
245 {
246 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
247 }
248 else if (upperTest[i] != u_tolower(lowerTest[i]))
249 {
250 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
251 }
252 else if (lowerTest[i] != u_toupper(upperTest[i]))
253 {
254 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
255 }
256 else if (upperTest[i] != u_tolower(upperTest[i]))
257 {
258 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
259 }
260 else if (lowerTest[i] != u_toupper(lowerTest[i]))
261 {
262 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
263 }
264 }
265 log_verbose("done testing upper lower\n");
266
267 log_verbose("testing u_istitle\n");
268 {
269 static const UChar expected[] = {
270 0x1F88,
271 0x1F89,
272 0x1F8A,
273 0x1F8B,
274 0x1F8C,
275 0x1F8D,
276 0x1F8E,
277 0x1F8F,
278 0x1F88,
279 0x1F89,
280 0x1F8A,
281 0x1F8B,
282 0x1F8C,
283 0x1F8D,
284 0x1F8E,
285 0x1F8F,
286 0x1F98,
287 0x1F99,
288 0x1F9A,
289 0x1F9B,
290 0x1F9C,
291 0x1F9D,
292 0x1F9E,
293 0x1F9F,
294 0x1F98,
295 0x1F99,
296 0x1F9A,
297 0x1F9B,
298 0x1F9C,
299 0x1F9D,
300 0x1F9E,
301 0x1F9F,
302 0x1FA8,
303 0x1FA9,
304 0x1FAA,
305 0x1FAB,
306 0x1FAC,
307 0x1FAD,
308 0x1FAE,
309 0x1FAF,
310 0x1FA8,
311 0x1FA9,
312 0x1FAA,
313 0x1FAB,
314 0x1FAC,
315 0x1FAD,
316 0x1FAE,
317 0x1FAF,
318 0x1FBC,
319 0x1FBC,
320 0x1FCC,
321 0x1FCC,
322 0x1FFC,
323 0x1FFC,
324 };
2ca993e8 325 int32_t num = UPRV_LENGTHOF(expected);
b75a7d8f
A
326 for(i=0; i<num; i++){
327 if(!u_istitle(expected[i])){
328 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
329 }
330 }
331
332 }
333}
334
73c04bcf 335/* compare two sets and verify that their difference or intersection is empty */
b75a7d8f
A
336static UBool
337showADiffB(const USet *a, const USet *b,
338 const char *a_name, const char *b_name,
339 UBool expect, UBool diffIsError) {
73c04bcf 340 USet *aa;
b75a7d8f 341 int32_t i, start, end, length;
b75a7d8f
A
342 UErrorCode errorCode;
343
73c04bcf
A
344 /*
345 * expect:
346 * TRUE -> a-b should be empty, that is, b should contain all of a
347 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
348 */
349 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
350 return TRUE;
351 }
352
353 /* clone a to aa because a is const */
354 aa=uset_open(1, 0);
355 if(aa==NULL) {
356 /* unusual problem - out of memory? */
357 return FALSE;
358 }
359 uset_addAll(aa, a);
360
361 /* compute the set in question */
362 if(expect) {
363 /* a-b */
364 uset_removeAll(aa, b);
365 } else {
366 /* a&b */
367 uset_retainAll(aa, b);
368 }
369
370 /* aa is not empty because of the initial tests above; show its contents */
b75a7d8f 371 errorCode=U_ZERO_ERROR;
b75a7d8f
A
372 i=0;
373 for(;;) {
73c04bcf 374 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
b75a7d8f 375 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
73c04bcf 376 break; /* done */
b75a7d8f
A
377 }
378 if(U_FAILURE(errorCode)) {
73c04bcf 379 log_err("error comparing %s with %s at difference item %d: %s\n",
b75a7d8f 380 a_name, b_name, i, u_errorName(errorCode));
73c04bcf 381 break;
b75a7d8f
A
382 }
383 if(length!=0) {
73c04bcf 384 break; /* done with code points, got a string or -1 */
b75a7d8f
A
385 }
386
73c04bcf
A
387 if(diffIsError) {
388 if(expect) {
389 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
390 } else {
391 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
392 }
393 } else {
394 if(expect) {
395 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
396 } else {
397 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
b75a7d8f
A
398 }
399 }
400
401 ++i;
402 }
73c04bcf
A
403
404 uset_close(aa);
405 return FALSE;
b75a7d8f
A
406}
407
408static UBool
409showAMinusB(const USet *a, const USet *b,
410 const char *a_name, const char *b_name,
411 UBool diffIsError) {
412 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
413}
414
415static UBool
416showAIntersectB(const USet *a, const USet *b,
417 const char *a_name, const char *b_name,
418 UBool diffIsError) {
419 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
420}
421
422static UBool
423compareUSets(const USet *a, const USet *b,
424 const char *a_name, const char *b_name,
425 UBool diffIsError) {
73c04bcf
A
426 /*
427 * Use an arithmetic & not a logical && so that both branches
428 * are always taken and all differences are shown.
429 */
b75a7d8f 430 return
73c04bcf 431 showAMinusB(a, b, a_name, b_name, diffIsError) &
b75a7d8f
A
432 showAMinusB(b, a, b_name, a_name, diffIsError);
433}
434
435/* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
436static void TestLetterNumber()
437{
438 UChar i = 0x0000;
439
440 log_verbose("Testing for isalpha\n");
441 for (i = 0x0041; i < 0x005B; i++) {
442 if (!u_isalpha(i))
443 {
444 log_err("Failed isLetter test at %.4X\n", i);
445 }
446 }
447 for (i = 0x0660; i < 0x066A; i++) {
448 if (u_isalpha(i))
449 {
450 log_err("Failed isLetter test with numbers at %.4X\n", i);
451 }
452 }
453
454 log_verbose("Testing for isdigit\n");
455 for (i = 0x0660; i < 0x066A; i++) {
456 if (!u_isdigit(i))
457 {
458 log_verbose("Failed isNumber test at %.4X\n", i);
459 }
460 }
461
462 log_verbose("Testing for isalnum\n");
463 for (i = 0x0041; i < 0x005B; i++) {
464 if (!u_isalnum(i))
465 {
466 log_err("Failed isAlNum test at %.4X\n", i);
467 }
468 }
469 for (i = 0x0660; i < 0x066A; i++) {
470 if (!u_isalnum(i))
471 {
472 log_err("Failed isAlNum test at %.4X\n", i);
473 }
474 }
475
476 {
477 /*
478 * The following checks work only starting from Unicode 4.0.
479 * Check the version number here.
480 */
374ca955 481 static UVersionInfo u401={ 4, 0, 1, 0 };
b75a7d8f
A
482 UVersionInfo version;
483 u_getUnicodeVersion(version);
374ca955 484 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
b75a7d8f
A
485 return;
486 }
487 }
488
489 {
490 /*
491 * Sanity check:
492 * Verify that exactly the digit characters have decimal digit values.
493 * This assumption is used in the implementation of u_digit()
494 * (which checks nt=de)
495 * compared with the parallel java.lang.Character.digit()
496 * (which checks Nd).
497 *
498 * This was not true in Unicode 3.2 and earlier.
374ca955
A
499 * Unicode 4.0 fixed discrepancies.
500 * Unicode 4.0.1 re-introduced problems in this area due to an
501 * unintentionally incomplete last-minute change.
b75a7d8f
A
502 */
503 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
504 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
505
506 USet *digits, *decimalValues;
507 UErrorCode errorCode;
508
509 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
510 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
511 errorCode=U_ZERO_ERROR;
512 digits=uset_openPattern(digitsPattern, 6, &errorCode);
513 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
514
515 if(U_SUCCESS(errorCode)) {
516 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
517 }
518
519 uset_close(digits);
520 uset_close(decimalValues);
521 }
522}
523
729e4ab9
A
524static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
525 const UChar32 *sampleChars, int32_t sampleCharsLength,
526 UBool expected) {
527 int32_t i;
528 for (i = 0; i < sampleCharsLength; ++i) {
529 UBool result = propFn(sampleChars[i]);
530 if (result != expected) {
531 log_err("error: character property function %s(U+%04x)=%d is wrong\n",
532 propName, sampleChars[i], result);
533 }
534 }
535}
536
b75a7d8f
A
537/* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
538static void TestMisc()
539{
729e4ab9
A
540 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
541 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
542 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
543 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
544 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
545 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
b75a7d8f 546/* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
729e4ab9
A
547 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
548 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
549 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
550 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
b75a7d8f
A
551
552 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
553
554 uint32_t mask;
555
556 int32_t i;
557 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
558 UVersionInfo realVersion;
559
560 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
561
b331163b
A
562 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
563 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
b75a7d8f 564
729e4ab9 565 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
b331163b 566 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
729e4ab9 567 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
b331163b 568 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
b75a7d8f 569
729e4ab9 570 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
b331163b 571 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
729e4ab9 572 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
b331163b 573 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
b75a7d8f 574
729e4ab9 575 testSampleCharProps(u_isdefined, "u_isdefined",
b331163b 576 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
729e4ab9 577 testSampleCharProps(u_isdefined, "u_isdefined",
b331163b 578 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
729e4ab9 579
b331163b
A
580 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
581 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
b75a7d8f 582
b331163b
A
583 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
584 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
729e4ab9 585
b331163b 586 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
729e4ab9
A
587 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
588 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
589 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
b75a7d8f
A
590 }
591 }
592
593 /* Tests the ICU version #*/
594 u_getVersion(realVersion);
595 u_versionToString(realVersion, icuVersion);
374ca955 596 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
b75a7d8f
A
597 {
598 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
599 }
600#if defined(ICU_VERSION)
601 /* test only happens where we have configure.in with VERSION - sanity check. */
602 if(strcmp(U_ICU_VERSION, ICU_VERSION))
603 {
604 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
605 }
606#endif
607
608 /* test U_GC_... */
609 if(
610 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
611 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
612 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
613 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
614 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
615 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
616 ) {
617 log_err("error: U_GET_GC_MASK does not work properly\n");
618 }
619
620 mask=0;
621 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
622
623 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
624 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
625 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
626 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
627 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
628
629 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
630 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
631 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
632
633 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
634 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
635 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
636
637 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
638 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
639 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
640
641 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
642 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
643 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
644 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
645
646 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
647 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
648 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
649 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
650 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
651
652 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
653 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
654 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
655 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
656
657 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
658 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
659
660 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
661 log_err("error: problems with U_GC_XX_MASK constants\n");
662 }
663
664 mask=0;
665 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
666 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
667 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
668 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
669 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
670 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
671 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
672
673 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
674 log_err("error: problems with U_GC_Y_MASK constants\n");
675 }
676 {
677 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
678 for(i=0; i<10; i++){
679 if(digit[i]!=u_forDigit(i,10)){
680 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
681 }
682 }
683 }
684
685 /* test u_digit() */
686 {
687 static const struct {
688 UChar32 c;
689 int8_t radix, value;
690 } data[]={
691 /* base 16 */
692 { 0x0031, 16, 1 },
693 { 0x0038, 16, 8 },
694 { 0x0043, 16, 12 },
695 { 0x0066, 16, 15 },
696 { 0x00e4, 16, -1 },
697 { 0x0662, 16, 2 },
698 { 0x06f5, 16, 5 },
699 { 0xff13, 16, 3 },
700 { 0xff41, 16, 10 },
701
702 /* base 8 */
703 { 0x0031, 8, 1 },
704 { 0x0038, 8, -1 },
705 { 0x0043, 8, -1 },
706 { 0x0066, 8, -1 },
707 { 0x00e4, 8, -1 },
708 { 0x0662, 8, 2 },
709 { 0x06f5, 8, 5 },
710 { 0xff13, 8, 3 },
711 { 0xff41, 8, -1 },
712
713 /* base 36 */
714 { 0x5a, 36, 35 },
715 { 0x7a, 36, 35 },
716 { 0xff3a, 36, 35 },
717 { 0xff5a, 36, 35 },
718
719 /* wrong radix values */
720 { 0x0031, 1, -1 },
721 { 0xff3a, 37, -1 }
722 };
723
b331163b 724 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
b75a7d8f
A
725 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
726 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
727 data[i].c,
728 data[i].radix,
729 u_digit(data[i].c, data[i].radix),
730 data[i].value);
731 }
732 }
733 }
734}
735
736/* test C/POSIX-style functions --------------------------------------------- */
737
738/* bit flags */
739#define ISAL 1
740#define ISLO 2
741#define ISUP 4
742
743#define ISDI 8
744#define ISXD 0x10
745
746#define ISAN 0x20
747
748#define ISPU 0x40
749#define ISGR 0x80
750#define ISPR 0x100
751
752#define ISSP 0x200
753#define ISBL 0x400
754#define ISCN 0x800
755
756/* C/POSIX-style functions, in the same order as the bit flags */
374ca955 757typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
b75a7d8f
A
758
759static const struct {
760 IsPOSIXClass *fn;
761 const char *name;
762} posixClasses[]={
763 { u_isalpha, "isalpha" },
764 { u_islower, "islower" },
765 { u_isupper, "isupper" },
766 { u_isdigit, "isdigit" },
767 { u_isxdigit, "isxdigit" },
768 { u_isalnum, "isalnum" },
769 { u_ispunct, "ispunct" },
770 { u_isgraph, "isgraph" },
771 { u_isprint, "isprint" },
772 { u_isspace, "isspace" },
773 { u_isblank, "isblank" },
774 { u_iscntrl, "iscntrl" }
775};
776
777static const struct {
778 UChar32 c;
779 uint32_t posixResults;
780} posixData[]={
781 { 0x0008, ISCN }, /* backspace */
782 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
783 { 0x000a, ISSP| ISCN }, /* LF */
784 { 0x000c, ISSP| ISCN }, /* FF */
785 { 0x000d, ISSP| ISCN }, /* CR */
786 { 0x0020, ISPR|ISSP|ISBL }, /* space */
787 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
788 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
789 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
790 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
791 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
792 { 0x007b, ISPU|ISGR|ISPR }, /* { */
793 { 0x0085, ISSP| ISCN }, /* NEL */
794 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
795 { 0x00a4, ISGR|ISPR }, /* currency sign */
796 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
797 { 0x0300, ISGR|ISPR }, /* combining grave */
798 { 0x0600, ISCN }, /* arabic number sign */
799 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
800 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
801 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
802 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
803 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
374ca955
A
804 { 0x200b, ISCN }, /* ZWSP */
805 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
b75a7d8f
A
806 { 0x200e, ISCN }, /* LRM */
807 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
808 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
809 { 0x20ac, ISGR|ISPR }, /* Euro */
810 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
811 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
812 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
813 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
814 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
815};
816
817static void
818TestPOSIX() {
819 uint32_t mask;
820 int32_t cl, i;
821 UBool expect;
822
823 mask=1;
824 for(cl=0; cl<12; ++cl) {
b331163b 825 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
b75a7d8f
A
826 expect=(UBool)((posixData[i].posixResults&mask)!=0);
827 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
828 log_err("u_%s(U+%04x)=%s is wrong\n",
829 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
830 }
831 }
832 mask<<=1;
833 }
834}
835
836/* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
837static void TestControlPrint()
838{
729e4ab9
A
839 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
840 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
841 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
842 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
b75a7d8f 843 UChar32 c;
b75a7d8f 844
b331163b
A
845 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
846 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
b75a7d8f 847
729e4ab9 848 testSampleCharProps(u_isprint, "u_isprint",
b331163b 849 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
729e4ab9 850 testSampleCharProps(u_isprint, "u_isprint",
b331163b 851 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
b75a7d8f
A
852
853 /* test all ISO 8 controls */
854 for(c=0; c<=0x9f; ++c) {
855 if(c==0x20) {
856 /* skip ASCII graphic characters and continue with DEL */
857 c=0x7f;
858 }
859 if(!u_iscntrl(c)) {
860 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
861 }
862 if(!u_isISOControl(c)) {
863 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
864 }
865 if(u_isprint(c)) {
866 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
867 }
868 }
869
870 /* test all Latin-1 graphic characters */
871 for(c=0x20; c<=0xff; ++c) {
872 if(c==0x7f) {
873 c=0xa0;
874 } else if(c==0xad) {
875 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
876 ++c;
877 }
878 if(!u_isprint(c)) {
879 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
880 }
881 }
882}
883
884/* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
885static void TestIdentifier()
886{
729e4ab9
A
887 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
888 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
889 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
890 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
891 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
892 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
893 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
894 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
895 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
896 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
897
898 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
b331163b 899 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
729e4ab9 900 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
b331163b 901 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
729e4ab9
A
902
903 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
b331163b 904 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
729e4ab9 905 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
b331163b 906 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
729e4ab9
A
907
908 /* IDPart should imply IDStart */
909 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
b331163b 910 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
729e4ab9
A
911
912 testSampleCharProps(u_isIDStart, "u_isIDStart",
b331163b 913 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
729e4ab9 914 testSampleCharProps(u_isIDStart, "u_isIDStart",
b331163b 915 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
729e4ab9
A
916
917 testSampleCharProps(u_isIDPart, "u_isIDPart",
b331163b 918 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
729e4ab9 919 testSampleCharProps(u_isIDPart, "u_isIDPart",
b331163b 920 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
729e4ab9
A
921
922 /* IDPart should imply IDStart */
923 testSampleCharProps(u_isIDPart, "u_isIDPart",
b331163b 924 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
729e4ab9
A
925
926 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
b331163b 927 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
729e4ab9 928 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
b331163b 929 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
b75a7d8f
A
930}
931
932/* for each line of UnicodeData.txt, check some of the properties */
4388f060
A
933typedef struct UnicodeDataContext {
934#if UCONFIG_NO_NORMALIZATION
935 const void *dummy;
936#else
937 const UNormalizer2 *nfc;
938 const UNormalizer2 *nfkc;
939#endif
940} UnicodeDataContext;
941
b75a7d8f
A
942/*
943 * ### TODO
944 * This test fails incorrectly if the First or Last code point of a repetitive area
945 * is overridden, which is allowed and is encouraged for the PUAs.
946 * Currently, this means that both area First/Last and override lines are
947 * tested against the properties from the API,
948 * and the area boundary will not match and cause an error.
949 *
950 * This function should detect area boundaries and skip them for the test of individual
951 * code points' properties.
952 * Then it should check that the areas contain all the same properties except where overridden.
953 * For this, it would have had to set a flag for which code points were listed explicitly.
954 */
955static void U_CALLCONV
956unicodeDataLineFn(void *context,
957 char *fields[][2], int32_t fieldCount,
958 UErrorCode *pErrorCode)
959{
960 char buffer[100];
4388f060 961 const char *d;
b75a7d8f
A
962 char *end;
963 uint32_t value;
964 UChar32 c;
965 int32_t i;
966 int8_t type;
4388f060
A
967 int32_t dt;
968 UChar dm[32], s[32];
969 int32_t dmLength, length;
970
971#if !UCONFIG_NO_NORMALIZATION
972 const UNormalizer2 *nfc, *nfkc;
973#endif
b75a7d8f
A
974
975 /* get the character code, field 0 */
976 c=strtoul(fields[0][0], &end, 16);
977 if(end<=fields[0][0] || end!=fields[0][1]) {
978 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
979 return;
980 }
981 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
982 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
983 return;
984 }
985
986 /* get general category, field 2 */
987 *fields[2][1]=0;
988 type = (int8_t)tagValues[MakeProp(fields[2][0])];
989 if(u_charType(c)!=type) {
990 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
991 }
992 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
993 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
994 }
995
996 /* get canonical combining class, field 3 */
997 value=strtoul(fields[3][0], &end, 10);
998 if(end<=fields[3][0] || end!=fields[3][1]) {
999 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1000 return;
1001 }
1002 if(value>255) {
1003 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1004 return;
1005 }
1006#if !UCONFIG_NO_NORMALIZATION
1007 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1008 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1009 }
4388f060
A
1010 nfkc=((UnicodeDataContext *)context)->nfkc;
1011 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1012 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1013 }
b75a7d8f
A
1014#endif
1015
1016 /* get BiDi category, field 4 */
1017 *fields[4][1]=0;
1018 i=MakeDir(fields[4][0]);
1019 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1020 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1021 }
1022
4388f060
A
1023 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1024 d=NULL;
1025 if(fields[5][0]==fields[5][1]) {
1026 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1027 if(c==0xac00 || c==0xd7a3) {
1028 dt=U_DT_CANONICAL;
1029 } else {
1030 dt=U_DT_NONE;
1031 }
1032 } else {
1033 d=fields[5][0];
1034 *fields[5][1]=0;
1035 dt=UCHAR_INVALID_CODE;
1036 if(*d=='<') {
1037 end=strchr(++d, '>');
1038 if(end!=NULL) {
1039 *end=0;
1040 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1041 d=u_skipWhitespace(end+1);
1042 }
1043 } else {
1044 dt=U_DT_CANONICAL;
1045 }
1046 }
1047 if(dt>U_DT_NONE) {
1048 if(c==0xac00) {
1049 dm[0]=0x1100;
1050 dm[1]=0x1161;
1051 dm[2]=0;
1052 dmLength=2;
1053 } else if(c==0xd7a3) {
1054 dm[0]=0xd788;
1055 dm[1]=0x11c2;
1056 dm[2]=0;
1057 dmLength=2;
1058 } else {
1059 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1060 }
1061 } else {
1062 dmLength=-1;
1063 }
1064 if(dt<0 || U_FAILURE(*pErrorCode)) {
1065 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1066 return;
1067 }
1068#if !UCONFIG_NO_NORMALIZATION
1069 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1070 if(i!=dt) {
1071 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1072 }
1073 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1074 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1075 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1076 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1077 "or the Decomposition_Mapping is different (%s)\n",
1078 c, length, dmLength, u_errorName(*pErrorCode));
1079 return;
1080 }
1081 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1082 if(dt!=U_DT_CANONICAL) {
1083 dmLength=-1;
1084 }
1085 nfc=((UnicodeDataContext *)context)->nfc;
1086 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1087 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1088 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1089 "or the Decomposition_Mapping is different (%s)\n",
1090 c, length, dmLength, u_errorName(*pErrorCode));
1091 return;
1092 }
1093 /* recompose */
1094 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1095 UChar32 a, b, composite;
1096 i=0;
1097 U16_NEXT(dm, i, dmLength, a);
1098 U16_NEXT(dm, i, dmLength, b);
1099 /* i==dmLength */
1100 composite=unorm2_composePair(nfc, a, b);
1101 if(composite!=c) {
1102 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1103 (long)c, (long)a, (long)b, (long)composite);
1104 }
1105 /*
1106 * Note: NFKC has fewer round-trip mappings than NFC,
1107 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1108 */
1109 }
1110#endif
1111
b75a7d8f
A
1112 /* get ISO Comment, field 11 */
1113 *fields[11][1]=0;
1114 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1115 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
729e4ab9 1116 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
b75a7d8f
A
1117 c, u_errorName(*pErrorCode),
1118 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1119 fields[11][0]);
1120 }
1121
1122 /* get uppercase mapping, field 12 */
1123 if(fields[12][0]!=fields[12][1]) {
1124 value=strtoul(fields[12][0], &end, 16);
1125 if(end!=fields[12][1]) {
1126 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1127 return;
1128 }
1129 if((UChar32)value!=u_toupper(c)) {
1130 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1131 }
1132 } else {
1133 /* no case mapping: the API must map the code point to itself */
1134 if(c!=u_toupper(c)) {
1135 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1136 }
1137 }
1138
1139 /* get lowercase mapping, field 13 */
1140 if(fields[13][0]!=fields[13][1]) {
1141 value=strtoul(fields[13][0], &end, 16);
1142 if(end!=fields[13][1]) {
1143 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1144 return;
1145 }
1146 if((UChar32)value!=u_tolower(c)) {
1147 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1148 }
1149 } else {
1150 /* no case mapping: the API must map the code point to itself */
1151 if(c!=u_tolower(c)) {
1152 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1153 }
1154 }
1155
1156 /* get titlecase mapping, field 14 */
1157 if(fields[14][0]!=fields[14][1]) {
1158 value=strtoul(fields[14][0], &end, 16);
1159 if(end!=fields[14][1]) {
1160 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1161 return;
1162 }
1163 if((UChar32)value!=u_totitle(c)) {
1164 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1165 }
1166 } else {
1167 /* no case mapping: the API must map the code point to itself */
1168 if(c!=u_totitle(c)) {
1169 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1170 }
1171 }
1172}
1173
1174static UBool U_CALLCONV
1175enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1176 static const UChar32 test[][2]={
1177 {0x41, U_UPPERCASE_LETTER},
1178 {0x308, U_NON_SPACING_MARK},
1179 {0xfffe, U_GENERAL_OTHER_TYPES},
1180 {0xe0041, U_FORMAT_CHAR},
1181 {0xeffff, U_UNASSIGNED}
1182 };
1183
374ca955 1184 int32_t i, count;
b75a7d8f
A
1185
1186 if(0!=strcmp((const char *)context, "a1")) {
1187 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1188 return FALSE;
1189 }
1190
b331163b 1191 count=UPRV_LENGTHOF(test);
b75a7d8f
A
1192 for(i=0; i<count; ++i) {
1193 if(start<=test[i][0] && test[i][0]<limit) {
1194 if(type!=(UCharCategory)test[i][1]) {
1195 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1196 start, limit, (long)type, test[i][0], test[i][1]);
1197 }
374ca955 1198 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
b75a7d8f
A
1199 return i==(count-1) ? FALSE : TRUE;
1200 }
1201 }
1202
1203 if(start>test[count-1][0]) {
1204 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1205 start, limit, (long)type);
1206 return FALSE;
1207 }
1208
374ca955
A
1209 return TRUE;
1210}
1211
1212static UBool U_CALLCONV
1213enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
4388f060 1214 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
374ca955
A
1215 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1216 { 0x0590, U_LEFT_TO_RIGHT },
1217 { 0x0600, U_RIGHT_TO_LEFT },
1218 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
6be67b06
A
1219 { 0x0860, U_RIGHT_TO_LEFT },
1220 { 0x0870, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 10 changes U+0860..U+086F from R to AL.
4388f060
A
1221 { 0x08A0, U_RIGHT_TO_LEFT },
1222 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
57a6839d
A
1223 { 0x20A0, U_LEFT_TO_RIGHT },
1224 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
374ca955
A
1225 { 0xFB1D, U_LEFT_TO_RIGHT },
1226 { 0xFB50, U_RIGHT_TO_LEFT },
1227 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1228 { 0xFE70, U_LEFT_TO_RIGHT },
1229 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
0f5d89e8 1230
374ca955 1231 { 0x10800, U_LEFT_TO_RIGHT },
0f5d89e8
A
1232 { 0x10D00, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1233 { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1234 { 0x10F30, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1235 { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
374ca955 1236 { 0x11000, U_RIGHT_TO_LEFT },
0f5d89e8 1237
729e4ab9 1238 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
0f5d89e8
A
1239 { 0x1EC70, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1240 { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
4388f060
A
1241 { 0x1EE00, U_RIGHT_TO_LEFT },
1242 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
729e4ab9 1243 { 0x1F000, U_RIGHT_TO_LEFT },
374ca955
A
1244 { 0x110000, U_LEFT_TO_RIGHT }
1245 };
1246
1247 UChar32 c;
1248 int32_t i;
1249 UCharDirection shouldBeDir;
1250
b75a7d8f
A
1251 /*
1252 * LineBreak.txt specifies:
1253 * # - Assigned characters that are not listed explicitly are given the value
1254 * # "AL".
1255 * # - Unassigned characters are given the value "XX".
1256 *
1257 * PUA characters are listed explicitly with "XX".
1258 * Verify that no assigned character has "XX".
1259 */
1260 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1261 c=start;
1262 while(c<limit) {
1263 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1264 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1265 }
1266 ++c;
1267 }
1268 }
1269
1270 /*
1271 * Verify default Bidi classes.
f3c0d7a5 1272 * See DerivedBidiClass.txt, especially for unassigned code points.
b75a7d8f
A
1273 */
1274 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1275 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1276 c=start;
b331163b 1277 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
b75a7d8f
A
1278 if((int32_t)c<defaultBidi[i][0]) {
1279 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
374ca955
A
1280 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1281 shouldBeDir=U_BOUNDARY_NEUTRAL;
1282 } else {
1283 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1284 }
1285
1286 if( u_charDirection(c)!=shouldBeDir ||
1287 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
b75a7d8f
A
1288 ) {
1289 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
374ca955 1290 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
b75a7d8f
A
1291 }
1292 ++c;
1293 }
1294 }
1295 }
1296 }
1297
1298 return TRUE;
1299}
1300
1301/* tests for several properties */
1302static void TestUnicodeData()
1303{
b75a7d8f
A
1304 UVersionInfo expectVersionArray;
1305 UVersionInfo versionArray;
1306 char *fields[15][2];
1307 UErrorCode errorCode;
1308 UChar32 c;
1309 int8_t type;
1310
4388f060
A
1311 UnicodeDataContext context;
1312
b75a7d8f
A
1313 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1314 u_getUnicodeVersion(versionArray);
1315 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1316 {
1317 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1318 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1319 }
1320
1321#if defined(ICU_UNICODE_VERSION)
1322 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1323 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1324 {
1325 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1326 }
1327#endif
1328
1329 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1330 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1331 }
1332
1333 errorCode=U_ZERO_ERROR;
4388f060
A
1334#if !UCONFIG_NO_NORMALIZATION
1335 context.nfc=unorm2_getNFCInstance(&errorCode);
1336 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1337 if(U_FAILURE(errorCode)) {
1338 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1339 return;
1340 }
1341#endif
1342 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
b75a7d8f 1343 if(U_FAILURE(errorCode)) {
b75a7d8f
A
1344 return; /* if we couldn't parse UnicodeData.txt, we should return */
1345 }
1346
1347 /* sanity check on repeated properties */
1348 for(c=0xfffe; c<=0x10ffff;) {
1349 type=u_charType(c);
1350 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1351 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1352 }
1353 if(type!=U_UNASSIGNED) {
1354 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1355 }
1356 if((c&0xffff)==0xfffe) {
1357 ++c;
1358 } else {
1359 c+=0xffff;
1360 }
1361 }
1362
1363 /* test that PUA is not "unassigned" */
1364 for(c=0xe000; c<=0x10fffd;) {
1365 type=u_charType(c);
1366 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1367 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1368 }
1369 if(type==U_UNASSIGNED) {
1370 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1371 } else if(type!=U_PRIVATE_USE_CHAR) {
1372 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1373 }
1374 if(c==0xf8ff) {
1375 c=0xf0000;
1376 } else if(c==0xffffd) {
1377 c=0x100000;
1378 } else {
1379 ++c;
1380 }
1381 }
1382
1383 /* test u_enumCharTypes() */
1384 u_enumCharTypes(enumTypeRange, "a1");
374ca955
A
1385
1386 /* check default properties */
1387 u_enumCharTypes(enumDefaultsRange, NULL);
b75a7d8f
A
1388}
1389
1390static void TestCodeUnit(){
1391 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1392
1393 int32_t i;
1394
2ca993e8 1395 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
b75a7d8f 1396 UChar c=codeunit[i];
0f5d89e8
A
1397 if(i<4){
1398 if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1399 U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1400 log_err("ERROR: U+%04x is a single", c);
1401 }
1402
1403 }
1404 if(i >= 4 && i< 8){
1405 if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1406 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1407 log_err("ERROR: U+%04x is a first surrogate", c);
1408 }
1409 }
1410 if(i >= 8 && i< 12){
1411 if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1412 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1413 log_err("ERROR: U+%04x is a second surrogate", c);
1414 }
1415 }
1416#if !U_HIDE_OBSOLETE_UTF_OLD_H
b75a7d8f
A
1417 if(i<4){
1418 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1419 log_err("ERROR: U+%04x is a single", c);
1420 }
1421
1422 }
1423 if(i >= 4 && i< 8){
1424 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1425 log_err("ERROR: U+%04x is a first surrogate", c);
1426 }
1427 }
1428 if(i >= 8 && i< 12){
1429 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1430 log_err("ERROR: U+%04x is a second surrogate", c);
1431 }
1432 }
0f5d89e8 1433#endif
b75a7d8f 1434 }
b75a7d8f
A
1435}
1436
1437static void TestCodePoint(){
1438 const UChar32 codePoint[]={
1439 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1440 0xd800,
1441 0xdbff,
1442 0xdc00,
1443 0xdfff,
1444 0xdc04,
1445 0xd821,
1446 /*not a surrogate, valid, isUnicodeChar , not Error*/
1447 0x20ac,
1448 0xd7ff,
1449 0xe000,
1450 0xe123,
1451 0x0061,
1452 0xe065,
1453 0x20402,
1454 0x24506,
1455 0x23456,
1456 0x20402,
1457 0x10402,
1458 0x23456,
1459 /*not a surrogate, not valid, isUnicodeChar, isError */
1460 0x0015,
1461 0x009f,
1462 /*not a surrogate, not valid, not isUnicodeChar, isError */
1463 0xffff,
1464 0xfffe,
1465 };
1466 int32_t i;
0f5d89e8 1467 for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
b75a7d8f 1468 UChar32 c=codePoint[i];
0f5d89e8
A
1469 if(i<6) {
1470 if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1471 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1472 }
1473 if(U_IS_UNICODE_CHAR(c)) {
1474 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1475 }
1476 } else if(i >=6 && i<18) {
1477 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1478 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1479 }
1480 if(!U_IS_UNICODE_CHAR(c)) {
1481 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1482 }
1483 } else if(i >=18 && i<20) {
1484 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1485 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1486 }
1487 if(!U_IS_UNICODE_CHAR(c)) {
1488 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1489 }
1490 } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1491 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1492 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1493 }
1494 if(U_IS_UNICODE_CHAR(c)) {
1495 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1496 }
1497 }
1498#if !U_HIDE_OBSOLETE_UTF_OLD_H
b75a7d8f 1499 if(i<6){
0f5d89e8 1500 if(!UTF_IS_SURROGATE(c)){
b75a7d8f
A
1501 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1502 }
1503 if(UTF_IS_VALID(c)){
1504 log_err("ERROR: isValid() failed for U+%04x\n", c);
1505 }
0f5d89e8 1506 if(UTF_IS_UNICODE_CHAR(c)){
b75a7d8f
A
1507 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1508 }
1509 if(UTF_IS_ERROR(c)){
1510 log_err("ERROR: isError() failed for U+%04x\n", c);
1511 }
1512 }else if(i >=6 && i<18){
0f5d89e8 1513 if(UTF_IS_SURROGATE(c)){
b75a7d8f
A
1514 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1515 }
1516 if(!UTF_IS_VALID(c)){
1517 log_err("ERROR: isValid() failed for U+%04x\n", c);
1518 }
0f5d89e8 1519 if(!UTF_IS_UNICODE_CHAR(c)){
b75a7d8f
A
1520 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1521 }
1522 if(UTF_IS_ERROR(c)){
1523 log_err("ERROR: isError() failed for U+%04x\n", c);
1524 }
1525 }else if(i >=18 && i<20){
0f5d89e8 1526 if(UTF_IS_SURROGATE(c)){
b75a7d8f
A
1527 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1528 }
1529 if(UTF_IS_VALID(c)){
1530 log_err("ERROR: isValid() failed for U+%04x\n", c);
1531 }
0f5d89e8 1532 if(!UTF_IS_UNICODE_CHAR(c)){
b75a7d8f
A
1533 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1534 }
1535 if(!UTF_IS_ERROR(c)){
1536 log_err("ERROR: isError() failed for U+%04x\n", c);
1537 }
1538 }
2ca993e8 1539 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
0f5d89e8 1540 if(UTF_IS_SURROGATE(c)){
b75a7d8f
A
1541 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1542 }
1543 if(UTF_IS_VALID(c)){
1544 log_err("ERROR: isValid() failed for U+%04x\n", c);
1545 }
0f5d89e8 1546 if(UTF_IS_UNICODE_CHAR(c)){
b75a7d8f
A
1547 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1548 }
1549 if(!UTF_IS_ERROR(c)){
1550 log_err("ERROR: isError() failed for U+%04x\n", c);
1551 }
1552 }
0f5d89e8 1553#endif
b75a7d8f
A
1554 }
1555
374ca955
A
1556 if(
1557 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1558 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1559 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1560 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1561 ) {
1562 log_err("error with U_IS_BMP()\n");
1563 }
1564
1565 if(
1566 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1567 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1568 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1569 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1570 ) {
1571 log_err("error with U_IS_SUPPLEMENTARY()\n");
1572 }
b75a7d8f
A
1573}
1574
1575static void TestCharLength()
1576{
1577 const int32_t codepoint[]={
1578 1, 0x0061,
1579 1, 0xe065,
1580 1, 0x20ac,
1581 2, 0x20402,
1582 2, 0x23456,
1583 2, 0x24506,
1584 2, 0x20402,
1585 2, 0x10402,
1586 1, 0xd7ff,
1587 1, 0xe000
1588 };
1589
1590 int32_t i;
0f5d89e8 1591#if !U_HIDE_OBSOLETE_UTF_OLD_H
b75a7d8f 1592 UBool multiple;
0f5d89e8 1593#endif
2ca993e8 1594 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
b75a7d8f 1595 UChar32 c=codepoint[i+1];
0f5d89e8
A
1596 if(
1597#if !U_HIDE_OBSOLETE_UTF_OLD_H
1598 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1599#endif
1600 U16_LENGTH(c) != codepoint[i]) {
4388f060 1601 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
b75a7d8f 1602 }
0f5d89e8 1603#if !U_HIDE_OBSOLETE_UTF_OLD_H
b75a7d8f
A
1604 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1605 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1606 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1607 }
0f5d89e8 1608#endif
b75a7d8f
A
1609 }
1610}
1611
1612/*internal functions ----*/
1613static int32_t MakeProp(char* str)
1614{
1615 int32_t result = 0;
1616 char* matchPosition =0;
1617
1618 matchPosition = strstr(tagStrings, str);
1619 if (matchPosition == 0)
1620 {
1621 log_err("unrecognized type letter ");
1622 log_err(str);
1623 }
374ca955
A
1624 else
1625 result = (int32_t)((matchPosition - tagStrings) / 2);
b75a7d8f
A
1626 return result;
1627}
1628
1629static int32_t MakeDir(char* str)
1630{
1631 int32_t pos = 0;
57a6839d 1632 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
b75a7d8f
A
1633 if (strcmp(str, dirStrings[pos]) == 0) {
1634 return pos;
1635 }
1636 }
1637 return -1;
1638}
1639
1640/* test u_charName() -------------------------------------------------------- */
1641
1642static const struct {
1643 uint32_t code;
729e4ab9 1644 const char *name, *oldName, *extName, *alias;
b75a7d8f
A
1645} names[]={
1646 {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"},
4388f060 1647 {0x01a2, "LATIN CAPITAL LETTER OI", "",
729e4ab9
A
1648 "LATIN CAPITAL LETTER OI",
1649 "LATIN CAPITAL LETTER GHA"},
4388f060 1650 {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
729e4ab9
A
1651 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" },
1652 {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
1653 "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
1654 "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
b75a7d8f
A
1655 {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" },
1656 {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" },
1657 {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" },
1658 {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" },
1659 {0xd800, "", "", "<lead surrogate-D800>" },
1660 {0xdc00, "", "", "<trail surrogate-DC00>" },
4388f060 1661 {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS" },
b75a7d8f
A
1662 {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" },
1663 {0xffff, "", "", "<noncharacter-FFFF>" },
729e4ab9
A
1664 {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
1665 "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
1666 "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
b75a7d8f
A
1667 {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" }
1668};
1669
1670static UBool
1671enumCharNamesFn(void *context,
1672 UChar32 code, UCharNameChoice nameChoice,
1673 const char *name, int32_t length) {
1674 int32_t *pCount=(int32_t *)context;
729e4ab9 1675 const char *expected;
b75a7d8f
A
1676 int i;
1677
1678 if(length<=0 || length!=(int32_t)strlen(name)) {
1679 /* should not be called with an empty string or invalid length */
1680 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1681 return TRUE;
1682 }
1683
1684 ++*pCount;
2ca993e8 1685 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
b75a7d8f
A
1686 if(code==(UChar32)names[i].code) {
1687 switch (nameChoice) {
1688 case U_EXTENDED_CHAR_NAME:
1689 if(0!=strcmp(name, names[i].extName)) {
1690 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1691 }
1692 break;
1693 case U_UNICODE_CHAR_NAME:
1694 if(0!=strcmp(name, names[i].name)) {
1695 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1696 }
1697 break;
1698 case U_UNICODE_10_CHAR_NAME:
729e4ab9
A
1699 expected=names[i].oldName;
1700 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1701 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1702 }
1703 break;
1704 case U_CHAR_NAME_ALIAS:
1705 expected=names[i].alias;
1706 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1707 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
b75a7d8f
A
1708 }
1709 break;
1710 case U_CHAR_NAME_CHOICE_COUNT:
1711 break;
1712 }
1713 break;
1714 }
1715 }
1716 return TRUE;
1717}
1718
1719struct enumExtCharNamesContext {
1720 uint32_t length;
1721 int32_t last;
1722};
1723
1724static UBool
1725enumExtCharNamesFn(void *context,
1726 UChar32 code, UCharNameChoice nameChoice,
1727 const char *name, int32_t length) {
1728 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1729
1730 if (ecncp->last != (int32_t) code - 1) {
1731 if (ecncp->last < 0) {
1732 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1733 } else {
1734 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1735 }
1736 }
1737 ecncp->last = (int32_t) code;
1738
1739 if (!*name) {
1740 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1741 }
1742
1743 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1744}
1745
1746/**
1747 * This can be made more efficient by moving it into putil.c and having
1748 * it directly access the ebcdic translation tables.
1749 * TODO: If we get this method in putil.c, then delete it from here.
1750 */
1751static UChar
1752u_charToUChar(char c) {
1753 UChar uc;
1754 u_charsToUChars(&c, &uc, 1);
1755 return uc;
1756}
1757
1758static void
1759TestCharNames() {
1760 static char name[80];
1761 UErrorCode errorCode=U_ZERO_ERROR;
1762 struct enumExtCharNamesContext extContext;
729e4ab9 1763 const char *expected;
b75a7d8f
A
1764 int32_t length;
1765 UChar32 c;
1766 int32_t i;
1767
1768 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1769 length=uprv_getMaxCharNameLength();
1770 if(length==0) {
1771 /* no names data available */
1772 return;
1773 }
1774 if(length<83) { /* Unicode 3.2 max char name length */
1775 log_err("uprv_getMaxCharNameLength()=%d is too short");
1776 }
1777 /* ### TODO same tests for max ISO comment length as for max name length */
1778
1779 log_verbose("Testing u_charName()\n");
2ca993e8 1780 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
b75a7d8f
A
1781 /* modern Unicode character name */
1782 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1783 if(U_FAILURE(errorCode)) {
1784 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1785 return;
1786 }
1787 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1788 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1789 }
1790
1791 /* find the modern name */
1792 if (*names[i].name) {
1793 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1794 if(U_FAILURE(errorCode)) {
1795 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1796 return;
1797 }
1798 if(c!=(UChar32)names[i].code) {
1799 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1800 }
1801 }
1802
1803 /* Unicode 1.0 character name */
1804 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1805 if(U_FAILURE(errorCode)) {
1806 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1807 return;
1808 }
1809 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1810 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1811 }
1812
1813 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1814 if(names[i].oldName[0]!=0 /* && length>0 */) {
1815 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1816 if(U_FAILURE(errorCode)) {
1817 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1818 return;
1819 }
1820 if(c!=(UChar32)names[i].code) {
1821 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1822 }
1823 }
729e4ab9
A
1824
1825 /* Unicode character name alias */
1826 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1827 if(U_FAILURE(errorCode)) {
1828 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1829 return;
1830 }
1831 expected=names[i].alias;
1832 if(expected==NULL) {
1833 expected="";
1834 }
1835 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1836 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1837 names[i].code, name, length, expected);
1838 }
1839
1840 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1841 if(expected[0]!=0 /* && length>0 */) {
1842 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1843 if(U_FAILURE(errorCode)) {
1844 log_err("u_charFromName(%s - alias) error %s\n",
1845 expected, u_errorName(errorCode));
1846 return;
1847 }
1848 if(c!=(UChar32)names[i].code) {
1849 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1850 expected, c, names[i].code);
1851 }
1852 }
b75a7d8f
A
1853 }
1854
1855 /* test u_enumCharNames() */
1856 length=0;
1857 errorCode=U_ZERO_ERROR;
1858 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1859 if(U_FAILURE(errorCode) || length<94140) {
1860 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1861 }
1862
1863 extContext.length = 0;
1864 extContext.last = -1;
1865 errorCode=U_ZERO_ERROR;
1866 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1867 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1868 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1869 }
1870
1871 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1872 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1873 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1874 }
1875
1876 /* Test getCharNameCharacters */
729e4ab9 1877 if(!getTestOption(QUICK_OPTION)) {
b75a7d8f
A
1878 enum { BUFSIZE = 256 };
1879 UErrorCode ec = U_ZERO_ERROR;
1880 char buf[BUFSIZE];
1881 int32_t maxLength;
1882 UChar32 cp;
1883 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1884 int32_t l1, l2;
1885 UBool map[256];
1886 UBool ok;
1887
1888 USet* set = uset_open(1, 0); /* empty set */
1889 USet* dumb = uset_open(1, 0); /* empty set */
1890
1891 /*
1892 * uprv_getCharNameCharacters() will likely return more lowercase
1893 * letters than actual character names contain because
1894 * it includes all the characters in lowercased names of
1895 * general categories, for the full possible set of extended names.
1896 */
374ca955
A
1897 {
1898 USetAdder sa={
1899 NULL,
1900 uset_add,
1901 uset_addRange,
73c04bcf
A
1902 uset_addString,
1903 NULL /* don't need remove() */
374ca955
A
1904 };
1905 sa.set=set;
1906 uprv_getCharNameCharacters(&sa);
1907 }
b75a7d8f
A
1908
1909 /* build set the dumb (but sure-fire) way */
374ca955 1910 for (i=0; i<256; ++i) {
b75a7d8f 1911 map[i] = FALSE;
374ca955 1912 }
b75a7d8f
A
1913
1914 maxLength=0;
1915 for (cp=0; cp<0x110000; ++cp) {
1916 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1917 buf, BUFSIZE, &ec);
1918 if (U_FAILURE(ec)) {
1919 log_err("FAIL: u_charName failed when it shouldn't\n");
1920 uset_close(set);
1921 uset_close(dumb);
1922 return;
1923 }
1924 if(len>maxLength) {
1925 maxLength=len;
1926 }
1927
1928 for (i=0; i<len; ++i) {
1929 if (!map[(uint8_t) buf[i]]) {
1930 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1931 map[(uint8_t) buf[i]] = TRUE;
1932 }
1933 }
374ca955
A
1934
1935 /* test for leading/trailing whitespace */
1936 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1937 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1938 }
1939 }
1940
1941 if(map[(uint8_t)'\t']) {
1942 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
b75a7d8f
A
1943 }
1944
1945 length=uprv_getMaxCharNameLength();
1946 if(length!=maxLength) {
1947 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1948 length, maxLength);
1949 }
1950
1951 /* compare the sets. Where is my uset_equals?!! */
1952 ok=TRUE;
1953 for(i=0; i<256; ++i) {
1954 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1955 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1956 /* ignore lowercase a-z that are in set but not in dumb */
1957 ok=TRUE;
1958 } else {
1959 ok=FALSE;
1960 break;
1961 }
1962 }
1963 }
1964
1965 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1966 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1967 if (U_FAILURE(ec)) {
1968 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1969 uset_close(set);
1970 uset_close(dumb);
1971 return;
1972 }
1973
1974 if (l1 >= BUFSIZE) {
1975 l1 = BUFSIZE-1;
1976 pat[l1] = 0;
1977 }
1978 if (l2 >= BUFSIZE) {
1979 l2 = BUFSIZE-1;
1980 dumbPat[l2] = 0;
1981 }
1982
1983 if (!ok) {
b75a7d8f 1984 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
374ca955 1985 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
729e4ab9 1986 } else if(getTestOption(VERBOSITY_OPTION)) {
374ca955 1987 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
b75a7d8f
A
1988 }
1989
1990 uset_close(set);
1991 uset_close(dumb);
1992 }
1993
1994 /* ### TODO: test error cases and other interesting things */
1995}
1996
b331163b
A
1997static void
1998TestUCharFromNameUnderflow() {
1999 // Ticket #10889: Underflow crash when there is no dash.
2000 UErrorCode errorCode=U_ZERO_ERROR;
2001 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<NO BREAK SPACE>", &errorCode);
2002 if(U_SUCCESS(errorCode)) {
2003 log_err("u_charFromName(<NO BREAK SPACE>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2004 }
2005
2006 // Test related edge cases.
2007 errorCode=U_ZERO_ERROR;
2008 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<-00a0>", &errorCode);
2009 if(U_SUCCESS(errorCode)) {
2010 log_err("u_charFromName(<-00a0>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2011 }
2012
2013 errorCode=U_ZERO_ERROR;
2014 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control->", &errorCode);
2015 if(U_SUCCESS(errorCode)) {
2016 log_err("u_charFromName(<control->) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2017 }
2018
2019 errorCode=U_ZERO_ERROR;
2020 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control-111111>", &errorCode);
2021 if(U_SUCCESS(errorCode)) {
2022 log_err("u_charFromName(<control-111111>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2023 }
2024}
2025
b75a7d8f
A
2026/* test u_isMirrored() and u_charMirror() ----------------------------------- */
2027
2028static void
2029TestMirroring() {
73c04bcf
A
2030 USet *set;
2031 UErrorCode errorCode;
2032
2033 UChar32 start, end, c2, c3;
2034 int32_t i;
2035
2036 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2037
2038 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2039
b75a7d8f
A
2040 log_verbose("Testing u_isMirrored()\n");
2041 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2042 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2043 )
2044 ) {
2045 log_err("u_isMirrored() does not work correctly\n");
2046 }
2047
2048 log_verbose("Testing u_charMirror()\n");
2049 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
73c04bcf 2050 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
46f4442e
A
2051 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2052 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2053 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
b75a7d8f
A
2054 )
2055 ) {
2056 log_err("u_charMirror() does not work correctly\n");
2057 }
73c04bcf
A
2058
2059 /* verify that Bidi_Mirroring_Glyph roundtrips */
2060 errorCode=U_ZERO_ERROR;
2061 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2062
2063 if (U_FAILURE(errorCode)) {
729e4ab9 2064 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
73c04bcf
A
2065 } else {
2066 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2067 do {
2068 c2=u_charMirror(start);
2069 c3=u_charMirror(c2);
2070 if(c3!=start) {
2071 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2072 }
57a6839d
A
2073 c3=u_getBidiPairedBracket(start);
2074 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2075 if(c3!=start) {
2076 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2077 (long)start);
2078 }
2079 } else {
2080 if(c3!=c2) {
2081 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2082 (long)start, (long)c2);
2083 }
2084 }
73c04bcf
A
2085 } while(++start<=end);
2086 }
2087 }
2088
2089 uset_close(set);
b75a7d8f
A
2090}
2091
2092
2093struct RunTestData
2094{
2095 const char *runText;
2096 UScriptCode runCode;
2097};
2098
2099typedef struct RunTestData RunTestData;
2100
2101static void
2102CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2103 const char *prefix)
2104{
2105 int32_t run, runStart, runLimit;
2106 UScriptCode runCode;
2107
2108 /* iterate over all the runs */
2109 run = 0;
2110 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2111 if (runStart != runStarts[run]) {
2112 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2113 prefix, run, runStarts[run], runStart);
2114 }
2115
2116 if (runLimit != runStarts[run + 1]) {
2117 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2118 prefix, run, runStarts[run + 1], runLimit);
2119 }
2120
2121 if (runCode != testData[run].runCode) {
2122 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2123 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2124 }
2125
2126 run += 1;
2127
2128 /* stop when we've seen all the runs we expect to see */
2129 if (run >= nRuns) {
2130 break;
2131 }
2132 }
2133
2134 /* Complain if we didn't see then number of runs we expected */
2135 if (run != nRuns) {
2136 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2137 }
2138}
2139
2140static void
2141TestUScriptRunAPI()
2142{
374ca955 2143 static const RunTestData testData1[] = {
b75a7d8f
A
2144 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2145 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2146 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2147 {"English (", USCRIPT_LATIN},
2148 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2149 {") ", USCRIPT_LATIN},
2150 {"\\u6F22\\u5B75", USCRIPT_HAN},
2151 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2152 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2153 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2154 };
374ca955
A
2155
2156 static const RunTestData testData2[] = {
2157 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2158 };
2159
2160 static const struct {
2161 const RunTestData *testData;
2162 int32_t nRuns;
2163 } testDataEntries[] = {
b331163b
A
2164 {testData1, UPRV_LENGTHOF(testData1)},
2165 {testData2, UPRV_LENGTHOF(testData2)}
374ca955
A
2166 };
2167
b331163b 2168 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
374ca955
A
2169 int32_t testEntry;
2170
2171 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2172 UChar testString[1024];
2173 int32_t runStarts[256];
2174 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2175 const RunTestData *testData = testDataEntries[testEntry].testData;
2176
2177 int32_t run, stringLimit;
2178 UScriptRun *scriptRun = NULL;
2179 UErrorCode err;
2180
2181 /*
2182 * Fill in the test string and the runStarts array.
2183 */
2184 stringLimit = 0;
2185 for (run = 0; run < nTestRuns; run += 1) {
2186 runStarts[run] = stringLimit;
2187 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2188 /*stringLimit -= 1;*/
2189 }
2190
2191 /* The limit of the last run */
2192 runStarts[nTestRuns] = stringLimit;
2193
2194 /*
2195 * Make sure that calling uscript_OpenRun with a NULL text pointer
2196 * and a non-zero text length returns the correct error.
2197 */
2198 err = U_ZERO_ERROR;
2199 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2200
2201 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2202 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2203 }
2204
2205 if (scriptRun != NULL) {
2206 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2207 uscript_closeRun(scriptRun);
2208 }
2209
2210 /*
2211 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2212 * and a zero text length returns the correct error.
2213 */
2214 err = U_ZERO_ERROR;
2215 scriptRun = uscript_openRun(testString, 0, &err);
2216
2217 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2218 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2219 }
2220
2221 if (scriptRun != NULL) {
2222 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2223 uscript_closeRun(scriptRun);
2224 }
2225
2226 /*
2227 * Make sure that calling uscript_openRun with a NULL text pointer
2228 * and a zero text length doesn't return an error.
2229 */
2230 err = U_ZERO_ERROR;
2231 scriptRun = uscript_openRun(NULL, 0, &err);
2232
2233 if (U_FAILURE(err)) {
2234 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2235 }
2236
2237 /* Make sure that the empty iterator doesn't find any runs */
2238 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2239 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2240 }
2241
2242 /*
2243 * Make sure that calling uscript_setRunText with a NULL text pointer
2244 * and a non-zero text length returns the correct error.
2245 */
2246 err = U_ZERO_ERROR;
2247 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2248
2249 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2250 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2251 }
2252
2253 /*
2254 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2255 * and a zero text length returns the correct error.
2256 */
2257 err = U_ZERO_ERROR;
2258 uscript_setRunText(scriptRun, testString, 0, &err);
2259
2260 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2261 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2262 }
2263
2264 /*
2265 * Now call uscript_setRunText on the empty iterator
2266 * and make sure that it works.
2267 */
2268 err = U_ZERO_ERROR;
2269 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2270
2271 if (U_FAILURE(err)) {
2272 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2273 } else {
2274 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2275 }
2276
b75a7d8f 2277 uscript_closeRun(scriptRun);
374ca955
A
2278
2279 /*
2280 * Now open an interator over the testString
2281 * using uscript_openRun and make sure that it works
2282 */
2283 scriptRun = uscript_openRun(testString, stringLimit, &err);
2284
2285 if (U_FAILURE(err)) {
2286 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2287 } else {
2288 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2289 }
2290
2291 /* Now reset the iterator, and make sure
2292 * that it still works.
2293 */
2294 uscript_resetRun(scriptRun);
2295
2296 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2297
2298 /* Close the iterator */
b75a7d8f
A
2299 uscript_closeRun(scriptRun);
2300 }
b75a7d8f
A
2301}
2302
2303/* test additional, non-core properties */
2304static void
2305TestAdditionalProperties() {
2306 /* test data for u_charAge() */
2307 static const struct {
2308 UChar32 c;
2309 UVersionInfo version;
2310 } charAges[]={
2311 {0x41, { 1, 1, 0, 0 }},
2312 {0xffff, { 1, 1, 0, 0 }},
2313 {0x20ab, { 2, 0, 0, 0 }},
2314 {0x2fffe, { 2, 0, 0, 0 }},
2315 {0x20ac, { 2, 1, 0, 0 }},
2316 {0xfb1d, { 3, 0, 0, 0 }},
2317 {0x3f4, { 3, 1, 0, 0 }},
2318 {0x10300, { 3, 1, 0, 0 }},
2319 {0x220, { 3, 2, 0, 0 }},
2320 {0xff60, { 3, 2, 0, 0 }}
2321 };
2322
2323 /* test data for u_hasBinaryProperty() */
46f4442e 2324 static const int32_t
b75a7d8f
A
2325 props[][3]={ /* code point, property, value */
2326 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2327 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2328 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2329
2330 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2331 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2332
2333 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2334 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2335
2336 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2337 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2338
46f4442e
A
2339 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2340 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2341 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2342 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2343 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2344
b75a7d8f
A
2345 { 0x058a, UCHAR_DASH, TRUE },
2346 { 0x007e, UCHAR_DASH, FALSE },
2347
2348 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2349 { 0x3000, UCHAR_DIACRITIC, FALSE },
2350
2351 { 0x0e46, UCHAR_EXTENDER, TRUE },
2352 { 0x0020, UCHAR_EXTENDER, FALSE },
2353
2354#if !UCONFIG_NO_NORMALIZATION
2355 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2356 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2357 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
374ca955
A
2358
2359 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2360 { 0x0308, UCHAR_NFD_INERT, FALSE },
2361
2362 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2363 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2364
2365 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2366 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2367 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2368 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2369 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2370 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2371
2372 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2373 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2374
2375 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2376 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2377 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2378 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2379 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2380 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
b75a7d8f
A
2381#endif
2382
2383 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2384 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2385 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2386
2387 { 0x30fb, UCHAR_HYPHEN, TRUE },
2388 { 0xfe58, UCHAR_HYPHEN, FALSE },
2389
2390 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2391 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2392 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2393
2394 { 0x2172, UCHAR_ID_START, TRUE },
2395 { 0x007a, UCHAR_ID_START, TRUE },
2396 { 0x0039, UCHAR_ID_START, FALSE },
2397
2398 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2399 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2400 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2401
2402 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2403 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2404
2405 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2406 { 0x0345, UCHAR_LOWERCASE, TRUE },
2407 { 0x0030, UCHAR_LOWERCASE, FALSE },
2408
2409 { 0x1d7a9, UCHAR_MATH, TRUE },
2410 { 0x2135, UCHAR_MATH, TRUE },
2411 { 0x0062, UCHAR_MATH, FALSE },
2412
2413 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2414 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2415 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2416
2417 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2418 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2419 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2420
2421 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2422 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2423
2424 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2425 { 0x2162, UCHAR_UPPERCASE, TRUE },
2426 { 0x0345, UCHAR_UPPERCASE, FALSE },
2427
2428 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2429 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2430 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2431
2432 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2433 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2434 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2435
2436 { 0x16ee, UCHAR_XID_START, TRUE },
2437 { 0x23456, UCHAR_XID_START, TRUE },
2438 { 0x1d1aa, UCHAR_XID_START, FALSE },
2439
2440 /*
2441 * Version break:
2442 * The following properties are only supported starting with the
2443 * Unicode version indicated in the second field.
2444 */
374ca955 2445 { -1, 0x320, 0 },
b75a7d8f
A
2446
2447 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2448 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2449 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2450
729e4ab9
A
2451 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2452 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2ca993e8 2453 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
46f4442e 2454 { 0xe0100, UCHAR_DEPRECATED, FALSE },
b75a7d8f
A
2455
2456 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2457 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
46f4442e
A
2458 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2459 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
b75a7d8f
A
2460
2461 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
46f4442e
A
2462 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2463 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
b75a7d8f
A
2464 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2465
2466 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2467 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2468
2469 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2470 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2471
2472 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2473 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2474
2475 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2476 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2477
2478 { 0x2e9b, UCHAR_RADICAL, TRUE },
2479 { 0x4e00, UCHAR_RADICAL, FALSE },
2480
2481 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2482 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2483
2484 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2485 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2486
73c04bcf 2487 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
374ca955
A
2488
2489 { 0x002e, UCHAR_S_TERM, TRUE },
2490 { 0x0061, UCHAR_S_TERM, FALSE },
2491
2492 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2493 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2494 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2495 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2496
b75a7d8f
A
2497 /* enum/integer type properties */
2498
2499 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2500 /* test default Bidi classes for unassigned code points */
2501 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
73c04bcf 2502 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
b75a7d8f 2503 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
73c04bcf
A
2504 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2505 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
4388f060 2506 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
b75a7d8f
A
2507 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2508 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2509 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2510 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2511 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2512
b331163b 2513 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
b75a7d8f
A
2514 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2515 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2516 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2517 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2518 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2519 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2520
2521 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2522 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2523 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2524 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
374ca955 2525 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
0f5d89e8 2526 { 0x0870, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
b75a7d8f
A
2527 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2528 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
374ca955 2529 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
b75a7d8f 2530 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
374ca955 2531 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
b75a7d8f
A
2532
2533 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2534 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2535
2536 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2537 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2538 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2539 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2540 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2541 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2542 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2543 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2544 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2545
2546 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2547 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2548 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2549 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2550 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2551 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2552 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2553 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2554 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2555 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2556 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2557 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2558 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2559 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2560 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2561 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2562 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2563
2564 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
729e4ab9
A
2565 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2566 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
b75a7d8f
A
2567
2568 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2569 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2570 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2571 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2572 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
b75a7d8f
A
2573
2574 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2575 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2576 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2577 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2578 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2579 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2580 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2581 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2582
2583 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2584 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2585 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2586 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2587 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2588 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2589 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
b75a7d8f
A
2590 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2591 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2592 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2593 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2594 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2595 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2596 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2597 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2598 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2599
2600 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2601
0f5d89e8 2602 /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
b75a7d8f 2603
729e4ab9 2604 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
b75a7d8f
A
2605 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2606 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2607 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
729e4ab9
A
2608 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2609 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
b75a7d8f
A
2610 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2611
729e4ab9
A
2612 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2613 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2614 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2615 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2616
b75a7d8f
A
2617 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2618 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2619 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2620 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
729e4ab9
A
2621 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2622 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2623
2624 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2625 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2626 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2627 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
b75a7d8f
A
2628
2629 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2630 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2631 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2632 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
729e4ab9
A
2633 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2634 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2635 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
b75a7d8f 2636
729e4ab9
A
2637 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2638 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2639 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2640 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
b75a7d8f
A
2641
2642 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2643 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2644 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2645 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2646
2647 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2648 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2649 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2650 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2651 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2652
2653 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2654
73c04bcf
A
2655 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2656
2657 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2658 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2659 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2660
2661 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2662 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2663 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2664 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2665 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2666
2667 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2668 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2669 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2670
2671 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2672 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2673 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2674 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2675
2676 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2677 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2678 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2679 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2680 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2681 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2682
2683 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2684 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2685 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2686 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2687
2688 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2689 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2690 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2691 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2692
2693 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2694 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2695 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2696 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2697
729e4ab9
A
2698 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2699
4388f060
A
2700 /* unassigned code points in new default Bidi R blocks */
2701 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2702 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2703
729e4ab9
A
2704 /* test some script codes >127 */
2705 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2706 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2707 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2708
2709 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2710
2711 /* value changed in Unicode 6.0 */
2712 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2713
4388f060
A
2714 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2715
2716 /* unassigned code points in new/changed default Bidi AL blocks */
2717 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2718 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2719
57a6839d
A
2720 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2721
2722 /* unassigned code points in the currency symbols block now default to ET */
2723 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2724 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2725
2726 /* new property in Unicode 6.3 */
2727 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2728 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2729 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2730 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2731 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2732 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2733
b331163b
A
2734 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2735
2736 /* new character range with Joining_Group values */
2737 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2738 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2739 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2740 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2741 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2742
6be67b06
A
2743 { -1, 0xa00, 0 }, // version break for Unicode 10
2744
2745 { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2746 { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2747 { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2748 { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2749
2750 { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2751 { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2752 { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2753
b75a7d8f
A
2754 /* undefined UProperty values */
2755 { 0x61, 0x4a7, 0 },
2756 { 0x234bc, 0x15ed, 0 }
2757 };
2758
2759 UVersionInfo version;
2760 UChar32 c;
2761 int32_t i, result, uVersion;
2762 UProperty which;
2763
2764 /* what is our Unicode version? */
2765 u_getUnicodeVersion(version);
374ca955 2766 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
b75a7d8f
A
2767
2768 u_charAge(0x20, version);
2769 if(version[0]==0) {
2770 /* no additional properties available */
2771 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2772 return;
2773 }
2774
2775 /* test u_charAge() */
2ca993e8 2776 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
b75a7d8f
A
2777 u_charAge(charAges[i].c, version);
2778 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2779 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2780 charAges[i].c,
2781 version[0], version[1], version[2], version[3],
2782 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2783 }
2784 }
2785
2786 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2787 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2788 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2789 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2790 u_getIntPropertyMinValue(0x2345)!=0
2791 ) {
2792 log_err("error: u_getIntPropertyMinValue() wrong\n");
2793 }
73c04bcf
A
2794 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2795 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2796 }
2797 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2798 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2799 }
46f4442e 2800 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
73c04bcf
A
2801 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2802 }
2803 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2804 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2805 }
2806 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2807 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2808 }
2809 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2810 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2811 }
2812 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2813 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2814 }
2815 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2816 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2817 }
2818 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2819 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2820 }
2821 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2822 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2823 }
2824 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2825 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2826 }
2827 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2828 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2829 }
2830 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2831 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2832 }
57a6839d
A
2833 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2834 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2835 }
73c04bcf
A
2836 /*JB#2410*/
2837 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2838 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2839 }
2840 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2841 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2842 }
2843 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2844 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2845 }
2846 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2847 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2848 }
2849 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2850 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
b75a7d8f
A
2851 }
2852
2853 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2ca993e8 2854 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
729e4ab9
A
2855 const char *whichName;
2856
b75a7d8f
A
2857 if(props[i][0]<0) {
2858 /* Unicode version break */
2859 if(uVersion<props[i][1]) {
2860 break; /* do not test properties that are not yet supported */
2861 } else {
2862 continue; /* skip this row */
2863 }
2864 }
2865
2866 c=(UChar32)props[i][0];
2867 which=(UProperty)props[i][1];
729e4ab9 2868 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
b75a7d8f
A
2869
2870 if(which<UCHAR_INT_START) {
2871 result=u_hasBinaryProperty(c, which);
2872 if(result!=props[i][2]) {
729e4ab9
A
2873 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2874 c, whichName, result, i);
b75a7d8f
A
2875 }
2876 }
2877
2878 result=u_getIntPropertyValue(c, which);
2879 if(result!=props[i][2]) {
729e4ab9
A
2880 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2881 c, whichName, result, props[i][2], i);
b75a7d8f
A
2882 }
2883
2884 /* test separate functions, too */
2885 switch((UProperty)props[i][1]) {
2886 case UCHAR_ALPHABETIC:
2887 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2888 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2889 props[i][0], result, i);
2890 }
2891 break;
2892 case UCHAR_LOWERCASE:
2893 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2894 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2895 props[i][0], result, i);
2896 }
2897 break;
2898 case UCHAR_UPPERCASE:
2899 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2900 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2901 props[i][0], result, i);
2902 }
2903 break;
2904 case UCHAR_WHITE_SPACE:
2905 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2906 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2907 props[i][0], result, i);
2908 }
2909 break;
2910 default:
2911 break;
2912 }
2913 }
2914}
2915
2916static void
2917TestNumericProperties(void) {
2918 /* see UnicodeData.txt, DerivedNumericValues.txt */
2919 static const struct {
2920 UChar32 c;
2921 int32_t type;
2922 double numValue;
2923 } values[]={
2924 { 0x0F33, U_NT_NUMERIC, -1./2. },
2925 { 0x0C66, U_NT_DECIMAL, 0 },
2926 { 0x96f6, U_NT_NUMERIC, 0 },
729e4ab9
A
2927 { 0xa833, U_NT_NUMERIC, 1./16. },
2928 { 0x2152, U_NT_NUMERIC, 1./10. },
2929 { 0x2151, U_NT_NUMERIC, 1./9. },
2930 { 0x1245f, U_NT_NUMERIC, 1./8. },
2931 { 0x2150, U_NT_NUMERIC, 1./7. },
b75a7d8f 2932 { 0x2159, U_NT_NUMERIC, 1./6. },
729e4ab9
A
2933 { 0x09f6, U_NT_NUMERIC, 3./16. },
2934 { 0x2155, U_NT_NUMERIC, 1./5. },
b75a7d8f
A
2935 { 0x00BD, U_NT_NUMERIC, 1./2. },
2936 { 0x0031, U_NT_DECIMAL, 1. },
2937 { 0x4e00, U_NT_NUMERIC, 1. },
2938 { 0x58f1, U_NT_NUMERIC, 1. },
2939 { 0x10320, U_NT_NUMERIC, 1. },
2940 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2941 { 0x00B2, U_NT_DIGIT, 2. },
2942 { 0x5f10, U_NT_NUMERIC, 2. },
2943 { 0x1813, U_NT_DECIMAL, 3. },
2944 { 0x5f0e, U_NT_NUMERIC, 3. },
2945 { 0x2173, U_NT_NUMERIC, 4. },
2946 { 0x8086, U_NT_NUMERIC, 4. },
2947 { 0x278E, U_NT_DIGIT, 5. },
2948 { 0x1D7F2, U_NT_DECIMAL, 6. },
2949 { 0x247A, U_NT_DIGIT, 7. },
2950 { 0x7396, U_NT_NUMERIC, 9. },
2951 { 0x1372, U_NT_NUMERIC, 10. },
2952 { 0x216B, U_NT_NUMERIC, 12. },
2953 { 0x16EE, U_NT_NUMERIC, 17. },
2954 { 0x249A, U_NT_NUMERIC, 19. },
2955 { 0x303A, U_NT_NUMERIC, 30. },
2956 { 0x5345, U_NT_NUMERIC, 30. },
2957 { 0x32B2, U_NT_NUMERIC, 37. },
2958 { 0x1375, U_NT_NUMERIC, 40. },
2959 { 0x10323, U_NT_NUMERIC, 50. },
2960 { 0x0BF1, U_NT_NUMERIC, 100. },
2961 { 0x964c, U_NT_NUMERIC, 100. },
2962 { 0x217E, U_NT_NUMERIC, 500. },
2963 { 0x2180, U_NT_NUMERIC, 1000. },
2964 { 0x4edf, U_NT_NUMERIC, 1000. },
2965 { 0x2181, U_NT_NUMERIC, 5000. },
2966 { 0x137C, U_NT_NUMERIC, 10000. },
2967 { 0x4e07, U_NT_NUMERIC, 10000. },
51004dcb
A
2968 { 0x12432, U_NT_NUMERIC, 216000. },
2969 { 0x12433, U_NT_NUMERIC, 432000. },
b75a7d8f
A
2970 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2971 { 0x5146, U_NT_NUMERIC, 1000000000000. },
729e4ab9 2972 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
b75a7d8f
A
2973 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2974 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2975 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2976 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2977 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
729e4ab9
A
2978 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2979 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
b75a7d8f
A
2980 };
2981
2982 double nv;
2983 UChar32 c;
2984 int32_t i, type;
2985
b331163b 2986 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
b75a7d8f
A
2987 c=values[i].c;
2988 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2989 nv=u_getNumericValue(c);
2990
2991 if(type!=values[i].type) {
2992 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2993 }
2994 if(0.000001 <= fabs(nv - values[i].numValue)) {
2995 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2996 }
2997 }
2998}
2999
3000/**
3001 * Test the property names and property value names API.
3002 */
3003static void
3004TestPropertyNames(void) {
3005 int32_t p, v, choice=0, rev;
3006 UBool atLeastSomething = FALSE;
3007
3008 for (p=0; ; ++p) {
46f4442e 3009 UProperty propEnum = (UProperty)p;
b75a7d8f
A
3010 UBool sawProp = FALSE;
3011 if(p > 10 && !atLeastSomething) {
3012 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3013 return;
3014 }
3015
3016 for (choice=0; ; ++choice) {
46f4442e 3017 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
b75a7d8f 3018 if (name) {
46f4442e
A
3019 if (!sawProp)
3020 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
b75a7d8f
A
3021 log_verbose("%d=\"%s\"", choice, name);
3022 sawProp = TRUE;
3023 atLeastSomething = TRUE;
3024
3025 /* test reverse mapping */
3026 rev = u_getPropertyEnum(name);
3027 if (rev != p) {
3028 log_err("Property round-trip failure: %d -> %s -> %d\n",
3029 p, name, rev);
3030 }
3031 }
3032 if (!name && choice>0) break;
3033 }
3034 if (sawProp) {
3035 /* looks like a valid property; check the values */
46f4442e 3036 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
b75a7d8f
A
3037 int32_t max = 0;
3038 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3039 max = 255;
3040 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3041 /* it's far too slow to iterate all the way up to
3042 the real max, U_GC_P_MASK */
3043 max = U_GC_NL_MASK;
3044 } else if (p == UCHAR_BLOCK) {
3045 /* UBlockCodes, unlike other values, start at 1 */
3046 max = 1;
3047 }
3048 log_verbose("\n");
3049 for (v=-1; ; ++v) {
3050 UBool sawValue = FALSE;
3051 for (choice=0; ; ++choice) {
46f4442e 3052 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
b75a7d8f
A
3053 if (vname) {
3054 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3055 log_verbose("%d=\"%s\"", choice, vname);
3056 sawValue = TRUE;
3057
3058 /* test reverse mapping */
46f4442e 3059 rev = u_getPropertyValueEnum(propEnum, vname);
b75a7d8f
A
3060 if (rev != v) {
3061 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3062 pname, v, vname, rev);
3063 }
3064 }
3065 if (!vname && choice>0) break;
3066 }
3067 if (sawValue) {
3068 log_verbose("\n");
3069 }
3070 if (!sawValue && v>=max) break;
3071 }
3072 }
3073 if (!sawProp) {
3074 if (p>=UCHAR_STRING_LIMIT) {
3075 break;
3076 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3077 p = UCHAR_STRING_START - 1;
3078 } else if (p>=UCHAR_MASK_LIMIT) {
3079 p = UCHAR_DOUBLE_START - 1;
3080 } else if (p>=UCHAR_INT_LIMIT) {
3081 p = UCHAR_MASK_START - 1;
3082 } else if (p>=UCHAR_BINARY_LIMIT) {
3083 p = UCHAR_INT_START - 1;
3084 }
3085 }
3086 }
3087}
3088
3089/**
3090 * Test the property values API. See JB#2410.
3091 */
3092static void
3093TestPropertyValues(void) {
3094 int32_t i, p, min, max;
3095 UErrorCode ec;
3096
3097 /* Min should be 0 for everything. */
3098 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3099 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
46f4442e
A
3100 UProperty propEnum = (UProperty)p;
3101 min = u_getIntPropertyMinValue(propEnum);
b75a7d8f
A
3102 if (min != 0) {
3103 if (p == UCHAR_BLOCK) {
3104 /* This is okay...for now. See JB#2487.
3105 TODO Update this for JB#2487. */
3106 } else {
3107 const char* name;
46f4442e
A
3108 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3109 if (name == NULL)
3110 name = "<ERROR>";
b75a7d8f
A
3111 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3112 name, min);
3113 }
3114 }
3115 }
3116
3117 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3118 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3119 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3120 }
3121
3122 /* Max should be -1 for invalid properties. */
46f4442e 3123 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
b75a7d8f
A
3124 if (max != -1) {
3125 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3126 max);
3127 }
3128
73c04bcf 3129 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
b75a7d8f
A
3130 for (i=0; i<2; ++i) {
3131 int32_t script;
3132 const char* desc;
3133 ec = U_ZERO_ERROR;
3134 switch (i) {
3135 case 0:
3136 script = uscript_getScript(-1, &ec);
3137 desc = "uscript_getScript(-1)";
3138 break;
3139 case 1:
3140 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3141 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3142 break;
3143 default:
3144 log_err("Internal test error. Too many scripts\n");
3145 return;
3146 }
3147 /* We don't explicitly test ec. It should be U_FAILURE but it
3148 isn't documented as such. */
73c04bcf 3149 if (script != (int32_t)USCRIPT_INVALID_CODE) {
b75a7d8f
A
3150 log_err("FAIL: %s = %d, exp. 0\n",
3151 desc, script);
3152 }
3153 }
3154}
3155
b75a7d8f
A
3156/* various tests for consistency of UCD data and API behavior */
3157static void
3158TestConsistency() {
b75a7d8f
A
3159 char buffer[300];
3160 USet *set1, *set2, *set3, *set4;
3161 UErrorCode errorCode;
3162
b75a7d8f
A
3163 UChar32 start, end;
3164 int32_t i, length;
3165
3166 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3167 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3168 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3169 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3170 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3171
73c04bcf
A
3172 U_STRING_DECL(mathBlocksPattern,
3173 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
57a6839d 3174 214);
73c04bcf
A
3175 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3176 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3177 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3178 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3179
b75a7d8f
A
3180 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3181 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3182 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3183 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3184 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3185
73c04bcf
A
3186 U_STRING_INIT(mathBlocksPattern,
3187 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
57a6839d 3188 214);
73c04bcf
A
3189 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3190 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3191 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3192 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3193
b75a7d8f
A
3194 /*
3195 * It used to be that UCD.html and its precursors said
3196 * "Those dashes used to mark connections between pieces of words,
3197 * plus the Katakana middle dot."
3198 *
3199 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3200 * but not from Hyphen.
729e4ab9 3201 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
b75a7d8f
A
3202 * Therefore, do not show errors when testing the Hyphen property.
3203 */
3204 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3205 "known to the UTC and not considered errors.\n");
3206
3207 errorCode=U_ZERO_ERROR;
3208 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3209 set2=uset_openPattern(dashPattern, 8, &errorCode);
3210 if(U_SUCCESS(errorCode)) {
3211 /* remove the Katakana middle dot(s) from set1 */
3212 uset_remove(set1, 0x30fb);
3213 uset_remove(set1, 0xff65); /* halfwidth variant */
3214 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3215 } else {
729e4ab9 3216 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
b75a7d8f
A
3217 }
3218
3219 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3220 set3=uset_openPattern(formatPattern, 6, &errorCode);
3221 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3222 if(U_SUCCESS(errorCode)) {
3223 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3224 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3225 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3226 } else {
729e4ab9 3227 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
b75a7d8f
A
3228 }
3229
3230 uset_close(set1);
3231 uset_close(set2);
3232 uset_close(set3);
3233 uset_close(set4);
3234
3235 /*
3236 * Check that each lowercase character has "small" in its name
3237 * and not "capital".
3238 * There are some such characters, some of which seem odd.
3239 * Use the verbose flag to see these notices.
3240 */
3241 errorCode=U_ZERO_ERROR;
3242 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3243 if(U_SUCCESS(errorCode)) {
3244 for(i=0;; ++i) {
3245 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3246 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3247 break; /* done */
3248 }
3249 if(U_FAILURE(errorCode)) {
3250 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3251 i, u_errorName(errorCode));
3252 break;
3253 }
3254 if(length!=0) {
3255 break; /* done with code points, got a string or -1 */
3256 }
3257
3258 while(start<=end) {
3259 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3260 if(U_FAILURE(errorCode)) {
4388f060 3261 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
b75a7d8f 3262 errorCode=U_ZERO_ERROR;
b75a7d8f
A
3263 }
3264 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3265 strstr(buffer, "SMALL CAPITAL")==NULL
3266 ) {
3267 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3268 }
3269 ++start;
3270 }
3271 }
3272 } else {
729e4ab9 3273 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
b75a7d8f 3274 }
b75a7d8f 3275 uset_close(set1);
73c04bcf
A
3276
3277 /* verify that all assigned characters in Math blocks are exactly Math characters */
3278 errorCode=U_ZERO_ERROR;
3279 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3280 set2=uset_openPattern(mathPattern, 8, &errorCode);
3281 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3282 if(U_SUCCESS(errorCode)) {
3283 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3284 uset_complement(set3); /* assigned characters */
3285 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3286 compareUSets(set1, set2,
3287 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3288 TRUE);
3289 } else {
729e4ab9 3290 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
73c04bcf
A
3291 }
3292 uset_close(set1);
3293 uset_close(set2);
3294 uset_close(set3);
3295
3296 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3297 errorCode=U_ZERO_ERROR;
3298 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3299 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3300 if(U_SUCCESS(errorCode)) {
3301 compareUSets(set1, set2,
3302 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3303 TRUE);
3304 } else {
729e4ab9 3305 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
73c04bcf
A
3306 }
3307 uset_close(set1);
3308 uset_close(set2);
b75a7d8f 3309}
374ca955 3310
73c04bcf
A
3311/* test case folding, compare return values with CaseFolding.txt ------------ */
3312
/* bit flags recording which kinds of case folding have already been tested for a character */
enum {
    CF_SIMPLE=1<<0,                         /* simple (single code point) folding */
    CF_FULL=1<<1,                           /* full (string) folding */
    CF_TURKIC=1<<2,                         /* folding with the Turkic option */
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC      /* all of the above */
};
3320
3321static void
3322testFold(UChar32 c, int which,
3323 UChar32 simple, UChar32 turkic,
3324 const UChar *full, int32_t fullLength,
3325 const UChar *turkicFull, int32_t turkicFullLength) {
3326 UChar s[2], t[32];
3327 UChar32 c2;
3328 int32_t length, length2;
3329
3330 UErrorCode errorCode=U_ZERO_ERROR;
3331
3332 length=0;
3333 U16_APPEND_UNSAFE(s, length, c);
3334
3335 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3336 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3337 }
3338 if((which&CF_FULL)!=0) {
b331163b 3339 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
73c04bcf
A
3340 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3341 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3342 }
3343 }
3344 if((which&CF_TURKIC)!=0) {
3345 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3346 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3347 }
3348
b331163b 3349 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
73c04bcf
A
3350 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3351 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3352 }
3353 }
3354}
3355
3356/* test that c case-folds to itself */
3357static void
3358testFoldToSelf(UChar32 c, int which) {
3359 UChar s[2];
3360 int32_t length;
3361
3362 length=0;
3363 U16_APPEND_UNSAFE(s, length, c);
3364 testFold(c, which, c, c, s, length, s, length);
3365}
3366
3367struct CaseFoldingData {
3368 USet *notSeen;
3369 UChar32 prev, prevSimple;
3370 UChar prevFull[32];
3371 int32_t prevFullLength;
3372 int which;
3373};
3374typedef struct CaseFoldingData CaseFoldingData;
3375
3376static void U_CALLCONV
3377caseFoldingLineFn(void *context,
3378 char *fields[][2], int32_t fieldCount,
3379 UErrorCode *pErrorCode) {
3380 CaseFoldingData *pData=(CaseFoldingData *)context;
3381 char *end;
3382 UChar full[32];
3383 UChar32 c, prev, simple;
3384 int32_t count;
3385 int which;
3386 char status;
3387
3388 /* get code point */
4388f060
A
3389 const char *s=u_skipWhitespace(fields[0][0]);
3390 if(0==strncmp(s, "0000..10FFFF", 12)) {
3391 /*
3392 * Ignore the line
3393 * # @missing: 0000..10FFFF; C; <code point>
3394 * because maps-to-self is already our default, and this line breaks this parser.
3395 */
3396 return;
3397 }
3398 c=(UChar32)strtoul(s, &end, 16);
73c04bcf
A
3399 end=(char *)u_skipWhitespace(end);
3400 if(end<=fields[0][0] || end!=fields[0][1]) {
3401 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3402 *pErrorCode=U_PARSE_ERROR;
3403 return;
3404 }
3405
3406 /* get the status of this mapping */
3407 status=*u_skipWhitespace(fields[1][0]);
3408 if(status!='C' && status!='S' && status!='F' && status!='T') {
3409 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3410 *pErrorCode=U_PARSE_ERROR;
3411 return;
3412 }
3413
3414 /* get the mapping */
3415 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3416 if(U_FAILURE(*pErrorCode)) {
3417 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3418 return;
3419 }
3420
3421 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3422 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3423 simple=c;
3424 }
3425
3426 if(c!=(prev=pData->prev)) {
3427 /*
3428 * Test remaining mappings for the previous code point.
3429 * If a turkic folding was not mentioned, then it should fold the same
3430 * as the regular simple case folding.
3431 */
4388f060 3432 UChar prevString[2];
73c04bcf
A
3433 int32_t length;
3434
3435 length=0;
4388f060 3436 U16_APPEND_UNSAFE(prevString, length, prev);
73c04bcf
A
3437 testFold(prev, (~pData->which)&CF_ALL,
3438 prev, pData->prevSimple,
4388f060 3439 prevString, length,
73c04bcf
A
3440 pData->prevFull, pData->prevFullLength);
3441 pData->prev=pData->prevSimple=c;
3442 length=0;
3443 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3444 pData->prevFullLength=length;
3445 pData->which=0;
3446 }
3447
3448 /*
3449 * Turn the status into a bit set of case foldings to test.
3450 * Remember non-Turkic case foldings as defaults for Turkic mode.
3451 */
3452 switch(status) {
3453 case 'C':
3454 which=CF_SIMPLE|CF_FULL;
3455 pData->prevSimple=simple;
3456 u_memcpy(pData->prevFull, full, count);
3457 pData->prevFullLength=count;
3458 break;
3459 case 'S':
3460 which=CF_SIMPLE;
3461 pData->prevSimple=simple;
3462 break;
3463 case 'F':
3464 which=CF_FULL;
3465 u_memcpy(pData->prevFull, full, count);
3466 pData->prevFullLength=count;
3467 break;
3468 case 'T':
3469 which=CF_TURKIC;
3470 break;
3471 default:
3472 which=0;
3473 break; /* won't happen because of test above */
3474 }
3475
3476 testFold(c, which, simple, simple, full, count, full, count);
3477
3478 /* remember which case foldings of c have been tested */
3479 pData->which|=which;
3480
3481 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3482 uset_remove(pData->notSeen, c);
3483}
3484
3485static void
3486TestCaseFolding() {
3487 CaseFoldingData data={ NULL };
3488 char *fields[3][2];
3489 UErrorCode errorCode;
3490
3491 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3492
3493 errorCode=U_ZERO_ERROR;
3494 /* test BMP & plane 1 - nothing interesting above */
3495 data.notSeen=uset_open(0, 0x1ffff);
3496 data.prevFullLength=1; /* length of full case folding of U+0000 */
3497
3498 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3499 if(U_SUCCESS(errorCode)) {
3500 int32_t i, start, end;
3501
3502 /* add a pseudo-last line to finish testing of the actual last one */
3503 fields[0][0]=lastLine;
3504 fields[0][1]=lastLine+6;
3505 fields[1][0]=lastLine+7;
3506 fields[1][1]=lastLine+9;
3507 fields[2][0]=lastLine+10;
3508 fields[2][1]=lastLine+17;
3509 caseFoldingLineFn(&data, fields, 3, &errorCode);
3510
3511 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3512 for(i=0;
3513 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3514 U_SUCCESS(errorCode);
3515 ++i
3516 ) {
3517 do {
3518 testFoldToSelf(start, CF_ALL);
3519 } while(++start<=end);
3520 }
3521 }
3522
3523 uset_close(data.notSeen);
374ca955 3524}