]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/cucdtst.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / test / cintltst / cucdtst.c
CommitLineData
b75a7d8f
A
1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1997-2003, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/********************************************************************************
7*
8* File CUCDTST.C
9*
10* Modification History:
11* Name Description
12* Madhu Katragadda Ported for C API, added tests for string functions
13*********************************************************************************
14*/
15
16#include <string.h>
17#include <math.h>
18#include <stdlib.h>
19
20#include "unicode/utypes.h"
21#include "unicode/uchar.h"
22#include "unicode/putil.h"
23#include "unicode/ustring.h"
24#include "unicode/uloc.h"
25
26#include "cintltst.h"
27#include "uparse.h"
28#include "uprops.h"
29#include "usc_impl.h"
30#include "unormimp.h"
31
32#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
33
34/* prototypes --------------------------------------------------------------- */
35
/* Test functions; each one is registered by addUnicodeTest(). */
static void TestUpperLower(void);
static void TestLetterNumber(void);
static void TestMisc(void);
static void TestPOSIX(void);
static void TestControlPrint(void);
static void TestIdentifier(void);
static void TestUnicodeData(void);
static void TestCodeUnit(void);
static void TestCodePoint(void);
static void TestCharLength(void);
static void TestCharNames(void);
static void TestMirroring(void);
       void TestUScriptCodeAPI(void); /* defined in cucdapi.c */
static void TestUScriptRunAPI(void);
static void TestAdditionalProperties(void);
static void TestNumericProperties(void);
static void TestPropertyNames(void);
static void TestPropertyValues(void);
static void TestConsistency(void);

/* internal methods used */
static int32_t MakeProp(char* str);
static int32_t MakeDir(char* str);
61
62/* test data ---------------------------------------------------------------- */
63
64static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;
65static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
66static const int32_t tagValues[] =
67 {
68 /* Mn */ U_NON_SPACING_MARK,
69 /* Mc */ U_COMBINING_SPACING_MARK,
70 /* Me */ U_ENCLOSING_MARK,
71 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
72 /* Nl */ U_LETTER_NUMBER,
73 /* No */ U_OTHER_NUMBER,
74 /* Zs */ U_SPACE_SEPARATOR,
75 /* Zl */ U_LINE_SEPARATOR,
76 /* Zp */ U_PARAGRAPH_SEPARATOR,
77 /* Cc */ U_CONTROL_CHAR,
78 /* Cf */ U_FORMAT_CHAR,
79 /* Cs */ U_SURROGATE,
80 /* Co */ U_PRIVATE_USE_CHAR,
81 /* Cn */ U_UNASSIGNED,
82 /* Lu */ U_UPPERCASE_LETTER,
83 /* Ll */ U_LOWERCASE_LETTER,
84 /* Lt */ U_TITLECASE_LETTER,
85 /* Lm */ U_MODIFIER_LETTER,
86 /* Lo */ U_OTHER_LETTER,
87 /* Pc */ U_CONNECTOR_PUNCTUATION,
88 /* Pd */ U_DASH_PUNCTUATION,
89 /* Ps */ U_START_PUNCTUATION,
90 /* Pe */ U_END_PUNCTUATION,
91 /* Po */ U_OTHER_PUNCTUATION,
92 /* Sm */ U_MATH_SYMBOL,
93 /* Sc */ U_CURRENCY_SYMBOL,
94 /* Sk */ U_MODIFIER_SYMBOL,
95 /* So */ U_OTHER_SYMBOL,
96 /* Pi */ U_INITIAL_PUNCTUATION,
97 /* Pf */ U_FINAL_PUNCTUATION
98 };
99
/*
 * Short names of the Bidi classes; each entry holds at most 3 characters plus
 * the terminating NUL.
 * NOTE(review): presumably searched by MakeDir() and ordered like the
 * UCharDirection constants - confirm against MakeDir().
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN"
};
121
122void addUnicodeTest(TestNode** root);
123
124void addUnicodeTest(TestNode** root)
125{
126 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
127 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
128 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
129 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
130 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
131 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
132 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
133 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
134 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
135 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
136 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
137 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
138 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
139 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
140 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
141 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
142 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
143 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
144 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
145}
146
147/*==================================================== */
148/* test u_toupper() and u_tolower() */
149/*==================================================== */
150static void TestUpperLower()
151{
152 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
153 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
154 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
155 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
156 int32_t i;
157
158 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
159 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
160
161/*
162Checks LetterLike Symbols which were previously a source of confusion
163[Bertrand A. D. 02/04/98]
164*/
165 for (i=0x2100;i<0x2138;i++)
166 {
167 if(i!=0x2126 && i!=0x212a && i!=0x212b)
168 {
169 if (i != (int)u_tolower(i)) /* itself */
170 log_err("Failed case conversion with itself: U+%04x\n", i);
171 if (i != (int)u_toupper(i))
172 log_err("Failed case conversion with itself: U+%04x\n", i);
173 }
174 }
175
176 for(i=0; i < u_strlen(upper); i++){
177 if(u_tolower(upper[i]) != lower[i]){
178 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
179 }
180 }
181
182 log_verbose("testing upper lower\n");
183 for (i = 0; i < 21; i++) {
184
185 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
186 {
187 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
188 }
189 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
190 {
191 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
192 }
193 else if (upperTest[i] != u_tolower(lowerTest[i]))
194 {
195 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
196 }
197 else if (lowerTest[i] != u_toupper(upperTest[i]))
198 {
199 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
200 }
201 else if (upperTest[i] != u_tolower(upperTest[i]))
202 {
203 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
204 }
205 else if (lowerTest[i] != u_toupper(lowerTest[i]))
206 {
207 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
208 }
209 }
210 log_verbose("done testing upper lower\n");
211
212 log_verbose("testing u_istitle\n");
213 {
214 static const UChar expected[] = {
215 0x1F88,
216 0x1F89,
217 0x1F8A,
218 0x1F8B,
219 0x1F8C,
220 0x1F8D,
221 0x1F8E,
222 0x1F8F,
223 0x1F88,
224 0x1F89,
225 0x1F8A,
226 0x1F8B,
227 0x1F8C,
228 0x1F8D,
229 0x1F8E,
230 0x1F8F,
231 0x1F98,
232 0x1F99,
233 0x1F9A,
234 0x1F9B,
235 0x1F9C,
236 0x1F9D,
237 0x1F9E,
238 0x1F9F,
239 0x1F98,
240 0x1F99,
241 0x1F9A,
242 0x1F9B,
243 0x1F9C,
244 0x1F9D,
245 0x1F9E,
246 0x1F9F,
247 0x1FA8,
248 0x1FA9,
249 0x1FAA,
250 0x1FAB,
251 0x1FAC,
252 0x1FAD,
253 0x1FAE,
254 0x1FAF,
255 0x1FA8,
256 0x1FA9,
257 0x1FAA,
258 0x1FAB,
259 0x1FAC,
260 0x1FAD,
261 0x1FAE,
262 0x1FAF,
263 0x1FBC,
264 0x1FBC,
265 0x1FCC,
266 0x1FCC,
267 0x1FFC,
268 0x1FFC,
269 };
270 int32_t num = sizeof(expected)/sizeof(expected[0]);
271 for(i=0; i<num; i++){
272 if(!u_istitle(expected[i])){
273 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
274 }
275 }
276
277 }
278}
279
280/* compare two sets, which is not easy with the current (ICU 2.4) C API... */
281
282static UBool
283showADiffB(const USet *a, const USet *b,
284 const char *a_name, const char *b_name,
285 UBool expect, UBool diffIsError) {
286 int32_t i, start, end, length;
287 UBool equal;
288 UErrorCode errorCode;
289
290 errorCode=U_ZERO_ERROR;
291 equal=TRUE;
292 i=0;
293 for(;;) {
294 length=uset_getItem(a, i, &start, &end, NULL, 0, &errorCode);
295 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
296 return equal; /* done */
297 }
298 if(U_FAILURE(errorCode)) {
299 log_err("error comparing %s with %s at item %d: %s\n",
300 a_name, b_name, i, u_errorName(errorCode));
301 return FALSE;
302 }
303 if(length!=0) {
304 return equal; /* done with code points, got a string or -1 */
305 }
306
307 if(expect!=uset_containsRange(b, start, end)) {
308 equal=FALSE;
309 while(start<=end) {
310 if(expect!=uset_contains(b, start)) {
311 if(diffIsError) {
312 if(expect) {
313 log_err("error: %s contains U+%04x but %s does not\n", a_name, start, b_name);
314 } else {
315 log_err("error: %s and %s both contain U+%04x but should not intersect\n", a_name, b_name, start);
316 }
317 } else {
318 if(expect) {
319 log_verbose("info: %s contains U+%04x but %s does not\n", a_name, start, b_name);
320 } else {
321 log_verbose("info: %s and %s both contain U+%04x but should not intersect\n", a_name, b_name, start);
322 }
323 }
324 }
325 ++start;
326 }
327 }
328
329 ++i;
330 }
331}
332
333static UBool
334showAMinusB(const USet *a, const USet *b,
335 const char *a_name, const char *b_name,
336 UBool diffIsError) {
337 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
338}
339
340static UBool
341showAIntersectB(const USet *a, const USet *b,
342 const char *a_name, const char *b_name,
343 UBool diffIsError) {
344 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
345}
346
347static UBool
348compareUSets(const USet *a, const USet *b,
349 const char *a_name, const char *b_name,
350 UBool diffIsError) {
351 return
352 showAMinusB(a, b, a_name, b_name, diffIsError) &&
353 showAMinusB(b, a, b_name, a_name, diffIsError);
354}
355
356/* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
357static void TestLetterNumber()
358{
359 UChar i = 0x0000;
360
361 log_verbose("Testing for isalpha\n");
362 for (i = 0x0041; i < 0x005B; i++) {
363 if (!u_isalpha(i))
364 {
365 log_err("Failed isLetter test at %.4X\n", i);
366 }
367 }
368 for (i = 0x0660; i < 0x066A; i++) {
369 if (u_isalpha(i))
370 {
371 log_err("Failed isLetter test with numbers at %.4X\n", i);
372 }
373 }
374
375 log_verbose("Testing for isdigit\n");
376 for (i = 0x0660; i < 0x066A; i++) {
377 if (!u_isdigit(i))
378 {
379 log_verbose("Failed isNumber test at %.4X\n", i);
380 }
381 }
382
383 log_verbose("Testing for isalnum\n");
384 for (i = 0x0041; i < 0x005B; i++) {
385 if (!u_isalnum(i))
386 {
387 log_err("Failed isAlNum test at %.4X\n", i);
388 }
389 }
390 for (i = 0x0660; i < 0x066A; i++) {
391 if (!u_isalnum(i))
392 {
393 log_err("Failed isAlNum test at %.4X\n", i);
394 }
395 }
396
397 {
398 /*
399 * The following checks work only starting from Unicode 4.0.
400 * Check the version number here.
401 */
402 UVersionInfo version;
403 u_getUnicodeVersion(version);
404 if(version[0]<4) {
405 return;
406 }
407 }
408
409 {
410 /*
411 * Sanity check:
412 * Verify that exactly the digit characters have decimal digit values.
413 * This assumption is used in the implementation of u_digit()
414 * (which checks nt=de)
415 * compared with the parallel java.lang.Character.digit()
416 * (which checks Nd).
417 *
418 * This was not true in Unicode 3.2 and earlier.
419 * The following characters had decimal digit values but were No not Nd.
420 * (from DerivedNumericType-3.2.0.txt)
42100B2..00B3 ; decimal # No [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE
42200B9 ; decimal # No SUPERSCRIPT ONE
4232070 ; decimal # No SUPERSCRIPT ZERO
4242074..2079 ; decimal # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE
4252080..2089 ; decimal # No [10] SUBSCRIPT ZERO..SUBSCRIPT NINE
426 */
427 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
428 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
429
430 USet *digits, *decimalValues;
431 UErrorCode errorCode;
432
433 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
434 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
435 errorCode=U_ZERO_ERROR;
436 digits=uset_openPattern(digitsPattern, 6, &errorCode);
437 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
438
439 if(U_SUCCESS(errorCode)) {
440 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
441 }
442
443 uset_close(digits);
444 uset_close(decimalValues);
445 }
446}
447
448/* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
449static void TestMisc()
450{
451 static const UChar sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
452 static const UChar sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
453 static const UChar sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6b };
454 static const UChar sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
455 static const UChar sampleBase[] = {0x0061, 0x0031, 0x03d2};
456 static const UChar sampleNonBase[] = {0x002B, 0x0020, 0x203B};
457/* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
458 static const UChar sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
459 static const UChar sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
460 static const UChar sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
461 static const UChar sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f};
462
463
464 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
465
466 uint32_t mask;
467
468 int32_t i;
469 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
470 UVersionInfo realVersion;
471
472 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
473
474 log_verbose("Testing for isspace and nonspaces\n");
475 for (i = 0; i < 5; i++) {
476 if (!(u_isspace(sampleSpaces[i])) ||
477 (u_isspace(sampleNonSpaces[i])))
478 {
479 log_err("Space char test error : %d or %d \n", (int32_t)sampleSpaces[i], (int32_t)sampleNonSpaces[i]);
480 }
481 if (!(u_isJavaSpaceChar(sampleSpaces[i])) ||
482 (u_isJavaSpaceChar(sampleNonSpaces[i])))
483 {
484 log_err("u_isJavaSpaceChar() test error : %d or %d \n", (int32_t)sampleSpaces[i], (int32_t)sampleNonSpaces[i]);
485 }
486 }
487
488 log_verbose("Testing for isspace and nonspaces\n");
489 for (i = 0; i < 5; i++) {
490 if (!(u_isWhitespace(sampleWhiteSpaces[i])) ||
491 (u_isWhitespace(sampleNonWhiteSpaces[i])))
492 {
493 log_err("White Space char test error : %lx or %lx \n", sampleWhiteSpaces[i], sampleNonWhiteSpaces[i]);
494 }
495 }
496
497 log_verbose("Testing for isdefined\n");
498 for (i = 0; i < 3; i++) {
499 if ((u_isdefined(sampleUndefined[i])) ||
500 !(u_isdefined(sampleDefined[i])))
501 {
502 log_err("Undefined char test error : U+%04x or U+%04x\n", (int32_t)sampleUndefined[i], (int32_t)sampleDefined[i]);
503 }
504 }
505
506 log_verbose("Testing for isbase\n");
507 for (i = 0; i < 3; i++) {
508 if ((u_isbase(sampleNonBase[i])) ||
509 !(u_isbase(sampleBase[i])))
510 {
511 log_err("Non-baseform char test error : U+%04x or U+%04x",(int32_t)sampleNonBase[i], (int32_t)sampleBase[i]);
512 }
513 }
514
515 log_verbose("Testing for isdigit \n");
516 for (i = 0; i < 4; i++) {
517 if ((u_isdigit(sampleDigits[i]) &&
518 (u_charDigitValue(sampleDigits[i])!= sampleDigitValues[i])) ||
519 (u_isdigit(sampleNonDigits[i]))) {
520 log_err("Digit char test error : %lx or %lx\n", sampleDigits[i], sampleNonDigits[i]);
521 }
522 }
523
524 /* Tests the ICU version #*/
525 u_getVersion(realVersion);
526 u_versionToString(realVersion, icuVersion);
527 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min(strlen(icuVersion), strlen(U_ICU_VERSION))) != 0)
528 {
529 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
530 }
531#if defined(ICU_VERSION)
532 /* test only happens where we have configure.in with VERSION - sanity check. */
533 if(strcmp(U_ICU_VERSION, ICU_VERSION))
534 {
535 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
536 }
537#endif
538
539 /* test U_GC_... */
540 if(
541 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
542 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
543 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
544 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
545 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
546 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
547 ) {
548 log_err("error: U_GET_GC_MASK does not work properly\n");
549 }
550
551 mask=0;
552 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
553
554 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
555 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
556 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
557 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
558 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
559
560 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
561 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
562 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
563
564 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
565 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
566 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
567
568 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
569 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
570 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
571
572 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
573 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
574 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
575 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
576
577 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
578 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
579 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
580 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
581 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
582
583 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
584 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
585 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
586 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
587
588 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
589 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
590
591 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
592 log_err("error: problems with U_GC_XX_MASK constants\n");
593 }
594
595 mask=0;
596 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
597 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
598 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
599 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
600 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
601 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
602 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
603
604 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
605 log_err("error: problems with U_GC_Y_MASK constants\n");
606 }
607 {
608 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
609 for(i=0; i<10; i++){
610 if(digit[i]!=u_forDigit(i,10)){
611 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
612 }
613 }
614 }
615
616 /* test u_digit() */
617 {
618 static const struct {
619 UChar32 c;
620 int8_t radix, value;
621 } data[]={
622 /* base 16 */
623 { 0x0031, 16, 1 },
624 { 0x0038, 16, 8 },
625 { 0x0043, 16, 12 },
626 { 0x0066, 16, 15 },
627 { 0x00e4, 16, -1 },
628 { 0x0662, 16, 2 },
629 { 0x06f5, 16, 5 },
630 { 0xff13, 16, 3 },
631 { 0xff41, 16, 10 },
632
633 /* base 8 */
634 { 0x0031, 8, 1 },
635 { 0x0038, 8, -1 },
636 { 0x0043, 8, -1 },
637 { 0x0066, 8, -1 },
638 { 0x00e4, 8, -1 },
639 { 0x0662, 8, 2 },
640 { 0x06f5, 8, 5 },
641 { 0xff13, 8, 3 },
642 { 0xff41, 8, -1 },
643
644 /* base 36 */
645 { 0x5a, 36, 35 },
646 { 0x7a, 36, 35 },
647 { 0xff3a, 36, 35 },
648 { 0xff5a, 36, 35 },
649
650 /* wrong radix values */
651 { 0x0031, 1, -1 },
652 { 0xff3a, 37, -1 }
653 };
654
655 for(i=0; i<LENGTHOF(data); ++i) {
656 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
657 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
658 data[i].c,
659 data[i].radix,
660 u_digit(data[i].c, data[i].radix),
661 data[i].value);
662 }
663 }
664 }
665}
666
667/* test C/POSIX-style functions --------------------------------------------- */
668
669/* bit flags */
670#define ISAL 1
671#define ISLO 2
672#define ISUP 4
673
674#define ISDI 8
675#define ISXD 0x10
676
677#define ISAN 0x20
678
679#define ISPU 0x40
680#define ISGR 0x80
681#define ISPR 0x100
682
683#define ISSP 0x200
684#define ISBL 0x400
685#define ISCN 0x800
686
687/* C/POSIX-style functions, in the same order as the bit flags */
688typedef UBool IsPOSIXClass(UChar32 c);
689
690static const struct {
691 IsPOSIXClass *fn;
692 const char *name;
693} posixClasses[]={
694 { u_isalpha, "isalpha" },
695 { u_islower, "islower" },
696 { u_isupper, "isupper" },
697 { u_isdigit, "isdigit" },
698 { u_isxdigit, "isxdigit" },
699 { u_isalnum, "isalnum" },
700 { u_ispunct, "ispunct" },
701 { u_isgraph, "isgraph" },
702 { u_isprint, "isprint" },
703 { u_isspace, "isspace" },
704 { u_isblank, "isblank" },
705 { u_iscntrl, "iscntrl" }
706};
707
708static const struct {
709 UChar32 c;
710 uint32_t posixResults;
711} posixData[]={
712 { 0x0008, ISCN }, /* backspace */
713 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
714 { 0x000a, ISSP| ISCN }, /* LF */
715 { 0x000c, ISSP| ISCN }, /* FF */
716 { 0x000d, ISSP| ISCN }, /* CR */
717 { 0x0020, ISPR|ISSP|ISBL }, /* space */
718 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
719 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
720 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
721 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
722 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
723 { 0x007b, ISPU|ISGR|ISPR }, /* { */
724 { 0x0085, ISSP| ISCN }, /* NEL */
725 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
726 { 0x00a4, ISGR|ISPR }, /* currency sign */
727 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
728 { 0x0300, ISGR|ISPR }, /* combining grave */
729 { 0x0600, ISCN }, /* arabic number sign */
730 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
731 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
732 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
733 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
734 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
735 { 0x200b, ISPR|ISSP }, /* ZWSP */
736 { 0x200e, ISCN }, /* LRM */
737 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
738 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
739 { 0x20ac, ISGR|ISPR }, /* Euro */
740 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
741 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
742 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
743 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
744 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
745};
746
747static void
748TestPOSIX() {
749 uint32_t mask;
750 int32_t cl, i;
751 UBool expect;
752
753 mask=1;
754 for(cl=0; cl<12; ++cl) {
755 for(i=0; i<LENGTHOF(posixData); ++i) {
756 expect=(UBool)((posixData[i].posixResults&mask)!=0);
757 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
758 log_err("u_%s(U+%04x)=%s is wrong\n",
759 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
760 }
761 }
762 mask<<=1;
763 }
764}
765
766/* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
767static void TestControlPrint()
768{
769 const UChar sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
770 const UChar sampleNonControl[] = {0x61, 0x0031, 0x00e2};
771 const UChar samplePrintable[] = {0x0042, 0x005f, 0x2014};
772 const UChar sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
773 UChar32 c;
774 int i;
775
776 log_verbose("Testing for iscontrol\n");
777 for (i = 0; i < LENGTHOF(sampleControl); i++) {
778 if (!u_iscntrl(sampleControl[i]))
779 {
780 log_err("Control char test error : U+%04x should be control but is not\n", (int32_t)sampleControl[i]);
781 }
782 }
783
784 log_verbose("Testing for !iscontrol\n");
785 for (i = 0; i < LENGTHOF(sampleNonControl); i++) {
786 if (u_iscntrl(sampleNonControl[i]))
787 {
788 log_err("Control char test error : U+%04x should not be control but is\n", (int32_t)sampleNonControl[i]);
789 }
790 }
791
792 log_verbose("testing for isprintable\n");
793 for (i = 0; i < 3; i++) {
794 if (!u_isprint(samplePrintable[i]))
795 {
796 log_err("Printable char test error : U+%04x should be printable but is not\n", (int32_t)samplePrintable[i]);
797 }
798 if (u_isprint(sampleNonPrintable[i]))
799 {
800 log_err("Printable char test error : U+%04x should not be printable but is\n", (int32_t)sampleNonPrintable[i]);
801 }
802 }
803
804 /* test all ISO 8 controls */
805 for(c=0; c<=0x9f; ++c) {
806 if(c==0x20) {
807 /* skip ASCII graphic characters and continue with DEL */
808 c=0x7f;
809 }
810 if(!u_iscntrl(c)) {
811 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
812 }
813 if(!u_isISOControl(c)) {
814 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
815 }
816 if(u_isprint(c)) {
817 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
818 }
819 }
820
821 /* test all Latin-1 graphic characters */
822 for(c=0x20; c<=0xff; ++c) {
823 if(c==0x7f) {
824 c=0xa0;
825 } else if(c==0xad) {
826 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
827 ++c;
828 }
829 if(!u_isprint(c)) {
830 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
831 }
832 }
833}
834
835/* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
836static void TestIdentifier()
837{
838 const UChar sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
839 const UChar sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
840 const UChar sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
841 const UChar sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
842 const UChar sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
843 const UChar sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
844 const UChar sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
845 const UChar sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
846 const UChar sampleIDIgnore[] = {0x0006, 0x0010, 0x206b};
847 const UChar sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
848
849 int i;
850
851 log_verbose("Testing sampleJavaID start \n");
852 for (i = 0; i < 3; i++) {
853 if (!(u_isJavaIDStart(sampleJavaIDStart[i])) ||
854 (u_isJavaIDStart(sampleNonJavaIDStart[i])))
855 log_err("Java ID Start char test error : %lx or %lx\n",
856 sampleJavaIDStart[i], sampleNonJavaIDStart[i]);
857 }
858
859 log_verbose("Testing sampleJavaID part \n");
860 for (i = 0; i < 3; i++) {
861 if (!(u_isJavaIDPart(sampleJavaIDPart[i])) ||
862 (u_isJavaIDPart(sampleNonJavaIDPart[i])))
863 log_err("Java ID Part char test error : %lx or %lx\n",
864 sampleJavaIDPart[i], sampleNonJavaIDPart[i]);
865 }
866
867 log_verbose("Testing sampleUnicodeID start \n");
868 for (i = 0; i < 3; i++) {
869 /* T_test_logln_ustr((int32_t)i); */
870 if (!(u_isIDStart(sampleUnicodeIDStart[i])) ||
871 (u_isIDStart(sampleNonUnicodeIDStart[i])))
872 {
873 log_err("Unicode ID Start char test error : %lx or %lx\n", sampleUnicodeIDStart[i],
874 sampleNonUnicodeIDStart[i]);
875 }
876 }
877
878 log_verbose("Testing sample unicode ID part \n");
879 for (i = 2; i < 3; i++) { /* nos *** starts with 2 instead of 0, until clarified */
880 /* T_test_logln_ustr((int32_t)i); */
881 if (!(u_isIDPart(sampleUnicodeIDPart[i])) ||
882 (u_isIDPart(sampleNonUnicodeIDPart[i])))
883 {
884 log_err("Unicode ID Part char test error : %lx or %lx", sampleUnicodeIDPart[i], sampleNonUnicodeIDPart[i]);
885 }
886 }
887
888 log_verbose("Testing sampleId ignore\n");
889 for (i = 0; i < 3; i++) {
890 /*T_test_logln_ustr((int32_t)i); */
891 if (!(u_isIDIgnorable(sampleIDIgnore[i])) ||
892 (u_isIDIgnorable(sampleNonIDIgnore[i])))
893 {
894 log_err("ID ignorable char test error : U+%04x or U+%04x\n", sampleIDIgnore[i], sampleNonIDIgnore[i]);
895 }
896 }
897}
898
899/* for each line of UnicodeData.txt, check some of the properties */
900/*
901 * ### TODO
902 * This test fails incorrectly if the First or Last code point of a repetitive area
903 * is overridden, which is allowed and is encouraged for the PUAs.
904 * Currently, this means that both area First/Last and override lines are
905 * tested against the properties from the API,
906 * and the area boundary will not match and cause an error.
907 *
908 * This function should detect area boundaries and skip them for the test of individual
909 * code points' properties.
910 * Then it should check that the areas contain all the same properties except where overridden.
911 * For this, it would have had to set a flag for which code points were listed explicitly.
912 */
913static void U_CALLCONV
914unicodeDataLineFn(void *context,
915 char *fields[][2], int32_t fieldCount,
916 UErrorCode *pErrorCode)
917{
918 char buffer[100];
919 char *end;
920 uint32_t value;
921 UChar32 c;
922 int32_t i;
923 int8_t type;
924
925 /* get the character code, field 0 */
926 c=strtoul(fields[0][0], &end, 16);
927 if(end<=fields[0][0] || end!=fields[0][1]) {
928 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
929 return;
930 }
931 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
932 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
933 return;
934 }
935
936 /* get general category, field 2 */
937 *fields[2][1]=0;
938 type = (int8_t)tagValues[MakeProp(fields[2][0])];
939 if(u_charType(c)!=type) {
940 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
941 }
942 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
943 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
944 }
945
946 /* get canonical combining class, field 3 */
947 value=strtoul(fields[3][0], &end, 10);
948 if(end<=fields[3][0] || end!=fields[3][1]) {
949 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
950 return;
951 }
952 if(value>255) {
953 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
954 return;
955 }
956#if !UCONFIG_NO_NORMALIZATION
957 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
958 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
959 }
960#endif
961
962 /* get BiDi category, field 4 */
963 *fields[4][1]=0;
964 i=MakeDir(fields[4][0]);
965 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
966 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
967 }
968
969 /* get ISO Comment, field 11 */
970 *fields[11][1]=0;
971 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
972 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
973 log_err("error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
974 c, u_errorName(*pErrorCode),
975 U_FAILURE(*pErrorCode) ? buffer : "[error]",
976 fields[11][0]);
977 }
978
979 /* get uppercase mapping, field 12 */
980 if(fields[12][0]!=fields[12][1]) {
981 value=strtoul(fields[12][0], &end, 16);
982 if(end!=fields[12][1]) {
983 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
984 return;
985 }
986 if((UChar32)value!=u_toupper(c)) {
987 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
988 }
989 } else {
990 /* no case mapping: the API must map the code point to itself */
991 if(c!=u_toupper(c)) {
992 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
993 }
994 }
995
996 /* get lowercase mapping, field 13 */
997 if(fields[13][0]!=fields[13][1]) {
998 value=strtoul(fields[13][0], &end, 16);
999 if(end!=fields[13][1]) {
1000 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1001 return;
1002 }
1003 if((UChar32)value!=u_tolower(c)) {
1004 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1005 }
1006 } else {
1007 /* no case mapping: the API must map the code point to itself */
1008 if(c!=u_tolower(c)) {
1009 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1010 }
1011 }
1012
1013 /* get titlecase mapping, field 14 */
1014 if(fields[14][0]!=fields[14][1]) {
1015 value=strtoul(fields[14][0], &end, 16);
1016 if(end!=fields[14][1]) {
1017 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1018 return;
1019 }
1020 if((UChar32)value!=u_totitle(c)) {
1021 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1022 }
1023 } else {
1024 /* no case mapping: the API must map the code point to itself */
1025 if(c!=u_totitle(c)) {
1026 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1027 }
1028 }
1029}
1030
1031static UBool U_CALLCONV
1032enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1033 static const UChar32 test[][2]={
1034 {0x41, U_UPPERCASE_LETTER},
1035 {0x308, U_NON_SPACING_MARK},
1036 {0xfffe, U_GENERAL_OTHER_TYPES},
1037 {0xe0041, U_FORMAT_CHAR},
1038 {0xeffff, U_UNASSIGNED}
1039 };
1040
1041 /* default Bidi classes for unassigned code points */
1042 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1043 { 0x0590, U_LEFT_TO_RIGHT },
1044 { 0x0600, U_RIGHT_TO_LEFT },
1045 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1046 { 0x0900, U_RIGHT_TO_LEFT },
1047 { 0xFB1D, U_LEFT_TO_RIGHT },
1048 { 0xFB50, U_RIGHT_TO_LEFT },
1049 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1050 { 0xFE70, U_LEFT_TO_RIGHT },
1051 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1052 { 0x10800, U_LEFT_TO_RIGHT },
1053 { 0x11000, U_RIGHT_TO_LEFT },
1054 { 0x110000, U_LEFT_TO_RIGHT }
1055 };
1056
1057 UChar32 c;
1058 int i, count;
1059
1060 if(0!=strcmp((const char *)context, "a1")) {
1061 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1062 return FALSE;
1063 }
1064
1065 count=sizeof(test)/sizeof(test[0]);
1066 for(i=0; i<count; ++i) {
1067 if(start<=test[i][0] && test[i][0]<limit) {
1068 if(type!=(UCharCategory)test[i][1]) {
1069 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1070 start, limit, (long)type, test[i][0], test[i][1]);
1071 }
1072 /* stop at the range that includes the last test code point */
1073 return i==(count-1) ? FALSE : TRUE;
1074 }
1075 }
1076
1077 if(start>test[count-1][0]) {
1078 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1079 start, limit, (long)type);
1080 return FALSE;
1081 }
1082
1083 /*
1084 * LineBreak.txt specifies:
1085 * # - Assigned characters that are not listed explicitly are given the value
1086 * # "AL".
1087 * # - Unassigned characters are given the value "XX".
1088 *
1089 * PUA characters are listed explicitly with "XX".
1090 * Verify that no assigned character has "XX".
1091 */
1092 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1093 c=start;
1094 while(c<limit) {
1095 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1096 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1097 }
1098 ++c;
1099 }
1100 }
1101
1102 /*
1103 * Verify default Bidi classes.
1104 * See table 3-7 "Bidirectional Character Types" in UAX #9.
1105 * http://www.unicode.org/reports/tr9/
1106 *
1107 * See also DerivedBidiClass.txt for Cn code points!
1108 */
1109 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1110 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1111 c=start;
1112 for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) {
1113 if((int32_t)c<defaultBidi[i][0]) {
1114 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1115 if( u_charDirection(c)!=(UCharDirection)defaultBidi[i][1] ||
1116 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=defaultBidi[i][1]
1117 ) {
1118 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1119 c, dirStrings[u_charDirection(c)], dirStrings[defaultBidi[i][1]]);
1120 }
1121 ++c;
1122 }
1123 }
1124 }
1125 }
1126
1127 return TRUE;
1128}
1129
1130/* tests for several properties */
1131static void TestUnicodeData()
1132{
1133 char newPath[256];
1134 char backupPath[256];
1135 UVersionInfo expectVersionArray;
1136 UVersionInfo versionArray;
1137 char *fields[15][2];
1138 UErrorCode errorCode;
1139 UChar32 c;
1140 int8_t type;
1141
1142 /* Look inside ICU_DATA first */
1143 strcpy(newPath, u_getDataDirectory());
1144 strcat(newPath, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "UnicodeData.txt");
1145
1146 /* As a fallback, try to guess where the source data was located
1147 * at the time ICU was built, and look there.
1148 */
1149 strcpy(backupPath, ctest_dataSrcDir());
1150 strcat(backupPath, U_FILE_SEP_STRING);
1151 strcat(backupPath, "unidata" U_FILE_SEP_STRING "UnicodeData.txt");
1152
1153 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1154 u_getUnicodeVersion(versionArray);
1155 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1156 {
1157 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1158 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1159 }
1160
1161#if defined(ICU_UNICODE_VERSION)
1162 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1163 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1164 {
1165 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1166 }
1167#endif
1168
1169 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1170 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1171 }
1172
1173 errorCode=U_ZERO_ERROR;
1174 u_parseDelimitedFile(newPath, ';', fields, 15, unicodeDataLineFn, NULL, &errorCode);
1175 if(errorCode==U_FILE_ACCESS_ERROR) {
1176 errorCode=U_ZERO_ERROR;
1177 u_parseDelimitedFile(backupPath, ';', fields, 15, unicodeDataLineFn, NULL, &errorCode);
1178 }
1179 if(U_FAILURE(errorCode)) {
1180 log_err("error parsing UnicodeData.txt: %s\n", u_errorName(errorCode));
1181 return; /* if we couldn't parse UnicodeData.txt, we should return */
1182 }
1183
1184 /* sanity check on repeated properties */
1185 for(c=0xfffe; c<=0x10ffff;) {
1186 type=u_charType(c);
1187 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1188 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1189 }
1190 if(type!=U_UNASSIGNED) {
1191 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1192 }
1193 if((c&0xffff)==0xfffe) {
1194 ++c;
1195 } else {
1196 c+=0xffff;
1197 }
1198 }
1199
1200 /* test that PUA is not "unassigned" */
1201 for(c=0xe000; c<=0x10fffd;) {
1202 type=u_charType(c);
1203 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1204 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1205 }
1206 if(type==U_UNASSIGNED) {
1207 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1208 } else if(type!=U_PRIVATE_USE_CHAR) {
1209 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1210 }
1211 if(c==0xf8ff) {
1212 c=0xf0000;
1213 } else if(c==0xffffd) {
1214 c=0x100000;
1215 } else {
1216 ++c;
1217 }
1218 }
1219
1220 /* test u_enumCharTypes() */
1221 u_enumCharTypes(enumTypeRange, "a1");
1222}
1223
1224static void TestCodeUnit(){
1225 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1226
1227 int32_t i;
1228
1229 for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){
1230 UChar c=codeunit[i];
1231 if(i<4){
1232 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1233 log_err("ERROR: U+%04x is a single", c);
1234 }
1235
1236 }
1237 if(i >= 4 && i< 8){
1238 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1239 log_err("ERROR: U+%04x is a first surrogate", c);
1240 }
1241 }
1242 if(i >= 8 && i< 12){
1243 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1244 log_err("ERROR: U+%04x is a second surrogate", c);
1245 }
1246 }
1247 }
1248
1249}
1250
1251static void TestCodePoint(){
1252 const UChar32 codePoint[]={
1253 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1254 0xd800,
1255 0xdbff,
1256 0xdc00,
1257 0xdfff,
1258 0xdc04,
1259 0xd821,
1260 /*not a surrogate, valid, isUnicodeChar , not Error*/
1261 0x20ac,
1262 0xd7ff,
1263 0xe000,
1264 0xe123,
1265 0x0061,
1266 0xe065,
1267 0x20402,
1268 0x24506,
1269 0x23456,
1270 0x20402,
1271 0x10402,
1272 0x23456,
1273 /*not a surrogate, not valid, isUnicodeChar, isError */
1274 0x0015,
1275 0x009f,
1276 /*not a surrogate, not valid, not isUnicodeChar, isError */
1277 0xffff,
1278 0xfffe,
1279 };
1280 int32_t i;
1281 for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){
1282 UChar32 c=codePoint[i];
1283 if(i<6){
1284 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1285 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1286 }
1287 if(UTF_IS_VALID(c)){
1288 log_err("ERROR: isValid() failed for U+%04x\n", c);
1289 }
1290 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1291 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1292 }
1293 if(UTF_IS_ERROR(c)){
1294 log_err("ERROR: isError() failed for U+%04x\n", c);
1295 }
1296 }else if(i >=6 && i<18){
1297 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1298 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1299 }
1300 if(!UTF_IS_VALID(c)){
1301 log_err("ERROR: isValid() failed for U+%04x\n", c);
1302 }
1303 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1304 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1305 }
1306 if(UTF_IS_ERROR(c)){
1307 log_err("ERROR: isError() failed for U+%04x\n", c);
1308 }
1309 }else if(i >=18 && i<20){
1310 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1311 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1312 }
1313 if(UTF_IS_VALID(c)){
1314 log_err("ERROR: isValid() failed for U+%04x\n", c);
1315 }
1316 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1317 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1318 }
1319 if(!UTF_IS_ERROR(c)){
1320 log_err("ERROR: isError() failed for U+%04x\n", c);
1321 }
1322 }
1323 else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
1324 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1325 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1326 }
1327 if(UTF_IS_VALID(c)){
1328 log_err("ERROR: isValid() failed for U+%04x\n", c);
1329 }
1330 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1331 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1332 }
1333 if(!UTF_IS_ERROR(c)){
1334 log_err("ERROR: isError() failed for U+%04x\n", c);
1335 }
1336 }
1337 }
1338
1339}
1340
1341static void TestCharLength()
1342{
1343 const int32_t codepoint[]={
1344 1, 0x0061,
1345 1, 0xe065,
1346 1, 0x20ac,
1347 2, 0x20402,
1348 2, 0x23456,
1349 2, 0x24506,
1350 2, 0x20402,
1351 2, 0x10402,
1352 1, 0xd7ff,
1353 1, 0xe000
1354 };
1355
1356 int32_t i;
1357 UBool multiple;
1358 for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
1359 UChar32 c=codepoint[i+1];
1360 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1361 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], UTF_CHAR_LENGTH(c));
1362 }
1363 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1364 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1365 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1366 }
1367 }
1368}
1369
1370/*internal functions ----*/
1371static int32_t MakeProp(char* str)
1372{
1373 int32_t result = 0;
1374 char* matchPosition =0;
1375
1376 matchPosition = strstr(tagStrings, str);
1377 if (matchPosition == 0)
1378 {
1379 log_err("unrecognized type letter ");
1380 log_err(str);
1381 }
1382 else result = ((matchPosition - tagStrings) / 2);
1383 return result;
1384}
1385
1386static int32_t MakeDir(char* str)
1387{
1388 int32_t pos = 0;
1389 for (pos = 0; pos < 19; pos++) {
1390 if (strcmp(str, dirStrings[pos]) == 0) {
1391 return pos;
1392 }
1393 }
1394 return -1;
1395}
1396
1397/* test u_charName() -------------------------------------------------------- */
1398
/*
 * Sample code points with their expected names:
 * name is the modern Unicode name, oldName the Unicode 1.0 name and
 * extName the extended name; "" means that no such name is expected.
 */
static const struct {
    uint32_t code;
    const char *name, *oldName, *extName;
} names[]={
    { 0x0061,
      "LATIN SMALL LETTER A",
      "",
      "LATIN SMALL LETTER A" },
    { 0x0284,
      "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
      "LATIN SMALL LETTER DOTLESS J BAR HOOK",
      "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" },
    { 0x3401,
      "CJK UNIFIED IDEOGRAPH-3401",
      "",
      "CJK UNIFIED IDEOGRAPH-3401" },
    { 0x7fed,
      "CJK UNIFIED IDEOGRAPH-7FED",
      "",
      "CJK UNIFIED IDEOGRAPH-7FED" },
    { 0xac00,
      "HANGUL SYLLABLE GA",
      "",
      "HANGUL SYLLABLE GA" },
    { 0xd7a3,
      "HANGUL SYLLABLE HIH",
      "",
      "HANGUL SYLLABLE HIH" },
    { 0xd800,
      "",
      "",
      "<lead surrogate-D800>" },
    { 0xdc00,
      "",
      "",
      "<trail surrogate-DC00>" },
    { 0xff08,
      "FULLWIDTH LEFT PARENTHESIS",
      "FULLWIDTH OPENING PARENTHESIS",
      "FULLWIDTH LEFT PARENTHESIS" },
    { 0xffe5,
      "FULLWIDTH YEN SIGN",
      "",
      "FULLWIDTH YEN SIGN" },
    { 0xffff,
      "",
      "",
      "<noncharacter-FFFF>" },
    { 0x23456,
      "CJK UNIFIED IDEOGRAPH-23456",
      "",
      "CJK UNIFIED IDEOGRAPH-23456" }
};
1416
1417static UBool
1418enumCharNamesFn(void *context,
1419 UChar32 code, UCharNameChoice nameChoice,
1420 const char *name, int32_t length) {
1421 int32_t *pCount=(int32_t *)context;
1422 int i;
1423
1424 if(length<=0 || length!=(int32_t)strlen(name)) {
1425 /* should not be called with an empty string or invalid length */
1426 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1427 return TRUE;
1428 }
1429
1430 ++*pCount;
1431 for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) {
1432 if(code==(UChar32)names[i].code) {
1433 switch (nameChoice) {
1434 case U_EXTENDED_CHAR_NAME:
1435 if(0!=strcmp(name, names[i].extName)) {
1436 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1437 }
1438 break;
1439 case U_UNICODE_CHAR_NAME:
1440 if(0!=strcmp(name, names[i].name)) {
1441 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1442 }
1443 break;
1444 case U_UNICODE_10_CHAR_NAME:
1445 if(names[i].oldName[0]==0 || 0!=strcmp(name, names[i].oldName)) {
1446 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, names[i].oldName);
1447 }
1448 break;
1449 case U_CHAR_NAME_CHOICE_COUNT:
1450 break;
1451 }
1452 break;
1453 }
1454 }
1455 return TRUE;
1456}
1457
1458struct enumExtCharNamesContext {
1459 uint32_t length;
1460 int32_t last;
1461};
1462
1463static UBool
1464enumExtCharNamesFn(void *context,
1465 UChar32 code, UCharNameChoice nameChoice,
1466 const char *name, int32_t length) {
1467 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1468
1469 if (ecncp->last != (int32_t) code - 1) {
1470 if (ecncp->last < 0) {
1471 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1472 } else {
1473 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1474 }
1475 }
1476 ecncp->last = (int32_t) code;
1477
1478 if (!*name) {
1479 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1480 }
1481
1482 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1483}
1484
1485/**
1486 * This can be made more efficient by moving it into putil.c and having
1487 * it directly access the ebcdic translation tables.
1488 * TODO: If we get this method in putil.c, then delete it from here.
1489 */
1490static UChar
1491u_charToUChar(char c) {
1492 UChar uc;
1493 u_charsToUChars(&c, &uc, 1);
1494 return uc;
1495}
1496
1497static void
1498TestCharNames() {
1499 static char name[80];
1500 UErrorCode errorCode=U_ZERO_ERROR;
1501 struct enumExtCharNamesContext extContext;
1502 int32_t length;
1503 UChar32 c;
1504 int32_t i;
1505
1506 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1507 length=uprv_getMaxCharNameLength();
1508 if(length==0) {
1509 /* no names data available */
1510 return;
1511 }
1512 if(length<83) { /* Unicode 3.2 max char name length */
1513 log_err("uprv_getMaxCharNameLength()=%d is too short");
1514 }
1515 /* ### TODO same tests for max ISO comment length as for max name length */
1516
1517 log_verbose("Testing u_charName()\n");
1518 for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) {
1519 /* modern Unicode character name */
1520 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1521 if(U_FAILURE(errorCode)) {
1522 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1523 return;
1524 }
1525 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1526 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1527 }
1528
1529 /* find the modern name */
1530 if (*names[i].name) {
1531 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1532 if(U_FAILURE(errorCode)) {
1533 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1534 return;
1535 }
1536 if(c!=(UChar32)names[i].code) {
1537 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1538 }
1539 }
1540
1541 /* Unicode 1.0 character name */
1542 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1543 if(U_FAILURE(errorCode)) {
1544 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1545 return;
1546 }
1547 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1548 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1549 }
1550
1551 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1552 if(names[i].oldName[0]!=0 /* && length>0 */) {
1553 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1554 if(U_FAILURE(errorCode)) {
1555 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1556 return;
1557 }
1558 if(c!=(UChar32)names[i].code) {
1559 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1560 }
1561 }
1562 }
1563
1564 /* test u_enumCharNames() */
1565 length=0;
1566 errorCode=U_ZERO_ERROR;
1567 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1568 if(U_FAILURE(errorCode) || length<94140) {
1569 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1570 }
1571
1572 extContext.length = 0;
1573 extContext.last = -1;
1574 errorCode=U_ZERO_ERROR;
1575 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1576 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1577 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1578 }
1579
1580 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1581 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1582 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1583 }
1584
1585 /* Test getCharNameCharacters */
1586 if(!QUICK) {
1587 enum { BUFSIZE = 256 };
1588 UErrorCode ec = U_ZERO_ERROR;
1589 char buf[BUFSIZE];
1590 int32_t maxLength;
1591 UChar32 cp;
1592 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1593 int32_t l1, l2;
1594 UBool map[256];
1595 UBool ok;
1596
1597 USet* set = uset_open(1, 0); /* empty set */
1598 USet* dumb = uset_open(1, 0); /* empty set */
1599
1600 /*
1601 * uprv_getCharNameCharacters() will likely return more lowercase
1602 * letters than actual character names contain because
1603 * it includes all the characters in lowercased names of
1604 * general categories, for the full possible set of extended names.
1605 */
1606 uprv_getCharNameCharacters(set);
1607
1608 /* build set the dumb (but sure-fire) way */
1609 for (i=0; i<256; ++i)
1610 map[i] = FALSE;
1611
1612 maxLength=0;
1613 for (cp=0; cp<0x110000; ++cp) {
1614 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1615 buf, BUFSIZE, &ec);
1616 if (U_FAILURE(ec)) {
1617 log_err("FAIL: u_charName failed when it shouldn't\n");
1618 uset_close(set);
1619 uset_close(dumb);
1620 return;
1621 }
1622 if(len>maxLength) {
1623 maxLength=len;
1624 }
1625
1626 for (i=0; i<len; ++i) {
1627 if (!map[(uint8_t) buf[i]]) {
1628 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1629 map[(uint8_t) buf[i]] = TRUE;
1630 }
1631 }
1632 }
1633
1634 length=uprv_getMaxCharNameLength();
1635 if(length!=maxLength) {
1636 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1637 length, maxLength);
1638 }
1639
1640 /* compare the sets. Where is my uset_equals?!! */
1641 ok=TRUE;
1642 for(i=0; i<256; ++i) {
1643 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1644 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1645 /* ignore lowercase a-z that are in set but not in dumb */
1646 ok=TRUE;
1647 } else {
1648 ok=FALSE;
1649 break;
1650 }
1651 }
1652 }
1653
1654 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1655 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1656 if (U_FAILURE(ec)) {
1657 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1658 uset_close(set);
1659 uset_close(dumb);
1660 return;
1661 }
1662
1663 if (l1 >= BUFSIZE) {
1664 l1 = BUFSIZE-1;
1665 pat[l1] = 0;
1666 }
1667 if (l2 >= BUFSIZE) {
1668 l2 = BUFSIZE-1;
1669 dumbPat[l2] = 0;
1670 }
1671
1672 if (!ok) {
1673 char c1[256], c2[256];
1674 u_UCharsToChars(pat, c1, l1);
1675 u_UCharsToChars(dumbPat, c2, l2);
1676 c1[l1] = c2[l2] = 0;
1677 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1678 c1, c2);
1679 } else {
1680 char c1[256];
1681 u_UCharsToChars(pat, c1, l1);
1682 c1[l1] = 0;
1683 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", c1);
1684 }
1685
1686 uset_close(set);
1687 uset_close(dumb);
1688 }
1689
1690 /* ### TODO: test error cases and other interesting things */
1691}
1692
1693/* test u_isMirrored() and u_charMirror() ----------------------------------- */
1694
1695static void
1696TestMirroring() {
1697 log_verbose("Testing u_isMirrored()\n");
1698 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1699 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1700 )
1701 ) {
1702 log_err("u_isMirrored() does not work correctly\n");
1703 }
1704
1705 log_verbose("Testing u_charMirror()\n");
1706 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
1707 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab
1708 )
1709 ) {
1710 log_err("u_charMirror() does not work correctly\n");
1711 }
1712}
1713
1714
1715struct RunTestData
1716{
1717 const char *runText;
1718 UScriptCode runCode;
1719};
1720
1721typedef struct RunTestData RunTestData;
1722
1723static void
1724CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
1725 const char *prefix)
1726{
1727 int32_t run, runStart, runLimit;
1728 UScriptCode runCode;
1729
1730 /* iterate over all the runs */
1731 run = 0;
1732 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
1733 if (runStart != runStarts[run]) {
1734 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
1735 prefix, run, runStarts[run], runStart);
1736 }
1737
1738 if (runLimit != runStarts[run + 1]) {
1739 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
1740 prefix, run, runStarts[run + 1], runLimit);
1741 }
1742
1743 if (runCode != testData[run].runCode) {
1744 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
1745 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
1746 }
1747
1748 run += 1;
1749
1750 /* stop when we've seen all the runs we expect to see */
1751 if (run >= nRuns) {
1752 break;
1753 }
1754 }
1755
1756 /* Complain if we didn't see then number of runs we expected */
1757 if (run != nRuns) {
1758 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
1759 }
1760}
1761
1762static void
1763TestUScriptRunAPI()
1764{
1765 static const RunTestData testData[] = {
1766 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
1767 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
1768 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
1769 {"English (", USCRIPT_LATIN},
1770 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
1771 {") ", USCRIPT_LATIN},
1772 {"\\u6F22\\u5B75", USCRIPT_HAN},
1773 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
1774 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
1775 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
1776 };
1777
1778 int32_t nTestRuns = sizeof testData / sizeof testData[0];
1779
1780 UChar testString[1024];
1781 int32_t runStarts[256];
1782
1783 int32_t run, stringLimit;
1784 UScriptRun *scriptRun = NULL;
1785 UErrorCode err;
1786
1787 /*
1788 * Fill in the test string and the runStarts array.
1789 */
1790 stringLimit = 0;
1791 for (run = 0; run < nTestRuns; run += 1) {
1792 runStarts[run] = stringLimit;
1793 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
1794 /*stringLimit -= 1;*/
1795 }
1796
1797 /* The limit of the last run */
1798 runStarts[nTestRuns] = stringLimit;
1799
1800 /*
1801 * Make sure that calling uscript_OpenRun with a NULL text pointer
1802 * and a non-zero text length returns the correct error.
1803 */
1804 err = U_ZERO_ERROR;
1805 scriptRun = uscript_openRun(NULL, stringLimit, &err);
1806
1807 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1808 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1809 }
1810
1811 if (scriptRun != NULL) {
1812 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
1813 uscript_closeRun(scriptRun);
1814 }
1815
1816 /*
1817 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
1818 * and a zero text length returns the correct error.
1819 */
1820 err = U_ZERO_ERROR;
1821 scriptRun = uscript_openRun(testString, 0, &err);
1822
1823 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1824 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1825 }
1826
1827 if (scriptRun != NULL) {
1828 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
1829 uscript_closeRun(scriptRun);
1830 }
1831
1832 /*
1833 * Make sure that calling uscript_openRun with a NULL text pointer
1834 * and a zero text length doesn't return an error.
1835 */
1836 err = U_ZERO_ERROR;
1837 scriptRun = uscript_openRun(NULL, 0, &err);
1838
1839 if (U_FAILURE(err)) {
1840 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
1841 }
1842
1843 /* Make sure that the empty iterator doesn't find any runs */
1844 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
1845 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
1846 }
1847
1848 /*
1849 * Make sure that calling uscript_setRunText with a NULL text pointer
1850 * and a non-zero text length returns the correct error.
1851 */
1852 err = U_ZERO_ERROR;
1853 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
1854
1855 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1856 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1857 }
1858
1859 /*
1860 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
1861 * and a zero text length returns the correct error.
1862 */
1863 err = U_ZERO_ERROR;
1864 uscript_setRunText(scriptRun, testString, 0, &err);
1865
1866 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1867 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1868 }
1869
1870 /*
1871 * Now call uscript_setRunText on the empty iterator
1872 * and make sure that it works.
1873 */
1874 err = U_ZERO_ERROR;
1875 uscript_setRunText(scriptRun, testString, stringLimit, &err);
1876
1877 if (U_FAILURE(err)) {
1878 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
1879 } else {
1880 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
1881 }
1882
1883 uscript_closeRun(scriptRun);
1884
1885 /*
1886 * Now open an interator over the testString
1887 * using uscript_openRun and make sure that it works
1888 */
1889 scriptRun = uscript_openRun(testString, stringLimit, &err);
1890
1891 if (U_FAILURE(err)) {
1892 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
1893 } else {
1894 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
1895 }
1896
1897 /* Now reset the iterator, and make sure
1898 * that it still works.
1899 */
1900 uscript_resetRun(scriptRun);
1901
1902 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
1903
1904 /* Close the iterator */
1905 uscript_closeRun(scriptRun);
1906}
1907
1908/* test additional, non-core properties */
1909static void
1910TestAdditionalProperties() {
1911 /* test data for u_charAge() */
1912 static const struct {
1913 UChar32 c;
1914 UVersionInfo version;
1915 } charAges[]={
1916 {0x41, { 1, 1, 0, 0 }},
1917 {0xffff, { 1, 1, 0, 0 }},
1918 {0x20ab, { 2, 0, 0, 0 }},
1919 {0x2fffe, { 2, 0, 0, 0 }},
1920 {0x20ac, { 2, 1, 0, 0 }},
1921 {0xfb1d, { 3, 0, 0, 0 }},
1922 {0x3f4, { 3, 1, 0, 0 }},
1923 {0x10300, { 3, 1, 0, 0 }},
1924 {0x220, { 3, 2, 0, 0 }},
1925 {0xff60, { 3, 2, 0, 0 }}
1926 };
1927
1928 /* test data for u_hasBinaryProperty() */
1929 static int32_t
1930 props[][3]={ /* code point, property, value */
1931 { 0x0627, UCHAR_ALPHABETIC, TRUE },
1932 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
1933 { 0x2028, UCHAR_ALPHABETIC, FALSE },
1934
1935 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
1936 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
1937
1938 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
1939 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
1940
1941 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
1942 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
1943
1944 { 0x058a, UCHAR_DASH, TRUE },
1945 { 0x007e, UCHAR_DASH, FALSE },
1946
1947 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
1948 { 0x3000, UCHAR_DIACRITIC, FALSE },
1949
1950 { 0x0e46, UCHAR_EXTENDER, TRUE },
1951 { 0x0020, UCHAR_EXTENDER, FALSE },
1952
1953#if !UCONFIG_NO_NORMALIZATION
1954 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
1955 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
1956 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
1957#endif
1958
1959 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
1960 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
1961 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
1962
1963 { 0x30fb, UCHAR_HYPHEN, TRUE },
1964 { 0xfe58, UCHAR_HYPHEN, FALSE },
1965
1966 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
1967 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
1968 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
1969
1970 { 0x2172, UCHAR_ID_START, TRUE },
1971 { 0x007a, UCHAR_ID_START, TRUE },
1972 { 0x0039, UCHAR_ID_START, FALSE },
1973
1974 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
1975 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
1976 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
1977
1978 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
1979 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
1980
1981 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
1982 { 0x0345, UCHAR_LOWERCASE, TRUE },
1983 { 0x0030, UCHAR_LOWERCASE, FALSE },
1984
1985 { 0x1d7a9, UCHAR_MATH, TRUE },
1986 { 0x2135, UCHAR_MATH, TRUE },
1987 { 0x0062, UCHAR_MATH, FALSE },
1988
1989 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
1990 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
1991 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
1992
1993 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
1994 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
1995 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
1996
1997 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
1998 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
1999
2000 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2001 { 0x2162, UCHAR_UPPERCASE, TRUE },
2002 { 0x0345, UCHAR_UPPERCASE, FALSE },
2003
2004 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2005 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2006 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2007
2008 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2009 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2010 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2011
2012 { 0x16ee, UCHAR_XID_START, TRUE },
2013 { 0x23456, UCHAR_XID_START, TRUE },
2014 { 0x1d1aa, UCHAR_XID_START, FALSE },
2015
2016 /*
2017 * Version break:
2018 * The following properties are only supported starting with the
2019 * Unicode version indicated in the second field.
2020 */
2021 { -1, 0x32, 0 },
2022
2023 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2024 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2025 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2026
2027 { 0x0341, UCHAR_DEPRECATED, TRUE },
2028 { 0xe0041, UCHAR_DEPRECATED, FALSE },
2029
2030 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2031 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2032 { 0xff9f, UCHAR_GRAPHEME_BASE, TRUE }, /* changed from Unicode 3.2 to 4 */
2033
2034 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2035 { 0xff9f, UCHAR_GRAPHEME_EXTEND, FALSE }, /* changed from Unicode 3.2 to 4 */
2036 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2037
2038 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2039 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2040
2041 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2042 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2043
2044 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2045 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2046
2047 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2048 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2049
2050 { 0x2e9b, UCHAR_RADICAL, TRUE },
2051 { 0x4e00, UCHAR_RADICAL, FALSE },
2052
2053 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2054 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2055
2056 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2057 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2058
2059 /* enum/integer type properties */
2060
2061 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2062 /* test default Bidi classes for unassigned code points */
2063 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2064 { 0x05a2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2065 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2066 { 0x07f2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2067 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2068 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2069 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2070 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2071 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2072 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2073
2074 { 0x0606, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2075 { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2076 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2077 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2078 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2079 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2080 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2081 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2082
2083 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2084 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2085 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2086 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2087 { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2088 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2089 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2090 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2091 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2092
2093 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2094 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2095
2096 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2097 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2098 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2099 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2100 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2101 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2102 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2103 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2104 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2105
2106 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2107 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2108 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2109 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2110 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2111 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2112 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2113 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2114 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2115 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2116 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2117 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2118 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2119 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2120 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2121 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2122 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2123
2124 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2125 { 0xd7d7, UCHAR_GENERAL_CATEGORY, 0 },
2126
2127 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2128 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2129 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2130 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2131 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2132 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_HAMZA_ON_HEH_GOAL },
2133
2134 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2135 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2136 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2137 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2138 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2139 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2140 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2141 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2142
2143 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2144 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2145 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2146 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2147 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2148 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2149 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2150 { 0xac03, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2151 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2152 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2153 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2154 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2155 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2156 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2157 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2158 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2159 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2160
2161 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2162
2163 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2164
2165 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2166 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2167 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2168 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2169
2170 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2171 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2172 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2173 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2174
2175 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2176 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2177 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2178 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2179
2180 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2181 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2182 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2183 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2184 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2185 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2186
2187 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2188 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2189 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2190 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2191
2192 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2193 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2194 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2195 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2196 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2197
2198 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2199
2200 /* undefined UProperty values */
2201 { 0x61, 0x4a7, 0 },
2202 { 0x234bc, 0x15ed, 0 }
2203 };
2204
2205 UVersionInfo version;
2206 UChar32 c;
2207 int32_t i, result, uVersion;
2208 UProperty which;
2209
2210 /* what is our Unicode version? */
2211 u_getUnicodeVersion(version);
2212 uVersion=(version[0]<<4)|version[1]; /* major/minor version numbers */
2213
2214 u_charAge(0x20, version);
2215 if(version[0]==0) {
2216 /* no additional properties available */
2217 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2218 return;
2219 }
2220
2221 /* test u_charAge() */
2222 for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) {
2223 u_charAge(charAges[i].c, version);
2224 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2225 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2226 charAges[i].c,
2227 version[0], version[1], version[2], version[3],
2228 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2229 }
2230 }
2231
2232 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2233 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2234 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2235 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2236 u_getIntPropertyMinValue(0x2345)!=0
2237 ) {
2238 log_err("error: u_getIntPropertyMinValue() wrong\n");
2239 }
2240
2241 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1 ||
2242 u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1 ||
2243 u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1)!=1 ||
2244 u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ||
2245 u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ||
2246 u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1 ||
2247 u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1 ||
2248 u_getIntPropertyMaxValue(0x2345)!=-1 /*JB#2410*/ ||
2249 u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1) ||
2250 u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1) ||
2251 u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1) ||
2252 u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)
2253 ) {
2254 log_err("error: u_getIntPropertyMaxValue() wrong\n");
2255 }
2256
2257 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2258 for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) {
2259 if(props[i][0]<0) {
2260 /* Unicode version break */
2261 if(uVersion<props[i][1]) {
2262 break; /* do not test properties that are not yet supported */
2263 } else {
2264 continue; /* skip this row */
2265 }
2266 }
2267
2268 c=(UChar32)props[i][0];
2269 which=(UProperty)props[i][1];
2270
2271 if(which<UCHAR_INT_START) {
2272 result=u_hasBinaryProperty(c, which);
2273 if(result!=props[i][2]) {
2274 log_err("error: u_hasBinaryProperty(U+%04lx, %d)=%d is wrong (props[%d])\n",
2275 c, which, result, i);
2276 }
2277 }
2278
2279 result=u_getIntPropertyValue(c, which);
2280 if(result!=props[i][2]) {
2281 log_err("error: u_getIntPropertyValue(U+%04lx, 0x1000+%d)=%d is wrong, should be %d (props[%d])\n",
2282 c, (int32_t)which-0x1000, result, props[i][2], i);
2283 }
2284
2285 /* test separate functions, too */
2286 switch((UProperty)props[i][1]) {
2287 case UCHAR_ALPHABETIC:
2288 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2289 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2290 props[i][0], result, i);
2291 }
2292 break;
2293 case UCHAR_LOWERCASE:
2294 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2295 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2296 props[i][0], result, i);
2297 }
2298 break;
2299 case UCHAR_UPPERCASE:
2300 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2301 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2302 props[i][0], result, i);
2303 }
2304 break;
2305 case UCHAR_WHITE_SPACE:
2306 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2307 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2308 props[i][0], result, i);
2309 }
2310 break;
2311 default:
2312 break;
2313 }
2314 }
2315}
2316
2317static void
2318TestNumericProperties(void) {
2319 /* see UnicodeData.txt, DerivedNumericValues.txt */
2320 static const struct {
2321 UChar32 c;
2322 int32_t type;
2323 double numValue;
2324 } values[]={
2325 { 0x0F33, U_NT_NUMERIC, -1./2. },
2326 { 0x0C66, U_NT_DECIMAL, 0 },
2327 { 0x96f6, U_NT_NUMERIC, 0 },
2328 { 0x2159, U_NT_NUMERIC, 1./6. },
2329 { 0x00BD, U_NT_NUMERIC, 1./2. },
2330 { 0x0031, U_NT_DECIMAL, 1. },
2331 { 0x4e00, U_NT_NUMERIC, 1. },
2332 { 0x58f1, U_NT_NUMERIC, 1. },
2333 { 0x10320, U_NT_NUMERIC, 1. },
2334 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2335 { 0x00B2, U_NT_DIGIT, 2. },
2336 { 0x5f10, U_NT_NUMERIC, 2. },
2337 { 0x1813, U_NT_DECIMAL, 3. },
2338 { 0x5f0e, U_NT_NUMERIC, 3. },
2339 { 0x2173, U_NT_NUMERIC, 4. },
2340 { 0x8086, U_NT_NUMERIC, 4. },
2341 { 0x278E, U_NT_DIGIT, 5. },
2342 { 0x1D7F2, U_NT_DECIMAL, 6. },
2343 { 0x247A, U_NT_DIGIT, 7. },
2344 { 0x7396, U_NT_NUMERIC, 9. },
2345 { 0x1372, U_NT_NUMERIC, 10. },
2346 { 0x216B, U_NT_NUMERIC, 12. },
2347 { 0x16EE, U_NT_NUMERIC, 17. },
2348 { 0x249A, U_NT_NUMERIC, 19. },
2349 { 0x303A, U_NT_NUMERIC, 30. },
2350 { 0x5345, U_NT_NUMERIC, 30. },
2351 { 0x32B2, U_NT_NUMERIC, 37. },
2352 { 0x1375, U_NT_NUMERIC, 40. },
2353 { 0x10323, U_NT_NUMERIC, 50. },
2354 { 0x0BF1, U_NT_NUMERIC, 100. },
2355 { 0x964c, U_NT_NUMERIC, 100. },
2356 { 0x217E, U_NT_NUMERIC, 500. },
2357 { 0x2180, U_NT_NUMERIC, 1000. },
2358 { 0x4edf, U_NT_NUMERIC, 1000. },
2359 { 0x2181, U_NT_NUMERIC, 5000. },
2360 { 0x137C, U_NT_NUMERIC, 10000. },
2361 { 0x4e07, U_NT_NUMERIC, 10000. },
2362 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2363 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2364 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2365 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2366 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2367 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2368 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2369 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE }
2370 };
2371
2372 double nv;
2373 UChar32 c;
2374 int32_t i, type;
2375
2376 for(i=0; i<LENGTHOF(values); ++i) {
2377 c=values[i].c;
2378 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2379 nv=u_getNumericValue(c);
2380
2381 if(type!=values[i].type) {
2382 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2383 }
2384 if(0.000001 <= fabs(nv - values[i].numValue)) {
2385 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2386 }
2387 }
2388}
2389
2390/**
2391 * Test the property names and property value names API.
2392 */
2393static void
2394TestPropertyNames(void) {
2395 int32_t p, v, choice=0, rev;
2396 UBool atLeastSomething = FALSE;
2397
2398 for (p=0; ; ++p) {
2399 UBool sawProp = FALSE;
2400 if(p > 10 && !atLeastSomething) {
2401 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2402 return;
2403 }
2404
2405 for (choice=0; ; ++choice) {
2406 const char* name = u_getPropertyName(p, choice);
2407 if (name) {
2408 if (!sawProp) log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
2409 log_verbose("%d=\"%s\"", choice, name);
2410 sawProp = TRUE;
2411 atLeastSomething = TRUE;
2412
2413 /* test reverse mapping */
2414 rev = u_getPropertyEnum(name);
2415 if (rev != p) {
2416 log_err("Property round-trip failure: %d -> %s -> %d\n",
2417 p, name, rev);
2418 }
2419 }
2420 if (!name && choice>0) break;
2421 }
2422 if (sawProp) {
2423 /* looks like a valid property; check the values */
2424 const char* pname = u_getPropertyName(p, U_LONG_PROPERTY_NAME);
2425 int32_t max = 0;
2426 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2427 max = 255;
2428 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2429 /* it's far too slow to iterate all the way up to
2430 the real max, U_GC_P_MASK */
2431 max = U_GC_NL_MASK;
2432 } else if (p == UCHAR_BLOCK) {
2433 /* UBlockCodes, unlike other values, start at 1 */
2434 max = 1;
2435 }
2436 log_verbose("\n");
2437 for (v=-1; ; ++v) {
2438 UBool sawValue = FALSE;
2439 for (choice=0; ; ++choice) {
2440 const char* vname = u_getPropertyValueName(p, v, choice);
2441 if (vname) {
2442 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2443 log_verbose("%d=\"%s\"", choice, vname);
2444 sawValue = TRUE;
2445
2446 /* test reverse mapping */
2447 rev = u_getPropertyValueEnum(p, vname);
2448 if (rev != v) {
2449 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2450 pname, v, vname, rev);
2451 }
2452 }
2453 if (!vname && choice>0) break;
2454 }
2455 if (sawValue) {
2456 log_verbose("\n");
2457 }
2458 if (!sawValue && v>=max) break;
2459 }
2460 }
2461 if (!sawProp) {
2462 if (p>=UCHAR_STRING_LIMIT) {
2463 break;
2464 } else if (p>=UCHAR_DOUBLE_LIMIT) {
2465 p = UCHAR_STRING_START - 1;
2466 } else if (p>=UCHAR_MASK_LIMIT) {
2467 p = UCHAR_DOUBLE_START - 1;
2468 } else if (p>=UCHAR_INT_LIMIT) {
2469 p = UCHAR_MASK_START - 1;
2470 } else if (p>=UCHAR_BINARY_LIMIT) {
2471 p = UCHAR_INT_START - 1;
2472 }
2473 }
2474 }
2475}
2476
2477/**
2478 * Test the property values API. See JB#2410.
2479 */
2480static void
2481TestPropertyValues(void) {
2482 int32_t i, p, min, max;
2483 UErrorCode ec;
2484
2485 /* Min should be 0 for everything. */
2486 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
2487 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
2488 min = u_getIntPropertyMinValue(p);
2489 if (min != 0) {
2490 if (p == UCHAR_BLOCK) {
2491 /* This is okay...for now. See JB#2487.
2492 TODO Update this for JB#2487. */
2493 } else {
2494 const char* name;
2495 name = u_getPropertyName(p, U_LONG_PROPERTY_NAME);
2496 if (name == NULL) name = "<ERROR>";
2497 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
2498 name, min);
2499 }
2500 }
2501 }
2502
2503 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
2504 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
2505 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
2506 }
2507
2508 /* Max should be -1 for invalid properties. */
2509 max = u_getIntPropertyMaxValue(-1);
2510 if (max != -1) {
2511 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
2512 max);
2513 }
2514
2515 /* Script should return 0 for an invalid code point. */
2516 for (i=0; i<2; ++i) {
2517 int32_t script;
2518 const char* desc;
2519 ec = U_ZERO_ERROR;
2520 switch (i) {
2521 case 0:
2522 script = uscript_getScript(-1, &ec);
2523 desc = "uscript_getScript(-1)";
2524 break;
2525 case 1:
2526 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
2527 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
2528 break;
2529 default:
2530 log_err("Internal test error. Too many scripts\n");
2531 return;
2532 }
2533 /* We don't explicitly test ec. It should be U_FAILURE but it
2534 isn't documented as such. */
2535 if (script != 0) {
2536 log_err("FAIL: %s = %d, exp. 0\n",
2537 desc, script);
2538 }
2539 }
2540}
2541
2542/* add characters from a serialized set to a normal one */
2543static void
2544_setAddSerialized(USet *set, const USerializedSet *sset) {
2545 UChar32 start, end;
2546 int32_t i, count;
2547
2548 count=uset_getSerializedRangeCount(sset);
2549 for(i=0; i<count; ++i) {
2550 uset_getSerializedRange(sset, i, &start, &end);
2551 uset_addRange(set, start, end);
2552 }
2553}
2554
2555/* various tests for consistency of UCD data and API behavior */
2556static void
2557TestConsistency() {
2558#if !UCONFIG_NO_NORMALIZATION
2559 UChar buffer16[300];
2560#endif
2561 char buffer[300];
2562 USet *set1, *set2, *set3, *set4;
2563 UErrorCode errorCode;
2564
2565#if !UCONFIG_NO_NORMALIZATION
2566 USerializedSet sset;
2567#endif
2568 UChar32 start, end;
2569 int32_t i, length;
2570
2571 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
2572 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
2573 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
2574 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
2575 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
2576
2577 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
2578 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
2579 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
2580 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
2581 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
2582
2583 /*
2584 * It used to be that UCD.html and its precursors said
2585 * "Those dashes used to mark connections between pieces of words,
2586 * plus the Katakana middle dot."
2587 *
2588 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
2589 * but not from Hyphen.
2590 * UTC 94 (2003mar) decided to leave it that way and to changed UCD.html.
2591 * Therefore, do not show errors when testing the Hyphen property.
2592 */
2593 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
2594 "known to the UTC and not considered errors.\n");
2595
2596 errorCode=U_ZERO_ERROR;
2597 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
2598 set2=uset_openPattern(dashPattern, 8, &errorCode);
2599 if(U_SUCCESS(errorCode)) {
2600 /* remove the Katakana middle dot(s) from set1 */
2601 uset_remove(set1, 0x30fb);
2602 uset_remove(set1, 0xff65); /* halfwidth variant */
2603 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
2604 } else {
2605 log_err("error opening [:Hyphen:] or [:Dash:] - %s\n", u_errorName(errorCode));
2606 }
2607
2608 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
2609 set3=uset_openPattern(formatPattern, 6, &errorCode);
2610 set4=uset_openPattern(alphaPattern, 14, &errorCode);
2611 if(U_SUCCESS(errorCode)) {
2612 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
2613 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
2614 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
2615 } else {
2616 log_err("error opening [:Cf:] or [:Alpbabetic:] - %s\n", u_errorName(errorCode));
2617 }
2618
2619 uset_close(set1);
2620 uset_close(set2);
2621 uset_close(set3);
2622 uset_close(set4);
2623
2624 /*
2625 * Check that each lowercase character has "small" in its name
2626 * and not "capital".
2627 * There are some such characters, some of which seem odd.
2628 * Use the verbose flag to see these notices.
2629 */
2630 errorCode=U_ZERO_ERROR;
2631 set1=uset_openPattern(lowerPattern, 13, &errorCode);
2632 if(U_SUCCESS(errorCode)) {
2633 for(i=0;; ++i) {
2634 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
2635 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
2636 break; /* done */
2637 }
2638 if(U_FAILURE(errorCode)) {
2639 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
2640 i, u_errorName(errorCode));
2641 break;
2642 }
2643 if(length!=0) {
2644 break; /* done with code points, got a string or -1 */
2645 }
2646
2647 while(start<=end) {
2648 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
2649 if(U_FAILURE(errorCode)) {
2650 log_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
2651 errorCode=U_ZERO_ERROR;
2652 continue;
2653 }
2654 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
2655 strstr(buffer, "SMALL CAPITAL")==NULL
2656 ) {
2657 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
2658 }
2659 ++start;
2660 }
2661 }
2662 } else {
2663 log_err("error opening [:Lowercase:] - %s\n", u_errorName(errorCode));
2664 }
2665 uset_close(set1);
2666
2667#if !UCONFIG_NO_NORMALIZATION
2668
2669 /*
2670 * Test for an example that unorm_getCanonStartSet() delivers
2671 * all characters that compose from the input one,
2672 * even in multiple steps.
2673 * For example, the set for "I" (0049) should contain both
2674 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
2675 * In general, the set for the middle such character should be a subset
2676 * of the set for the first.
2677 */
2678 set1=uset_open(1, 0);
2679 set2=uset_open(1, 0);
2680
2681 unorm_getCanonStartSet(0x49, &sset);
2682 _setAddSerialized(set1, &sset);
2683
2684 /* enumerate all characters that are plausible to be latin letters */
2685 for(start=0xa0; start<0x2000; ++start) {
2686 if(unorm_getDecomposition(start, FALSE, buffer16, LENGTHOF(buffer16))>1 && buffer16[0]==0x49) {
2687 uset_add(set2, start);
2688 }
2689 }
2690
2691 compareUSets(set1, set2,
2692 "[canon start set of 0049]", "[all c with canon decomp with 0049]",
2693 TRUE);
2694 uset_close(set1);
2695 uset_close(set2);
2696
2697#endif
2698}