2 *******************************************************************************
4 * Copyright (C) 2009-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: bidiconf.cpp
10 * tab size: 8 (not used)
13 * created on: 2009oct16
14 * created by: Markus W. Scherer
16 * BiDi conformance test, using the Unicode BidiTest.txt and BidiCharacterTest.txt files.
22 #include "unicode/utypes.h"
23 #include "unicode/ubidi.h"
24 #include "unicode/errorcode.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/putil.h"
27 #include "unicode/unistr.h"
// Test class (derived from IntlTest) for the BiDi conformance tests.
// It declares the two test entry points plus the parsing and checking
// helpers, and holds the per-line state shared between them: the expected
// levels and ordering parsed from a test line, the input string, and the
// paragraph-level name used in error output.
// NOTE(review): this listing is missing lines (access specifiers, some
// members, the closing brace), so the declaration below is incomplete.
31 class BiDiConformanceTest
: public IntlTest
{
33 BiDiConformanceTest() :
34 directionBits(0), lineNumber(0), levelsCount(0), orderingCount(0),
37 void runIndexedTest(int32_t index
, UBool exec
, const char *&name
, char *par
=NULL
);
40 void TestBidiCharacterTest();
42 UBool
parseLevels(const char *&start
);
43 UBool
parseOrdering(const char *start
);
44 UBool
parseInputStringFromBiDiClasses(const char *&start
);
46 UBool
checkLevels(const UBiDiLevel actualLevels
[], int32_t actualCount
);
47 UBool
checkOrdering(UBiDi
*ubidi
);
49 void printErrorLine();
// Expected levels filled by parseLevels(). Entries >= UBIDI_DEFAULT_LTR are
// treated by checkLevels()/checkOrdering() as positions to ignore.
52 UBiDiLevel levels
[1000];
// parseLevels() ORs in (1<<(level&1)) for every numeric level it stores.
53 uint32_t directionBits
;
// Expected visual-to-logical ordering filled by parseOrdering().
54 int32_t ordering
[1000];
57 int32_t orderingCount
;
// Text passed to ubidi_setPara() by the test functions.
59 UnicodeString inputString
;
// Name of the paragraph level under test; printed by printErrorLine().
60 const char *paraLevelName
;
// Scratch buffer: TestBidiCharacterTest() formats a numeric level into it
// with sprintf("%d") and then points paraLevelName at it.
61 char levelNameString
[12];
// Factory function: returns a newly allocated BiDiConformanceTest as an
// IntlTest pointer. Who deletes the object is not visible in this file.
64 extern IntlTest
*createBiDiConformanceTest() {
65 return new BiDiConformanceTest();
// Test dispatcher: logs the suite name and registers TestBidiTest and
// TestBidiCharacterTest through the TESTCASE_AUTO macro.
// The last parameter (par) is unnamed here and therefore unused.
// NOTE(review): the surrounding TESTCASE_AUTO_BEGIN/END lines and any
// condition guarding the logln() call are not visible in this listing.
68 void BiDiConformanceTest::runIndexedTest(int32_t index
, UBool exec
, const char *&name
, char * /*par*/) {
70 logln("TestSuite BiDiConformanceTest: ");
73 TESTCASE_AUTO(TestBidiTest
);
74 TESTCASE_AUTO(TestBidiCharacterTest
);
// Defines LocalStdioFilePointer via U_DEFINE_LOCAL_OPEN_POINTER, pairing a
// FILE with fclose. The test functions below use it to hold the result of
// fopen() (presumably so the file is closed on scope exit -- see the macro's
// documentation in unicode/localpointer.h).
78 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer
, FILE, fclose
);
// Parses a whitespace-separated list of expected levels into levels[].
//
// start: in/out cursor into the line; the loop condition advances it past
//        whitespace and stops at ';' or at the terminating NUL.
//
// Numeric tokens are read with strtoul(base 10). A token is rejected (with
// an errln() message) if no digits were consumed, if it is followed by
// anything other than whitespace, ';' or NUL, or if the value exceeds
// UBIDI_MAX_EXPLICIT_LEVEL+1. Accepted values are appended to levels[] and
// recorded in directionBits as (1<<(value&1)).
//
// NOTE(review): the lines that reset the counters, test for the 'x' marker,
// advance start past a token and return a result are missing from this
// listing; callers use the result as "false means parse failure".
// NOTE(review): no check of levelsCount against the 1000-element levels[]
// array is visible here -- confirm whether overlong lines can overflow it.
80 UBool
BiDiConformanceTest::parseLevels(const char *&start
) {
83 while(*start
!=0 && *(start
=u_skipWhitespace(start
))!=0 && *start
!=';') {
// Store the "no level" marker; consumers skip entries >= UBIDI_DEFAULT_LTR.
85 levels
[levelsCount
++]=UBIDI_DEFAULT_LTR
;
89 uint32_t value
=(uint32_t)strtoul(start
, &end
, 10);
90 if(end
<=start
|| (!U_IS_INV_WHITESPACE(*end
) && *end
!=0 && *end
!=';')
91 || value
>(UBIDI_MAX_EXPLICIT_LEVEL
+1)) {
92 errln("\nError on line %d: Levels parse error at %s", (int)lineNumber
, start
);
96 levels
[levelsCount
++]=(UBiDiLevel
)value
;
// Bit 0 records that an even level was seen, bit 1 an odd level.
97 directionBits
|=(1<<(value
&1));
// Parses a whitespace-separated list of expected logical indexes into
// ordering[].
//
// start: cursor into the line (passed by value, unlike parseLevels());
//        parsing stops at ';' or at the terminating NUL.
//
// Each token is read with strtoul(base 10) and rejected (with an errln()
// message) if no digits were consumed, if it is followed by anything other
// than whitespace, ';' or NUL, or if the value is >= 1000.
//
// NOTE(review): the lines that reset orderingCount, advance start past a
// token and return a result are missing from this listing.
// NOTE(review): the value is range-checked, but no check of orderingCount
// against the 1000-element ordering[] array is visible here.
104 UBool
BiDiConformanceTest::parseOrdering(const char *start
) {
106 while(*start
!=0 && *(start
=u_skipWhitespace(start
))!=0 && *start
!=';') {
108 uint32_t value
=(uint32_t)strtoul(start
, &end
, 10);
109 if(end
<=start
|| (!U_IS_INV_WHITESPACE(*end
) && *end
!=0 && *end
!=';') || value
>=1000) {
110 errln("\nError on line %d: Reorder parse error at %s", (int)lineNumber
, start
);
114 ordering
[orderingCount
++]=(int32_t)value
;
// Table with one UChar per BiDi class, indexed by UCharDirection value.
// parseInputStringFromBiDiClasses() appends charFromBiDiClass[cls] to the
// input string, and biDiConfUBiDiClassCallback() maps such a character back
// to its index.
// NOTE(review): the table entries are missing from this listing.
120 static const UChar charFromBiDiClass
[U_CHAR_DIRECTION_COUNT
]={
140 // new in Unicode 6.3/ICU 52
// Class callback installed with ubidi_setClassCallback() in TestBidiTest().
// Does a linear search of charFromBiDiClass for c and returns the matching
// index as the UCharDirection; the context argument is unused.
// Returns U_BIDI_CLASS_DEFAULT when c is not in the table.
149 static UCharDirection U_CALLCONV
150 biDiConfUBiDiClassCallback(const void * /*context*/, UChar32 c
) {
151 for(int i
=0; i
<U_CHAR_DIRECTION_COUNT
; ++i
) {
152 if(c
==charFromBiDiClass
[i
]) {
153 return (UCharDirection
)i
;
156 // Character not in our hardcoded table.
157 // Should not occur during testing.
158 return U_BIDI_CLASS_DEFAULT
;
// Length of the short name of each BiDi class, indexed by UCharDirection.
// The table has one extra trailing entry (0) because
// parseInputStringFromBiDiClasses() indexes it with U_CHAR_DIRECTION_COUNT
// when no class name was recognized.
163 static const int8_t biDiClassNameLengths
[U_CHAR_DIRECTION_COUNT
+1]={
164 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 0
// Builds inputString from a whitespace-separated list of BiDi class names.
//
// start: in/out cursor into the line; parsing stops at ';' or at the
//        terminating NUL. After each recognized name, start is advanced by
//        that name's length.
//
// inputString is cleared first. For every recognized class name the
// representative character charFromBiDiClass[cls] is appended. A name counts
// as recognized only if it is followed by whitespace, ';' or NUL; otherwise
// an error is reported with errln().
//
// NOTE(review): several lines of the if/else chain (the outer tests on
// start[0]/start[1], the else branches) and the return statements are
// missing from this listing; callers use the result as "false means parse
// failure".
167 UBool
BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start
) {
168 inputString
.remove();
170 * Lengthy but fast BiDi class parser.
171 * A simple parser could terminate or extract the name string and use
172 * int32_t biDiClassInt=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, bidiClassString);
173 * but that makes this test take significantly more time.
175 while(*start
!=0 && *(start
=u_skipWhitespace(start
))!=0 && *start
!=';') {
// U_CHAR_DIRECTION_COUNT serves as the "nothing matched yet" sentinel.
176 UCharDirection biDiClass
=U_CHAR_DIRECTION_COUNT
;
177 // Compare each character once until we have a match on
178 // a complete, short BiDi class name.
182 biDiClass
=U_LEFT_TO_RIGHT_EMBEDDING
;
183 } else if(start
[2]=='I') {
184 biDiClass
=U_LEFT_TO_RIGHT_ISOLATE
;
185 } else if(start
[2]=='O') {
186 biDiClass
=U_LEFT_TO_RIGHT_OVERRIDE
;
189 biDiClass
=U_LEFT_TO_RIGHT
;
191 } else if(start
[0]=='R') {
194 biDiClass
=U_RIGHT_TO_LEFT_EMBEDDING
;
195 } else if(start
[2]=='I') {
196 biDiClass
=U_RIGHT_TO_LEFT_ISOLATE
;
197 } else if(start
[2]=='O') {
198 biDiClass
=U_RIGHT_TO_LEFT_OVERRIDE
;
201 biDiClass
=U_RIGHT_TO_LEFT
;
203 } else if(start
[0]=='E') {
205 biDiClass
=U_EUROPEAN_NUMBER
;
206 } else if(start
[1]=='S') {
207 biDiClass
=U_EUROPEAN_NUMBER_SEPARATOR
;
208 } else if(start
[1]=='T') {
209 biDiClass
=U_EUROPEAN_NUMBER_TERMINATOR
;
211 } else if(start
[0]=='A') {
213 biDiClass
=U_RIGHT_TO_LEFT_ARABIC
;
214 } else if(start
[1]=='N') {
215 biDiClass
=U_ARABIC_NUMBER
;
217 } else if(start
[0]=='C' && start
[1]=='S') {
218 biDiClass
=U_COMMON_NUMBER_SEPARATOR
;
219 } else if(start
[0]=='B') {
221 biDiClass
=U_BOUNDARY_NEUTRAL
;
223 biDiClass
=U_BLOCK_SEPARATOR
;
225 } else if(start
[0]=='S') {
226 biDiClass
=U_SEGMENT_SEPARATOR
;
227 } else if(start
[0]=='W' && start
[1]=='S') {
228 biDiClass
=U_WHITE_SPACE_NEUTRAL
;
229 } else if(start
[0]=='O' && start
[1]=='N') {
230 biDiClass
=U_OTHER_NEUTRAL
;
231 } else if(start
[0]=='P' && start
[1]=='D') {
233 biDiClass
=U_POP_DIRECTIONAL_FORMAT
;
234 } else if(start
[2]=='I') {
235 biDiClass
=U_POP_DIRECTIONAL_ISOLATE
;
237 } else if(start
[0]=='N' && start
[1]=='S' && start
[2]=='M') {
238 biDiClass
=U_DIR_NON_SPACING_MARK
;
239 } else if(start
[0]=='F' && start
[1]=='S' && start
[2]=='I') {
240 biDiClass
=U_FIRST_STRONG_ISOLATE
;
242 // Now we verify that the class name is terminated properly,
243 // and not just the start of a longer word.
// If nothing matched, biDiClass is still U_CHAR_DIRECTION_COUNT; the
// length table has an extra 0 entry for that index, so c is *start itself.
244 int8_t biDiClassNameLength
=biDiClassNameLengths
[biDiClass
];
245 char c
=start
[biDiClassNameLength
];
246 if(biDiClass
<U_CHAR_DIRECTION_COUNT
&& (U_IS_INV_WHITESPACE(c
) || c
==';' || c
==0)) {
247 inputString
.append(charFromBiDiClass
[biDiClass
]);
248 start
+=biDiClassNameLength
;
251 errln("\nError on line %d: BiDi class string not recognized at %s", (int)lineNumber
, start
);
// Runs the conformance test driven by BidiTest.txt.
//
// Steps visible in this listing:
//  - locate the source test data folder and open "BidiTest.txt" from it;
//  - open a UBiDi object and install biDiConfUBiDiClassCallback so that the
//    characters built from class names get the intended BiDi classes;
//  - read the file line by line while errorCount<10, cutting off '#'
//    comments and skipping empty lines;
//  - handle "Levels:" and "Reorder:" lines with parseLevels() and
//    parseOrdering(); other lines are parsed as a list of class names,
//    followed by ';' and a hexadecimal bitset;
//  - for paragraph levels auto/LTR, LTR, RTL and auto/RTL, call
//    ubidi_setPara() and compare the results with checkLevels() and
//    checkOrdering().
//
// NOTE(review): many lines (local declarations, returns, the test of each
// bitset bit, error counting) are missing from this listing.
258 void BiDiConformanceTest::TestBidiTest() {
259 IcuTestErrorCode
errorCode(*this, "TestBidiTest");
260 const char *sourceTestDataPath
=getSourceTestData(errorCode
);
261 if(errorCode
.logIfFailureAndReset("unable to find the source/test/testdata "
262 "folder (getSourceTestData())")) {
// NOTE(review): strcpy/strcat into a fixed 400-byte buffer; no length check
// on sourceTestDataPath is visible in this listing.
265 char bidiTestPath
[400];
266 strcpy(bidiTestPath
, sourceTestDataPath
);
267 strcat(bidiTestPath
, "BidiTest.txt");
268 LocalStdioFilePointer
bidiTestFile(fopen(bidiTestPath
, "r"));
269 if(bidiTestFile
.isNull()) {
270 errln("unable to open %s", bidiTestPath
);
273 LocalUBiDiPointer
ubidi(ubidi_open());
274 ubidi_setClassCallback(ubidi
.getAlias(), biDiConfUBiDiClassCallback
, NULL
,
275 NULL
, NULL
, errorCode
);
276 if(errorCode
.logIfFailureAndReset("ubidi_setClassCallback()")) {
283 // paraLevelName must be initialized in case the first non-comment line is in error
// Stop reading once 10 errors have been counted.
285 while(errorCount
<10 && fgets(line
, (int)sizeof(line
), bidiTestFile
.getAlias())!=NULL
) {
287 // Remove trailing comments and whitespace.
288 char *commentStart
=strchr(line
, '#');
289 if(commentStart
!=NULL
) {
293 const char *start
=u_skipWhitespace(line
);
295 continue; // Skip empty and comment-only lines.
299 if(0==strncmp(start
, "Levels:", 7)) {
301 if(!parseLevels(start
)) {
304 } else if(0==strncmp(start
, "Reorder:", 8)) {
305 if(!parseOrdering(start
+8)) {
309 // Skip unknown @Xyz: ...
311 if(!parseInputStringFromBiDiClasses(start
)) {
314 start
=u_skipWhitespace(start
);
316 errln("missing ; separator on input line %s", line
);
319 start
=u_skipWhitespace(start
+1);
// The field after ';' is a hexadecimal bitset.
321 uint32_t bitset
=(uint32_t)strtoul(start
, &end
, 16);
322 if(end
<=start
|| (!U_IS_INV_WHITESPACE(*end
) && *end
!=';' && *end
!=0)) {
323 errln("input bitset parse error at %s", start
);
326 // Loop over the bitset.
327 static const UBiDiLevel paraLevels
[]={ UBIDI_DEFAULT_LTR
, 0, 1, UBIDI_DEFAULT_RTL
};
328 static const char *const paraLevelNames
[]={ "auto/LTR", "LTR", "RTL", "auto/RTL" };
329 for(int i
=0; i
<=3; ++i
) {
331 ubidi_setPara(ubidi
.getAlias(), inputString
.getBuffer(), inputString
.length(),
332 paraLevels
[i
], NULL
, errorCode
);
333 const UBiDiLevel
*actualLevels
=ubidi_getLevels(ubidi
.getAlias(), errorCode
);
334 if(errorCode
.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) {
335 errln("Input line %d: %s", (int)lineNumber
, line
);
// Record the level name first so printErrorLine() can report it.
338 paraLevelName
=paraLevelNames
[i
];
339 if(!checkLevels(actualLevels
, ubidi_getProcessedLength(ubidi
.getAlias()))) {
340 // continue outerLoop; does not exist in C++
341 // so just break out of the inner loop.
344 if(!checkOrdering(ubidi
.getAlias())) {
345 // continue outerLoop; does not exist in C++
346 // so just break out of the inner loop.
356 *******************************************************************************
358 * created on: 2013jul01
359 * created by: Matitiahu Allouche
361 This function performs a conformance test for implementations of the
362 Unicode Bidirectional Algorithm, specified in UAX #9: Unicode
363 Bidirectional Algorithm, at http://www.unicode.org/unicode/reports/tr9/
365 Each test case is represented in a single line which is read from a file
366 named BidiCharacterTest.txt. Empty, blank and comment lines may also appear
369 The format of the test data is specified below. Note that each test
370 case constitutes a single line of text; reordering is applied within a
371 single line and independently of a rendering engine, and rules L3 and L4
374 The number sign '#' is the comment character: everything is ignored from
375 the occurrence of '#' until the end of the line.
376 Empty lines and lines containing only spaces and/or comments are ignored.
378 Lines which represent test cases consist of 4 or 5 fields separated by a
379 semicolon. Each field consists of tokens separated by whitespace (space
380 or Tab). Whitespace before and after semicolons is optional.
382 Field 0: A sequence of hexadecimal code point values separated by space
384 Field 1: A value representing the paragraph direction, as follows:
385 - 0 represents left-to-right
386 - 1 represents right-to-left
387 - 2 represents auto-LTR according to rules P2 and P3 of the algorithm
388 - 3 represents auto-RTL according to rules P2 and P3 of the algorithm
389 - a negative number whose absolute value is taken as paragraph level;
390 this may be useful to test cases where the embedding level approaches
391 or exceeds the maximum embedding level.
393 Field 2: The resolved paragraph embedding level. If the input (field 0)
394 includes more than one paragraph, this field represents the
395 resolved level of the first paragraph.
397 Field 3: An ordered list of resulting levels for each token in field 0
398 (each token represents one source character).
399 The UBA does not assign levels to certain characters (e.g. LRO);
400 characters removed in rule X9 are indicated with an 'x'.
402 Field 4: An ordered list of indices showing the resulting visual ordering
403 from left to right; characters with a resolved level of 'x' are
404 skipped. The numbers are zero-based. Each index corresponds to
405 a character in the reordered (visual) string. It represents the
406 index of the source character in the input (field 0).
407 This field is optional. When it is absent, the visual ordering
412 # This is a comment line.
413 L L ON R ; 0 ; 0 ; 0 0 0 1 ; 0 1 2 3
414 L L ON R;0;0;0 0 0 1;0 1 2 3
416 # Note: in the next line, 'B' represents a block separator, not the letter 'B'.
417 LRE A B C PDF;2;0;x 2 0 0 x;1 2 3
418 # Note: in the next line, 'b' represents the letter 'b', not a block separator.
419 a b c 05d0 05d1 x ; 0 ; 0 ; 0 0 0 1 1 0 ; 0 1 2 4 3 5
421 a R R x ; 1 ; 1 ; 2 1 1 2
422 L L R R R B R R L L L B ON ON ; 3 ; 0 ; 0 0 1 1 1 0 1 1 2 2 2 1 1 1
425 *******************************************************************************
// Runs the conformance test driven by BidiCharacterTest.txt.
//
// Steps visible in this listing, per data line (while errorCount<20):
//  - cut off '#' comments and skip empty lines;
//  - field 0: parse the code point string into inputString with
//    u_parseString(), using a buffer obtained via getBuffer(200);
//  - field 1: parse the paragraph direction with strtol(); 2 and 3 select
//    UBIDI_DEFAULT_LTR / UBIDI_DEFAULT_RTL, a negative value whose magnitude
//    is <= UBIDI_MAX_EXPLICIT_LEVEL+1 selects that explicit level;
//    UBIDI_MAX_EXPLICIT_LEVEL+2 is the "not set" sentinel that triggers the
//    "direction incorrect" error;
//  - field 2: parse the resolved paragraph level (must be 0 or 1);
//  - field 3: parseLevels(); field 4: parseOrdering();
//  - call ubidi_setPara(), then compare the paragraph level, the levels
//    (checkLevels) and the ordering (checkOrdering) with the expected values.
//
// NOTE(review): many lines (local declarations, the bodies for directions 0
// and 1, the ';' tests, returns, error counting) are missing from this
// listing. No class callback is installed on the UBiDi object in the visible
// lines.
427 void BiDiConformanceTest::TestBidiCharacterTest() {
428 IcuTestErrorCode
errorCode(*this, "TestBidiCharacterTest");
429 const char *sourceTestDataPath
=getSourceTestData(errorCode
);
430 if(errorCode
.logIfFailureAndReset("unable to find the source/test/testdata "
431 "folder (getSourceTestData())")) {
// NOTE(review): strcpy/strcat into a fixed 400-byte buffer; no length check
// on sourceTestDataPath is visible in this listing.
434 char bidiTestPath
[400];
435 strcpy(bidiTestPath
, sourceTestDataPath
);
436 strcat(bidiTestPath
, "BidiCharacterTest.txt");
437 LocalStdioFilePointer
bidiTestFile(fopen(bidiTestPath
, "r"));
438 if(bidiTestFile
.isNull()) {
439 errln("unable to open %s", bidiTestPath
);
442 LocalUBiDiPointer
ubidi(ubidi_open());
// Stop reading once 20 errors have been counted.
447 while(errorCount
<20 && fgets(line
, (int)sizeof(line
), bidiTestFile
.getAlias())!=NULL
) {
451 // Remove trailing comments and whitespace.
452 char *commentStart
=strchr(line
, '#');
453 if(commentStart
!=NULL
) {
457 const char *start
=u_skipWhitespace(line
);
459 continue; // Skip empty and comment-only lines.
461 // Parse the code point string in field 0.
462 UChar
*buffer
=inputString
.getBuffer(200);
463 int32_t length
=u_parseString(start
, buffer
, inputString
.getCapacity(), NULL
, errorCode
);
464 if(errorCode
.logIfFailureAndReset("Invalid string in field 0")) {
465 errln("Input line %d: %s", (int)lineNumber
, line
);
466 inputString
.remove();
469 inputString
.releaseBuffer(length
);
470 start
=strchr(start
, ';');
473 errln("\nError on line %d: Missing ; separator on line: %s", (int)lineNumber
, line
);
476 start
=u_skipWhitespace(start
+1);
478 int32_t paraDirection
=(int32_t)strtol(start
, &end
, 10);
// Sentinel: stays at MAX_EXPLICIT_LEVEL+2 unless a branch below assigns it.
479 UBiDiLevel paraLevel
=UBIDI_MAX_EXPLICIT_LEVEL
+2;
480 if(paraDirection
==0) {
484 else if(paraDirection
==1) {
488 else if(paraDirection
==2) {
489 paraLevel
=UBIDI_DEFAULT_LTR
;
490 paraLevelName
="Auto/LTR";
492 else if(paraDirection
==3) {
493 paraLevel
=UBIDI_DEFAULT_RTL
;
494 paraLevelName
="Auto/RTL";
496 else if(paraDirection
<0 && -paraDirection
<=(UBIDI_MAX_EXPLICIT_LEVEL
+1)) {
497 paraLevel
=(UBiDiLevel
)(-paraDirection
);
498 sprintf(levelNameString
, "%d", (int)paraLevel
);
499 paraLevelName
=levelNameString
;
501 if(end
<=start
|| (!U_IS_INV_WHITESPACE(*end
) && *end
!=';' && *end
!=0) ||
502 paraLevel
==(UBIDI_MAX_EXPLICIT_LEVEL
+2)) {
503 errln("\nError on line %d: Input paragraph direction incorrect at %s", (int)lineNumber
, start
);
507 start
=u_skipWhitespace(end
);
510 errln("\nError on line %d: Missing ; separator on line: %s", (int)lineNumber
, line
);
514 uint32_t resolvedParaLevel
=(uint32_t)strtoul(start
, &end
, 10);
515 if(end
<=start
|| (!U_IS_INV_WHITESPACE(*end
) && *end
!=';' && *end
!=0) ||
516 resolvedParaLevel
>1) {
517 errln("\nError on line %d: Resolved paragraph level incorrect at %s", (int)lineNumber
, start
);
521 start
=u_skipWhitespace(end
);
524 errln("\nError on line %d: Missing ; separator on line: %s", (int)lineNumber
, line
);
528 if(!parseLevels(start
)) {
531 start
=u_skipWhitespace(start
);
533 if(!parseOrdering(start
+1)) {
540 ubidi_setPara(ubidi
.getAlias(), inputString
.getBuffer(), inputString
.length(),
541 paraLevel
, NULL
, errorCode
);
542 const UBiDiLevel
*actualLevels
=ubidi_getLevels(ubidi
.getAlias(), errorCode
);
543 if(errorCode
.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) {
544 errln("Input line %d: %s", (int)lineNumber
, line
);
547 UBiDiLevel actualLevel
;
548 if((actualLevel
=ubidi_getParaLevel(ubidi
.getAlias()))!=resolvedParaLevel
) {
// NOTE(review): resolvedParaLevel (uint32_t) and actualLevel (UBiDiLevel) are
// passed to %d without the (int) casts used by the other errln() calls.
550 errln("\nError on line %d: Wrong resolved paragraph level; expected %d actual %d",
551 (int)lineNumber
, resolvedParaLevel
, actualLevel
);
554 if(!checkLevels(actualLevels
, ubidi_getProcessedLength(ubidi
.getAlias()))) {
// The ordering is only checked when orderingCount is non-negative
// (presumably it is set negative when field 4 is absent -- the assignment is
// not visible in this listing).
557 if(orderingCount
>=0 && !checkOrdering(ubidi
.getAlias())) {
// Converts a level into the single UChar that checkLevels() appends to its
// "Expected levels" / "Actual levels" diagnostic strings.
// Levels below UBIDI_DEFAULT_LTR take the first branch.
// NOTE(review): the returned values are missing from this listing.
563 static UChar
printLevel(UBiDiLevel level
) {
564 if(level
<UBIDI_DEFAULT_LTR
) {
// Computes a direction bit set for an array of levels: bit 0 is set if any
// level is even, bit 1 if any level is odd (each level contributes
// 1<<(level&1)). Returns 0 when actualCount is 0.
// checkLevels() compares this with the directionBits that parseLevels()
// built from the expected levels.
571 static uint32_t getDirectionBits(const UBiDiLevel actualLevels
[], int32_t actualCount
) {
572 uint32_t actualDirectionBits
=0;
573 for(int32_t i
=0; i
<actualCount
; ++i
) {
574 actualDirectionBits
|=(1<<(actualLevels
[i
]&1));
576 return actualDirectionBits
;
// Compares the levels returned by ICU with the expected levels[] array.
//
// actualLevels: levels from ubidi_getLevels().
// actualCount:  number of entries in actualLevels.
//
// Reports an error when the counts differ. Otherwise each position is
// compared, ignoring positions whose expected level is >= UBIDI_DEFAULT_LTR.
// A mismatch is tolerated when the expected text is unidirectional
// (directionBits!=3) and the actual levels have the same direction bits; in
// any other case it is reported with errln(). The function also builds
// "Expected levels" and "Actual levels" strings for diagnostics.
//
// NOTE(review): the isOk bookkeeping, the condition under which the
// diagnostic strings are printed and the return statement are missing from
// this listing.
579 UBool
BiDiConformanceTest::checkLevels(const UBiDiLevel actualLevels
[], int32_t actualCount
) {
581 if(levelsCount
!=actualCount
) {
582 errln("\nError on line %d: Wrong number of level values; expected %d actual %d",
583 (int)lineNumber
, (int)levelsCount
, (int)actualCount
);
586 for(int32_t i
=0; i
<actualCount
; ++i
) {
// Only expected entries that carry a real level take part in the comparison.
587 if(levels
[i
]!=actualLevels
[i
] && levels
[i
]<UBIDI_DEFAULT_LTR
) {
588 if(directionBits
!=3 && directionBits
==getDirectionBits(actualLevels
, actualCount
)) {
589 // ICU used a shortcut:
590 // Since the text is unidirectional, it did not store the resolved
591 // levels but just returns all levels as the paragraph level 0 or 1.
592 // The reordering result is the same, so this is fine.
595 errln("\nError on line %d: Wrong level value at index %d; expected %d actual %d",
596 (int)lineNumber
, (int)i
, levels
[i
], actualLevels
[i
]);
605 UnicodeString
els("Expected levels: ");
607 for(i
=0; i
<levelsCount
; ++i
) {
608 els
.append((UChar
)0x20).append(printLevel(levels
[i
]));
610 UnicodeString
als("Actual levels: ");
611 for(i
=0; i
<actualCount
; ++i
) {
612 als
.append((UChar
)0x20).append(printLevel(actualLevels
[i
]));
// Compares ICU's visual ordering with the expected ordering[] array.
//
// ubidi: the UBiDi object on which ubidi_setPara() has already been called.
//
// Walks the visual positions 0..resultLength-1, maps each one to its logical
// index with ubidi_getLogicalIndex(), skips positions whose expected level
// is >= UBIDI_DEFAULT_LTR, and compares the remaining logical indexes with
// ordering[] in sequence. A wrong value or a wrong total count is reported
// with errln(). The function also builds "Expected ordering" and "Actual
// ordering" strings for diagnostics.
//
// NOTE(review): the isOk bookkeeping, the visualIndex increment, the
// condition under which the diagnostic strings are printed and the return
// statement are missing from this listing.
620 // Note: ubidi_setReorderingOptions(ubidi, UBIDI_OPTION_REMOVE_CONTROLS);
621 // does not work for custom BiDi class assignments
622 // and anyway also removes LRM/RLM/ZWJ/ZWNJ which is not desirable here.
623 // Therefore we just skip the indexes for BiDi controls while comparing
624 // with the expected ordering that has them omitted.
625 UBool
BiDiConformanceTest::checkOrdering(UBiDi
*ubidi
) {
627 IcuTestErrorCode
errorCode(*this, "checkOrdering()");
628 int32_t resultLength
=ubidi_getResultLength(ubidi
); // visual length including BiDi controls
629 int32_t i
, visualIndex
;
630 // Note: It should be faster to call ubidi_countRuns()/ubidi_getVisualRun()
631 // and loop over each run's indexes, but that seems unnecessary for this test code.
632 for(i
=visualIndex
=0; i
<resultLength
; ++i
) {
633 int32_t logicalIndex
=ubidi_getLogicalIndex(ubidi
, i
, errorCode
);
634 if(errorCode
.logIfFailureAndReset("ubidi_getLogicalIndex()")) {
635 errln("Input line %d: %s", (int)lineNumber
, line
);
638 if(levels
[logicalIndex
]>=UBIDI_DEFAULT_LTR
) {
639 continue; // BiDi control, omitted from expected ordering.
// The visualIndex<orderingCount guard keeps the read of ordering[] within
// the parsed entries; a count mismatch is reported after the loop.
641 if(visualIndex
<orderingCount
&& logicalIndex
!=ordering
[visualIndex
]) {
642 errln("\nError on line %d: Wrong ordering value at visual index %d; expected %d actual %d",
643 (int)lineNumber
, (int)visualIndex
, ordering
[visualIndex
], logicalIndex
);
649 // visualIndex is now the visual length minus the BiDi controls,
650 // which should match the length of the BidiTest.txt ordering.
651 if(isOk
&& orderingCount
!=visualIndex
) {
652 errln("\nError on line %d: Wrong number of ordering values; expected %d actual %d",
653 (int)lineNumber
, (int)orderingCount
, (int)visualIndex
);
// NOTE(review): the diagnostics below print each index as the single
// character 0x30+index, which is a decimal digit only for indexes 0..9.
658 UnicodeString
eord("Expected ordering: ");
659 for(i
=0; i
<orderingCount
; ++i
) {
660 eord
.append((UChar
)0x20).append((UChar
)(0x30+ordering
[i
]));
662 UnicodeString
aord("Actual ordering: ");
663 for(i
=0; i
<resultLength
; ++i
) {
664 int32_t logicalIndex
=ubidi_getLogicalIndex(ubidi
, i
, errorCode
);
665 if(levels
[logicalIndex
]<UBIDI_DEFAULT_LTR
) {
666 aord
.append((UChar
)0x20).append((UChar
)(0x30+logicalIndex
));
// Reports the context of a failure through errln(): the current line number
// and raw input line, the input string, and the paragraph level name.
// NOTE(review): line 676 of the original file is missing from this listing.
675 void BiDiConformanceTest::printErrorLine() {
677 errln("Input line %5d: %s", (int)lineNumber
, line
);
678 errln(UnicodeString("Input string: ")+inputString
);
679 errln("Para level: %s", paraLevelName
);