1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /********************************************************************************
12 * Modification History:
14 * Madhu Katragadda Creation
15 *********************************************************************************/
16 /*C API TEST FOR BREAKITERATOR */
18 * This is an API test. It doesn't test very many cases, and doesn't
19 * try to test the full functionality. It just calls each function in the class and
20 * verifies that it works on a basic level.
23 #include "unicode/utypes.h"
25 #if !UCONFIG_NO_BREAK_ITERATION
29 #include "unicode/uloc.h"
30 #include "unicode/ubrk.h"
31 #include "unicode/ustring.h"
32 #include "unicode/ucnv.h"
33 #include "unicode/utext.h"
/* Logs a data error (file, line and the ICU error name) when `status` is a
 * failure code; a success status produces no output. The expansion is a brace
 * block and contains no return, so the calling test keeps running.
 * NOTE(review): because this expands to `{...}` rather than a single statement,
 * `TEST_ASSERT_SUCCESS(s);` leaves an empty statement behind - take care when
 * using it as the unbraced body of an if/else. */
38 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
39 log_data_err("Failure at file %s, line %d, error = %s (Are you missing data?)\n", __FILE__, __LINE__, u_errorName(status));}}
/* Logs a data error (file and line) when `expr` compares equal to FALSE.
 * It only reports: the expansion is a brace block with no return, so the
 * calling test continues after a failed assertion. */
41 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
42 log_data_err("Test Failure at file %s, line %d (Are you missing data?)\n", __FILE__, __LINE__);}}
44 #define APPLE_ADDITIONS 1
46 #if !UCONFIG_NO_FILE_IO
/* Forward declarations of the file-local test functions. Each of them is
 * registered by name through addTest() in addBrkIterAPITest(). */
47 static void TestBreakIteratorSafeClone(void);
49 static void TestBreakIteratorRules(void);
50 static void TestBreakIteratorRuleError(void);
51 static void TestBreakIteratorStatusVec(void);
52 static void TestBreakIteratorUText(void);
53 static void TestBreakIteratorTailoring(void);
54 static void TestBreakIteratorRefresh(void);
55 static void TestBug11665(void);
56 static void TestBreakIteratorSuppressions(void);
58 static void TestRuleBasedTokenizer(void);
/* Prototype of the non-static entry point of this file: its definition passes
 * `root` to addTest() once per test, under "tstxtbd/cbiapts/..." paths. */
61 void addBrkIterAPITest(TestNode
** root
);
63 void addBrkIterAPITest(TestNode
** root
)
65 #if !UCONFIG_NO_FILE_IO
66 addTest(root
, &TestBreakIteratorCAPI
, "tstxtbd/cbiapts/TestBreakIteratorCAPI");
67 addTest(root
, &TestBreakIteratorSafeClone
, "tstxtbd/cbiapts/TestBreakIteratorSafeClone");
68 addTest(root
, &TestBreakIteratorUText
, "tstxtbd/cbiapts/TestBreakIteratorUText");
70 addTest(root
, &TestBreakIteratorRules
, "tstxtbd/cbiapts/TestBreakIteratorRules");
71 addTest(root
, &TestBreakIteratorRuleError
, "tstxtbd/cbiapts/TestBreakIteratorRuleError");
72 addTest(root
, &TestBreakIteratorStatusVec
, "tstxtbd/cbiapts/TestBreakIteratorStatusVec");
73 addTest(root
, &TestBreakIteratorTailoring
, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
74 addTest(root
, &TestBreakIteratorRefresh
, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
75 addTest(root
, &TestBug11665
, "tstxtbd/cbiapts/TestBug11665");
76 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
77 addTest(root
, &TestBreakIteratorSuppressions
, "tstxtbd/cbiapts/TestBreakIteratorSuppressions");
80 addTest(root
, &TestRuleBasedTokenizer
, "tstxtbd/cbiapts/TestRuleBasedTokenizer");
/* Number of break iterators exercised by TestBreakIteratorSafeClone(): it sizes
 * the clone buffers and both iterator arrays there and bounds the test loop. */
84 #define CLONETEST_ITERATOR_COUNT 2
87 * Utility function for converting char * to UChar * strings, to
88 * simplify the test code. Converted strings are put in heap allocated
89 * storage. A hook (probably a local in the caller's code) allows all
90 * strings converted with that hook to be freed with a single call.
92 typedef struct StringStruct
{
93 struct StringStruct
*link
;
98 static UChar
* toUChar(const char *src
, void **freeHook
) {
99 /* Structure of the memory that we allocate on the heap */
103 UChar stackBuf
[2000 + sizeof(void *)/sizeof(UChar
)];
107 UErrorCode status
= U_ZERO_ERROR
;
112 cnv
= ucnv_open(NULL
, &status
);
113 if(U_FAILURE(status
) || cnv
== NULL
) {
117 numUChars
= ucnv_toUChars(cnv
,
123 destSize
= (numUChars
+1) * sizeof(UChar
) + sizeof(struct StringStruct
);
124 dest
= (StringStruct
*)malloc(destSize
);
126 if (status
== U_BUFFER_OVERFLOW_ERROR
|| status
== U_STRING_NOT_TERMINATED_WARNING
) {
127 ucnv_toUChars(cnv
, dest
->str
, numUChars
+1, src
, -1, &status
);
128 } else if (status
== U_ZERO_ERROR
) {
129 u_strcpy(dest
->str
, stackBuf
);
136 ucnv_reset(cnv
); /* be good citizens */
142 dest
->link
= (StringStruct
*)(*freeHook
);
147 static void freeToUCharStrings(void **hook
) {
148 StringStruct
*s
= *(StringStruct
**)hook
;
150 StringStruct
*next
= s
->link
;
157 #if !UCONFIG_NO_FILE_IO
158 static void TestBreakIteratorCAPI()
160 UErrorCode status
= U_ZERO_ERROR
;
161 UBreakIterator
*word
, *sentence
, *line
, *character
, *b
, *bogus
;
162 int32_t start
,pos
,end
,to
;
168 /* Note: the adjacent "" are concatenating strings, not adding a \" to the
169 string, which is probably what whoever wrote this intended. Don't fix,
170 because it would throw off the hard coded break positions in the following
172 u_uastrcpy(text
, "He's from Africa. ""Mr. Livingston, I presume?"" Yeah");
176 log_verbose("\nTesting BreakIterator open functions\n");
178 /* Word iterator for en_US; opening with fr_FR is exercised further down */
179 word
= ubrk_open(UBRK_WORD
, "en_US", text
, u_strlen(text
), &status
);
180 if(status
== U_FILE_ACCESS_ERROR
) {
181 log_data_err("Check your data - it doesn't seem to be around\n");
183 } else if(U_FAILURE(status
)){
184 log_err_status(status
, "FAIL: Error in ubrk_open() for word breakiterator: %s\n", myErrorName(status
));
187 log_verbose("PASS: Successfully opened word breakiterator\n");
190 sentence
= ubrk_open(UBRK_SENTENCE
, "en_US", text
, u_strlen(text
), &status
);
191 if(U_FAILURE(status
)){
192 log_err_status(status
, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status
));
196 log_verbose("PASS: Successfully opened sentence breakiterator\n");
199 line
= ubrk_open(UBRK_LINE
, "en_US", text
, u_strlen(text
), &status
);
200 if(U_FAILURE(status
)){
201 log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status
));
205 log_verbose("PASS: Successfully opened line breakiterator\n");
208 character
= ubrk_open(UBRK_CHARACTER
, "en_US", text
, u_strlen(text
), &status
);
209 if(U_FAILURE(status
)){
210 log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status
));
214 log_verbose("PASS: Successfully opened character breakiterator\n");
216 /*trying to open an illegal iterator*/
217 bogus
= ubrk_open((UBreakIteratorType
)5, "en_US", text
, u_strlen(text
), &status
);
219 log_err("FAIL: expected NULL from opening an invalid break iterator.\n");
221 if(U_SUCCESS(status
)){
222 log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n");
224 if(U_FAILURE(status
)){
225 if(status
!= U_ILLEGAL_ARGUMENT_ERROR
){
226 log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n Got %s\n", myErrorName(status
));
232 /* ======= Test ubrk_countAvailable() and ubrk_getAvailable() */
234 log_verbose("\nTesting ubrk_countAvailable() and ubrk_getAvailable()\n");
235 count
=ubrk_countAvailable();
236 /* use something sensible w/o hardcoding the count */
238 log_err("FAIL: Error in ubrk_countAvialable() returned %d\n", count
);
241 log_verbose("PASS: ubrk_countAvialable() successful returned %d\n", count
);
245 log_verbose("%s\n", ubrk_getAvailable(i
));
246 if (ubrk_getAvailable(i
) == 0)
247 log_err("No locale for which breakiterator is applicable\n");
249 log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i
));
252 /*========Test ubrk_first(), ubrk_last()...... and other functions*/
254 log_verbose("\nTesting the functions for word\n");
255 start
= ubrk_first(word
);
257 log_err("error ubrk_start(word) did not return 0\n");
258 log_verbose("first (word = %d\n", (int32_t)start
);
261 log_err("error ubrk_next(word) did not return 4\n");
262 log_verbose("next (word = %d\n", (int32_t)pos
);
263 pos
=ubrk_following(word
, 4);
265 log_err("error ubrl_following(word,4) did not return 6\n");
266 log_verbose("next (word = %d\n", (int32_t)pos
);
269 log_err("error ubrk_last(word) did not return 49\n");
270 log_verbose("last (word = %d\n", (int32_t)end
);
272 pos
=ubrk_previous(word
);
273 log_verbose("%d %d\n", end
, pos
);
275 pos
=ubrk_previous(word
);
276 log_verbose("%d \n", pos
);
278 if (ubrk_isBoundary(word
, 2) != FALSE
) {
279 log_err("error ubrk_isBoundary(word, 2) did not return FALSE\n");
281 pos
=ubrk_current(word
);
283 log_err("error ubrk_current() != 4 after ubrk_isBoundary(word, 2)\n");
285 if (ubrk_isBoundary(word
, 4) != TRUE
) {
286 log_err("error ubrk_isBoundary(word, 4) did not return TRUE\n");
291 log_verbose("\nTesting the functions for character\n");
292 ubrk_first(character
);
293 pos
= ubrk_following(character
, 5);
295 log_err("error ubrk_following(character,5) did not return 6\n");
296 log_verbose("Following (character,5) = %d\n", (int32_t)pos
);
297 pos
=ubrk_following(character
, 18);
299 log_err("error ubrk_following(character,18) did not return 19\n");
300 log_verbose("Followingcharacter,18) = %d\n", (int32_t)pos
);
301 pos
=ubrk_preceding(character
, 22);
303 log_err("error ubrk_preceding(character,22) did not return 21\n");
304 log_verbose("preceding(character,22) = %d\n", (int32_t)pos
);
307 log_verbose("\nTesting the functions for line\n");
308 pos
=ubrk_first(line
);
310 log_err("error ubrk_first(line) returned %d, expected 0\n", (int32_t)pos
);
311 pos
= ubrk_next(line
);
312 pos
=ubrk_following(line
, 18);
314 log_err("error ubrk_following(line) did not return 22\n");
315 log_verbose("following (line) = %d\n", (int32_t)pos
);
318 log_verbose("\nTesting the functions for sentence\n");
319 ubrk_first(sentence
);
320 pos
= ubrk_current(sentence
);
321 log_verbose("Current(sentence) = %d\n", (int32_t)pos
);
322 pos
= ubrk_last(sentence
);
324 log_err("error ubrk_last for sentence did not return 49\n");
325 log_verbose("Last (sentence) = %d\n", (int32_t)pos
);
326 ubrk_first(sentence
);
327 to
= ubrk_following( sentence
, 0 );
328 if (to
== 0) log_err("ubrk_following returned 0\n");
329 to
= ubrk_preceding( sentence
, to
);
330 if (to
!= 0) log_err("ubrk_preceding didn't return 0\n");
331 if (ubrk_first(sentence
)!=ubrk_current(sentence
)) {
332 log_err("error in ubrk_first() or ubrk_current()\n");
337 /*Testing ubrk_open and ubrk_close()*/
338 log_verbose("\nTesting open and close for us locale\n");
339 b
= ubrk_open(UBRK_WORD
, "fr_FR", text
, u_strlen(text
), &status
);
340 if (U_FAILURE(status
)) {
341 log_err("ubrk_open for word returned NULL: %s\n", myErrorName(status
));
345 /* Test setText and setUText */
347 UChar s1
[] = {0x41, 0x42, 0x20, 0};
348 UChar s2
[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0};
353 log_verbose("\nTesting ubrk_setText() and ubrk_setUText()\n");
354 status
= U_ZERO_ERROR
;
355 bb
= ubrk_open(UBRK_WORD
, "en_US", NULL
, 0, &status
);
356 TEST_ASSERT_SUCCESS(status
);
357 ubrk_setText(bb
, s1
, -1, &status
);
358 TEST_ASSERT_SUCCESS(status
);
362 ut
= utext_openUChars(ut
, s2
, -1, &status
);
363 ubrk_setUText(bb
, ut
, &status
);
364 TEST_ASSERT_SUCCESS(status
);
373 ubrk_close(sentence
);
375 ubrk_close(character
);
378 static void TestBreakIteratorSafeClone(void)
380 UChar text
[51]; /* Keep this odd to test for 64-bit memory alignment */
381 /* NOTE: This doesn't reliably force mis-alignment of following items. */
382 uint8_t buffer
[CLONETEST_ITERATOR_COUNT
] [U_BRK_SAFECLONE_BUFFERSIZE
];
383 int32_t bufferSize
= U_BRK_SAFECLONE_BUFFERSIZE
;
385 UBreakIterator
* someIterators
[CLONETEST_ITERATOR_COUNT
];
386 UBreakIterator
* someClonedIterators
[CLONETEST_ITERATOR_COUNT
];
388 UBreakIterator
* brk
;
389 UErrorCode status
= U_ZERO_ERROR
;
393 /*Testing ubrk_safeClone */
395 /* Note: the adjacent "" are concatenating strings, not adding a \" to the
396 string, which is probably what whoever wrote this intended. Don't fix,
397 because it would throw off the hard coded break positions in the following
399 u_uastrcpy(text
, "He's from Africa. ""Mr. Livingston, I presume?"" Yeah");
401 /* US & Thai - rule-based & dictionary based */
402 someIterators
[0] = ubrk_open(UBRK_WORD
, "en_US", text
, u_strlen(text
), &status
);
403 if(!someIterators
[0] || U_FAILURE(status
)) {
404 log_data_err("Couldn't open en_US word break iterator - %s\n", u_errorName(status
));
408 someIterators
[1] = ubrk_open(UBRK_WORD
, "th_TH", text
, u_strlen(text
), &status
);
409 if(!someIterators
[1] || U_FAILURE(status
)) {
410 log_data_err("Couldn't open th_TH word break iterator - %s\n", u_errorName(status
));
414 /* test each type of iterator */
415 for (i
= 0; i
< CLONETEST_ITERATOR_COUNT
; i
++)
418 /* Check the various error & informational states */
420 /* Null status - just returns NULL */
421 if (NULL
!= ubrk_safeClone(someIterators
[i
], buffer
[i
], &bufferSize
, NULL
))
423 log_err("FAIL: Cloned Iterator failed to deal correctly with null status\n");
425 /* error status - should return 0 & keep error the same */
426 status
= U_MEMORY_ALLOCATION_ERROR
;
427 if (NULL
!= ubrk_safeClone(someIterators
[i
], buffer
[i
], &bufferSize
, &status
) || status
!= U_MEMORY_ALLOCATION_ERROR
)
429 log_err("FAIL: Cloned Iterator failed to deal correctly with incoming error status\n");
431 status
= U_ZERO_ERROR
;
433 /* Null buffer size pointer is ok */
434 if (NULL
== (brk
= ubrk_safeClone(someIterators
[i
], buffer
[i
], NULL
, &status
)) || U_FAILURE(status
))
436 log_err("FAIL: Cloned Iterator failed to deal correctly with null bufferSize pointer\n");
439 status
= U_ZERO_ERROR
;
441 /* buffer size pointer is 0 - fill in pbufferSize with a size */
443 if (NULL
!= ubrk_safeClone(someIterators
[i
], buffer
[i
], &bufferSize
, &status
) ||
444 U_FAILURE(status
) || bufferSize
<= 0)
446 log_err("FAIL: Cloned Iterator failed a sizing request ('preflighting')\n");
448 /* Verify our define is large enough */
449 if (U_BRK_SAFECLONE_BUFFERSIZE
< bufferSize
)
451 log_err("FAIL: Pre-calculated buffer size is too small - %d but needed %d\n", U_BRK_SAFECLONE_BUFFERSIZE
, bufferSize
);
453 /* Verify we can use this run-time calculated size */
454 if (NULL
== (brk
= ubrk_safeClone(someIterators
[i
], buffer
[i
], &bufferSize
, &status
)) || U_FAILURE(status
))
456 log_err("FAIL: Iterator can't be cloned with run-time size\n");
460 /* size one byte too small - should allocate & let us know */
461 if (bufferSize
> 1) {
464 if (NULL
== (brk
= ubrk_safeClone(someIterators
[i
], NULL
, &bufferSize
, &status
)) || status
!= U_SAFECLONE_ALLOCATED_WARNING
)
466 log_err("FAIL: Cloned Iterator failed to deal correctly with too-small buffer size\n");
470 status
= U_ZERO_ERROR
;
471 bufferSize
= U_BRK_SAFECLONE_BUFFERSIZE
;
473 /* Null buffer pointer - return Iterator & set status to U_SAFECLONE_ALLOCATED_WARNING */
474 if (NULL
== (brk
= ubrk_safeClone(someIterators
[i
], NULL
, &bufferSize
, &status
)) || status
!= U_SAFECLONE_ALLOCATED_WARNING
)
476 log_err("FAIL: Cloned Iterator failed to deal correctly with null buffer pointer\n");
480 status
= U_ZERO_ERROR
;
482 /* Mis-aligned buffer pointer. */
484 char stackBuf
[U_BRK_SAFECLONE_BUFFERSIZE
+sizeof(void *)];
486 brk
= ubrk_safeClone(someIterators
[i
], &stackBuf
[1], &bufferSize
, &status
);
487 if (U_FAILURE(status
) || brk
== NULL
) {
488 log_err("FAIL: Cloned Iterator failed with misaligned buffer pointer\n");
490 if (status
== U_SAFECLONE_ALLOCATED_WARNING
) {
491 log_verbose("Cloned Iterator allocated when using a mis-aligned buffer.\n");
498 /* Null Iterator - return NULL & set U_ILLEGAL_ARGUMENT_ERROR */
499 if (NULL
!= ubrk_safeClone(NULL
, buffer
[i
], &bufferSize
, &status
) || status
!= U_ILLEGAL_ARGUMENT_ERROR
)
501 log_err("FAIL: Cloned Iterator failed to deal correctly with null Iterator pointer\n");
503 status
= U_ZERO_ERROR
;
505 /* Do these cloned Iterators work at all - make a first & next call */
506 bufferSize
= U_BRK_SAFECLONE_BUFFERSIZE
;
507 someClonedIterators
[i
] = ubrk_safeClone(someIterators
[i
], buffer
[i
], &bufferSize
, &status
);
509 start
= ubrk_first(someClonedIterators
[i
]);
511 log_err("error ubrk_start(clone) did not return 0\n");
512 pos
=ubrk_next(someClonedIterators
[i
]);
514 log_err("error ubrk_next(clone) did not return 4\n");
516 ubrk_close(someClonedIterators
[i
]);
517 ubrk_close(someIterators
[i
]);
524 // Open a break iterator from char * rules. Take care of conversion
525 // of the rules and error checking.
527 static UBreakIterator
* testOpenRules(char *rules
) {
528 UErrorCode status
= U_ZERO_ERROR
;
529 UChar
*ruleSourceU
= NULL
;
530 void *strCleanUp
= NULL
;
531 UParseError parseErr
;
534 ruleSourceU
= toUChar(rules
, &strCleanUp
);
536 bi
= ubrk_openRules(ruleSourceU
, -1, /* The rules */
537 NULL
, -1, /* The text to be iterated over. */
540 if (U_FAILURE(status
)) {
541 log_data_err("FAIL: ubrk_openRules: ICU Error \"%s\" (Are you missing data?)\n", u_errorName(status
));
544 freeToUCharStrings(&strCleanUp
);
550 * TestBreakIteratorRules - Verify that a break iterator can be created from
551 * a set of source rules.
553 static void TestBreakIteratorRules() {
554 /* Rules will keep together any run of letters not including 'a', OR
555 * keep together 'abc', but only when followed by 'def', OTHERWISE
556 * just return one char at a time.
558 char rules
[] = "abc/def{666};\n [\\p{L} - [a]]* {2}; . {1};";
559 /* 0123456789012345678 */
560 char data
[] = "abcdex abcdefgh-def"; /* the test data string */
561 char breaks
[] = "** ** * ** *"; /* * the expected break positions */
562 char tags
[] = "01 21 6 21 2"; /* expected tag values at break positions */
563 int32_t tagMap
[] = {0, 1, 2, 3, 4, 5, 666};
566 void *freeHook
= NULL
;
567 UErrorCode status
= U_ZERO_ERROR
;
571 UBreakIterator
*bi
= testOpenRules(rules
);
572 if (bi
== NULL
) {return;}
573 uData
= toUChar(data
, &freeHook
);
574 ubrk_setText(bi
, uData
, -1, &status
);
576 pos
= ubrk_first(bi
);
577 for (i
=0; i
<sizeof(breaks
); i
++) {
578 if (pos
== i
&& breaks
[i
] != '*') {
579 log_err("FAIL: unexpected break at position %d found\n", pos
);
582 if (pos
!= i
&& breaks
[i
] == '*') {
583 log_err("FAIL: expected break at position %d not found.\n", i
);
587 int32_t tag
, expectedTag
;
588 tag
= ubrk_getRuleStatus(bi
);
589 expectedTag
= tagMap
[tags
[i
]&0xf];
590 if (tag
!= expectedTag
) {
591 log_err("FAIL: incorrect tag value. Position = %d; expected tag %d, got %d",
592 pos
, expectedTag
, tag
);
599 /* #12914 add basic sanity test for ubrk_getBinaryRules, ubrk_openBinaryRules */
600 /* Underlying functionality checked in C++ rbbiapts.cpp TestRoundtripRules */
601 status
= U_ZERO_ERROR
;
602 int32_t rulesLength
= ubrk_getBinaryRules(bi
, NULL
, 0, &status
); /* preflight */
603 if (U_FAILURE(status
)) {
604 log_err("FAIL: ubrk_getBinaryRules preflight err: %s", u_errorName(status
));
606 uint8_t* binaryRules
= (uint8_t*)uprv_malloc(rulesLength
);
607 if (binaryRules
== NULL
) {
608 log_err("FAIL: unable to malloc rules buffer, size %u", rulesLength
);
610 rulesLength
= ubrk_getBinaryRules(bi
, binaryRules
, rulesLength
, &status
);
611 if (U_FAILURE(status
)) {
612 log_err("FAIL: ubrk_getBinaryRules err: %s", u_errorName(status
));
614 UBreakIterator
* bi2
= ubrk_openBinaryRules(binaryRules
, rulesLength
, uData
, -1, &status
);
615 if (U_FAILURE(status
)) {
616 log_err("FAIL: ubrk_openBinaryRules err: %s", u_errorName(status
));
618 int32_t maxCount
= sizeof(breaks
); /* fail-safe test limit */
619 int32_t pos2
= ubrk_first(bi2
);
620 pos
= ubrk_first(bi
);
623 log_err("FAIL: interator from ubrk_openBinaryRules does not match original, get pos = %d instead of %d", pos2
, pos
);
625 pos2
= ubrk_next(bi2
);
627 } while ((pos
!= UBRK_DONE
|| pos2
!= UBRK_DONE
) && maxCount
-- > 0);
632 uprv_free(binaryRules
);
636 freeToUCharStrings(&freeHook
);
640 static void TestBreakIteratorRuleError() {
642 * TestBreakIteratorRuleError - Try to create a BI from rules with syntax errors,
643 * check that the error is reported correctly.
645 char rules
[] = " # This is a rule comment on line 1\n"
646 "[:L:]; # this rule is OK.\n"
647 "abcdefg); # Error, mismatched parens\n";
649 void *freeHook
= NULL
;
650 UErrorCode status
= U_ZERO_ERROR
;
651 UParseError parseErr
;
654 uRules
= toUChar(rules
, &freeHook
);
655 bi
= ubrk_openRules(uRules
, -1, /* The rules */
656 NULL
, -1, /* The text to be iterated over. */
658 if (U_SUCCESS(status
)) {
659 log_err("FAIL: construction of break iterator succeeded when it should have failed.\n");
662 if (parseErr
.line
!= 3 || parseErr
.offset
!= 8) {
663 log_data_err("FAIL: incorrect error position reported. Got line %d, char %d, expected line 3, char 7 (Are you missing data?)\n",
664 parseErr
.line
, parseErr
.offset
);
667 freeToUCharStrings(&freeHook
);
672 * TestBreakIteratorStatusVec() Test the ubrk_getRuleStatusVec() function
674 static void TestBreakIteratorStatusVec() {
675 #define RULE_STRING_LENGTH 200
676 UChar rules
[RULE_STRING_LENGTH
];
678 #define TEST_STRING_LENGTH 25
679 UChar testString
[TEST_STRING_LENGTH
];
680 UBreakIterator
*bi
= NULL
;
684 UErrorCode status
= U_ZERO_ERROR
;
686 u_uastrncpy(rules
, "[A-N]{100}; \n"
691 "!.*;\n", RULE_STRING_LENGTH
);
692 u_uastrncpy(testString
, "ABC", TEST_STRING_LENGTH
);
695 bi
= ubrk_openRules(rules
, -1, testString
, -1, NULL
, &status
);
696 TEST_ASSERT_SUCCESS(status
);
697 TEST_ASSERT(bi
!= NULL
);
699 /* The TEST_ASSERT above should change too... */
702 TEST_ASSERT(pos
== 1);
704 memset(vals
, -1, sizeof(vals
));
705 numVals
= ubrk_getRuleStatusVec(bi
, vals
, 10, &status
);
706 TEST_ASSERT_SUCCESS(status
);
707 TEST_ASSERT(numVals
== 2);
708 TEST_ASSERT(vals
[0] == 100);
709 TEST_ASSERT(vals
[1] == 300);
710 TEST_ASSERT(vals
[2] == -1);
712 numVals
= ubrk_getRuleStatusVec(bi
, vals
, 0, &status
);
713 TEST_ASSERT(status
== U_BUFFER_OVERFLOW_ERROR
);
714 TEST_ASSERT(numVals
== 2);
722 * static void TestBreakIteratorUText(void);
724 * Test that ubrk_setUText() is present and works for a simple case.
726 static void TestBreakIteratorUText(void) {
727 const char *UTF8Str
= "\x41\xc3\x85\x5A\x20\x41\x52\x69\x6E\x67"; /* c3 85 is utf-8 for A with a ring on top */
730 UErrorCode status
= U_ZERO_ERROR
;
731 UBreakIterator
*bi
= NULL
;
735 UText
*ut
= utext_openUTF8(NULL
, UTF8Str
, -1, &status
);
736 TEST_ASSERT_SUCCESS(status
);
738 bi
= ubrk_open(UBRK_WORD
, "en_US", NULL
, 0, &status
);
739 if (U_FAILURE(status
)) {
740 log_err_status(status
, "Failure at file %s, line %d, error = %s\n", __FILE__
, __LINE__
, u_errorName(status
));
744 ubrk_setUText(bi
, ut
, &status
);
745 if (U_FAILURE(status
)) {
746 log_err("Failure at file %s, line %d, error = %s\n", __FILE__
, __LINE__
, u_errorName(status
));
750 pos
= ubrk_first(bi
);
751 TEST_ASSERT(pos
== 0);
754 TEST_ASSERT(pos
== 4);
757 TEST_ASSERT(pos
== 5);
760 TEST_ASSERT(pos
== 10);
763 TEST_ASSERT(pos
== UBRK_DONE
);
769 * static void TestBreakIteratorTailoring(void);
771 * Test break iterator tailorings from CLDR data.
774 /* Thai/Lao grapheme break tailoring */
/* NUL-terminated test text: Thai-block code points (U+0Exx) separated by
 * U+0020; 10 code units before the terminator. tailoringTests pairs it with
 * UBRK_CHARACTER for the "en" and "en_US_POSIX" locales. */
775 static const UChar thTest
[] = { 0x0020, 0x0E40, 0x0E01, 0x0020,
776 0x0E01, 0x0E30, 0x0020, 0x0E01, 0x0E33, 0x0020, 0 };
777 /*in Unicode 6.1 en should behave just like th for this*/
778 /*static const int32_t thTestOffs_enFwd[] = { 1, 3, 4, 6, 7, 9, 10 };*/
/* Offsets compared against successive ubrk_next() results for thTest; the
 * last entry (10) is the end of the text. */
779 static const int32_t thTestOffs_thFwd
[] = { 1, 2, 3, 4, 5, 6, 7, 9, 10 };
780 /*static const int32_t thTestOffs_enRev[] = { 9, 7, 6, 4, 3, 1, 0 };*/
/* Offsets compared against successive ubrk_previous() results, walking back
 * from the end of thTest down to 0. */
781 static const int32_t thTestOffs_thRev
[] = { 9, 7, 6, 5, 4, 3, 2, 1, 0 };
783 /* Hebrew line break tailoring, for cldrbug 3028 */
/* NUL-terminated test text (24 code units): space-separated groups that join
 * digits, Latin letters or Hebrew letters with U+002D or U+2010. tailoringTests
 * uses it with UBRK_LINE for both "en" and "he", with the same he offsets. */
784 static const UChar heTest
[] = { 0x0020, 0x002D, 0x0031, 0x0032, 0x0020,
785 0x0061, 0x002D, 0x006B, 0x0020,
786 0x0061, 0x0300, 0x2010, 0x006B, 0x0020,
787 0x05DE, 0x05D4, 0x002D, 0x0069, 0x0020,
788 0x05D1, 0x05BC, 0x2010, 0x0047, 0x0020, 0 };
789 /*in Unicode 6.1 en should behave just like he for this*/
790 /*static const int32_t heTestOffs_enFwd[] = { 1, 5, 7, 9, 12, 14, 17, 19, 22, 24 };*/
/* Forward (ubrk_next) expectations; unlike the retired en list above there is
 * no break at 17 or 22, i.e. none after the hyphen that follows a Hebrew letter. */
791 static const int32_t heTestOffs_heFwd
[] = { 1, 5, 7, 9, 12, 14, 19, 24 };
792 /*static const int32_t heTestOffs_enRev[] = { 22, 19, 17, 14, 12, 9, 7, 5, 1, 0 };*/
/* Reverse (ubrk_previous) expectations, from the end of heTest back to 0. */
793 static const int32_t heTestOffs_heRev
[] = { 19, 14, 12, 9, 7, 5, 1, 0 };
795 /* Finnish line break tailoring, for cldrbug 3029 */
/* NUL-terminated test text (25 code units); the inline markers give the index
 * of the first code unit on each row. tailoringTests runs it with UBRK_LINE
 * twice: "en" with the en offsets and "fi" with the fi offsets. */
796 static const UChar fiTest
[] = { /* 00 */ 0x0020, 0x002D, 0x0031, 0x0032, 0x0020,
797 /* 05 */ 0x0061, 0x002D, 0x006B, 0x0020,
798 /* 09 */ 0x0061, 0x0300, 0x2010, 0x006B, 0x0020,
799 /* 14 */ 0x0061, 0x0020, 0x002D, 0x006B, 0x0020,
800 /* 19 */ 0x0061, 0x0300, 0x0020, 0x2010, 0x006B, 0x0020, 0 };
/* Forward expectations for "en": includes breaks at 17 and 23, directly after
 * a hyphen that itself follows a space. */
801 static const int32_t fiTestOffs_enFwd
[] = { 1, 5, 7, 9, 12, 14, 16, 17, 19, 22, 23, 25 };
/* Forward expectations for "fi": same as en except that 17 and 23 are absent. */
802 static const int32_t fiTestOffs_fiFwd
[] = { 1, 5, 7, 9, 12, 14, 16, 19, 22, 25 };
/* Reverse (ubrk_previous) expectations for "en". */
803 static const int32_t fiTestOffs_enRev
[] = { 23, 22, 19, 17, 16, 14, 12, 9, 7, 5, 1, 0 };
/* Reverse (ubrk_previous) expectations for "fi". */
804 static const int32_t fiTestOffs_fiRev
[] = { 22, 19, 16, 14, 12, 9, 7, 5, 1, 0 };
806 /* Khmer dictionary-based word break, for ICU ticket #8329 */
/* NUL-terminated run of 51 Khmer-block code points with no spaces; the inline
 * markers give the index of the first code unit on each row. tailoringTests
 * uses it with UBRK_WORD for the "km" locale. */
807 static const UChar kmTest
[] = { /* 00 */ 0x179F, 0x17BC, 0x1798, 0x1785, 0x17C6, 0x178E, 0x17B6, 0x1799, 0x1796, 0x17C1,
808 /* 10 */ 0x179B, 0x1794, 0x1793, 0x17D2, 0x178F, 0x17B7, 0x1785, 0x178A, 0x17BE, 0x1798,
809 /* 20 */ 0x17D2, 0x1794, 0x17B8, 0x17A2, 0x1792, 0x17B7, 0x179F, 0x17D2, 0x178B, 0x17B6,
810 /* 30 */ 0x1793, 0x17A2, 0x179A, 0x1796, 0x17D2, 0x179A, 0x17C7, 0x1782, 0x17BB, 0x178E,
811 /* 40 */ 0x178A, 0x179B, 0x17CB, 0x1796, 0x17D2, 0x179A, 0x17C7, 0x17A2, 0x1784, 0x17D2,
812 /* 50 */ 0x1782, 0 };
/* Forward (ubrk_next) expectations; offsets 8 and 33 are deliberately commented
 * out of the list, so a break at either position is reported as a mismatch. */
813 static const int32_t kmTestOffs_kmFwd
[] = { 3, /*8,*/ 11, 17, 23, 31, /*33,*/ 40, 43, 51 }; /* TODO: Investigate failure to break at offset 8 */
/* Reverse (ubrk_previous) expectations, with the same two offsets left out. */
814 static const int32_t kmTestOffs_kmRev
[] = { 43, 40, /*33,*/ 31, 23, 17, 11, /*8,*/ 3, 0 };
817 /* Korean keepAll vs Normal */
/* NUL-terminated test text (28 code units): Hangul syllables in groups that are
 * separated by U+0020. tailoringTests runs it with UBRK_LINE for "ko",
 * "ko@lw=keepall" (both with the Keep offsets) and "ko@lw=normal" (Norm offsets). */
818 static const UChar koTest
[] = { /* 00 */ 0xBAA8, 0xB4E0, 0x0020, 0xC778, 0xB958, 0x0020, 0xAD6C, 0xC131, 0xC6D0, 0xC758,
819 /* 10 */ 0x0020, 0xCC9C, 0xBD80, 0xC758, 0x0020, 0xC874, 0xC5C4, 0xC131, 0xACFC, 0x0020,
820 /* 20 */ 0xB3D9, 0xB4F1, 0xD558, 0xACE0, 0x0020, 0xC591, 0xB3C4, 0xD560, 0 };
/* Keep-all forward expectations: one break after each space, plus the end (28). */
821 static const int32_t koTestOffs_koKeepFwd
[] = { 3, 6, 11, 15, 20, 25, 28 };
/* Keep-all reverse (ubrk_previous) expectations. */
822 static const int32_t koTestOffs_koKeepRev
[] = { 25, 20, 15, 11, 6, 3, 0 };
/* Normal forward expectations: additionally breaks between adjacent syllables,
 * but never directly before a space (e.g. no entry 2, 5 or 10). */
823 static const int32_t koTestOffs_koNormFwd
[] = { 1, 3, 4, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 20, 21, 22, 23, 25, 26, 27, 28 };
/* Normal reverse (ubrk_previous) expectations. */
824 static const int32_t koTestOffs_koNormRev
[] = { 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 13, 12, 11, 9, 8, 7, 6, 4, 3, 1, 0 };
828 UBreakIteratorType type
;
830 const int32_t * offsFwd
;
831 const int32_t * offsRev
;
835 static const RBBITailoringTest tailoringTests
[] = {
836 { "en", UBRK_CHARACTER
, thTest
, thTestOffs_thFwd
, thTestOffs_thRev
, UPRV_LENGTHOF(thTestOffs_thFwd
) },
837 { "en_US_POSIX", UBRK_CHARACTER
, thTest
, thTestOffs_thFwd
, thTestOffs_thRev
, UPRV_LENGTHOF(thTestOffs_thFwd
) },
838 { "en", UBRK_LINE
, heTest
, heTestOffs_heFwd
, heTestOffs_heRev
, UPRV_LENGTHOF(heTestOffs_heFwd
) },
839 { "he", UBRK_LINE
, heTest
, heTestOffs_heFwd
, heTestOffs_heRev
, UPRV_LENGTHOF(heTestOffs_heFwd
) },
840 { "en", UBRK_LINE
, fiTest
, fiTestOffs_enFwd
, fiTestOffs_enRev
, UPRV_LENGTHOF(fiTestOffs_enFwd
) },
841 { "fi", UBRK_LINE
, fiTest
, fiTestOffs_fiFwd
, fiTestOffs_fiRev
, UPRV_LENGTHOF(fiTestOffs_fiFwd
) },
842 { "km", UBRK_WORD
, kmTest
, kmTestOffs_kmFwd
, kmTestOffs_kmRev
, UPRV_LENGTHOF(kmTestOffs_kmFwd
) },
843 { "ko", UBRK_LINE
, koTest
, koTestOffs_koKeepFwd
, koTestOffs_koKeepRev
, UPRV_LENGTHOF(koTestOffs_koKeepFwd
) },
844 { "ko@lw=keepall", UBRK_LINE
, koTest
, koTestOffs_koKeepFwd
, koTestOffs_koKeepRev
, UPRV_LENGTHOF(koTestOffs_koKeepFwd
) },
845 { "ko@lw=normal", UBRK_LINE
, koTest
, koTestOffs_koNormFwd
, koTestOffs_koNormRev
, UPRV_LENGTHOF(koTestOffs_koNormFwd
) },
846 { NULL
, 0, NULL
, NULL
, NULL
, 0 },
849 static void TestBreakIteratorTailoring(void) {
850 const RBBITailoringTest
* testPtr
;
851 for (testPtr
= tailoringTests
; testPtr
->locale
!= NULL
; ++testPtr
) {
852 UErrorCode status
= U_ZERO_ERROR
;
853 UBreakIterator
* ubrkiter
= ubrk_open(testPtr
->type
, testPtr
->locale
, testPtr
->test
, -1, &status
);
854 if ( U_SUCCESS(status
) ) {
855 int32_t offset
, offsindx
;
859 ubrk_first(ubrkiter
);
860 for (offsindx
= 0; (offset
= ubrk_next(ubrkiter
)) != UBRK_DONE
; ++offsindx
) {
861 if (!foundError
&& offsindx
>= testPtr
->numOffsets
) {
862 log_err("FAIL: locale %s, break type %d, ubrk_next expected UBRK_DONE, got %d\n",
863 testPtr
->locale
, testPtr
->type
, offset
);
865 } else if (!foundError
&& offset
!= testPtr
->offsFwd
[offsindx
]) {
866 log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got %d\n",
867 testPtr
->locale
, testPtr
->type
, testPtr
->offsFwd
[offsindx
], offset
);
871 if (!foundError
&& offsindx
< testPtr
->numOffsets
) {
872 log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
873 testPtr
->locale
, testPtr
->type
, testPtr
->offsFwd
[offsindx
]);
878 for (offsindx
= 0; (offset
= ubrk_previous(ubrkiter
)) != UBRK_DONE
; ++offsindx
) {
879 if (!foundError
&& offsindx
>= testPtr
->numOffsets
) {
880 log_err("FAIL: locale %s, break type %d, ubrk_previous expected UBRK_DONE, got %d\n",
881 testPtr
->locale
, testPtr
->type
, offset
);
883 } else if (!foundError
&& offset
!= testPtr
->offsRev
[offsindx
]) {
884 log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got %d\n",
885 testPtr
->locale
, testPtr
->type
, testPtr
->offsRev
[offsindx
], offset
);
889 if (!foundError
&& offsindx
< testPtr
->numOffsets
) {
890 log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
891 testPtr
->locale
, testPtr
->type
, testPtr
->offsRev
[offsindx
]);
894 ubrk_close(ubrkiter
);
896 log_err_status(status
, "FAIL: locale %s, break type %d, ubrk_open status: %s\n", testPtr
->locale
, testPtr
->type
, u_errorName(status
));
902 static void TestBreakIteratorRefresh(void) {
904 * RefreshInput changes out the input of a Break Iterator without
905 * changing anything else in the iterator's state. Used with Java JNI,
906 * when Java moves the underlying string storage. This test
907 * runs a ubrk_next() repeatedly, moving the text in the middle of the sequence.
908 * The right set of boundaries should still be found.
910 UChar testStr
[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */
911 UChar movedStr
[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0};
912 UErrorCode status
= U_ZERO_ERROR
;
914 UText ut1
= UTEXT_INITIALIZER
;
915 UText ut2
= UTEXT_INITIALIZER
;
917 bi
= ubrk_open(UBRK_LINE
, "en_US", NULL
, 0, &status
);
918 TEST_ASSERT_SUCCESS(status
);
919 if (U_FAILURE(status
)) {
923 utext_openUChars(&ut1
, testStr
, -1, &status
);
924 TEST_ASSERT_SUCCESS(status
);
925 ubrk_setUText(bi
, &ut1
, &status
);
926 TEST_ASSERT_SUCCESS(status
);
928 if (U_SUCCESS(status
)) {
929 /* Line boundaries will occur before each letter in the original string */
930 TEST_ASSERT(1 == ubrk_next(bi
));
931 TEST_ASSERT(3 == ubrk_next(bi
));
933 /* Move the string, kill the original string. */
934 u_strcpy(movedStr
, testStr
);
935 u_memset(testStr
, 0x20, u_strlen(testStr
));
936 utext_openUChars(&ut2
, movedStr
, -1, &status
);
937 TEST_ASSERT_SUCCESS(status
);
938 ubrk_refreshUText(bi
, &ut2
, &status
);
939 TEST_ASSERT_SUCCESS(status
);
941 /* Find the following matches, now working in the moved string. */
942 TEST_ASSERT(5 == ubrk_next(bi
));
943 TEST_ASSERT(7 == ubrk_next(bi
));
944 TEST_ASSERT(8 == ubrk_next(bi
));
945 TEST_ASSERT(UBRK_DONE
== ubrk_next(bi
));
946 TEST_ASSERT_SUCCESS(status
);
955 static void TestBug11665(void) {
956 // The problem was with the incorrect breaking of Japanese text beginning
957 // with Katakana characters when no prior Japanese or Chinese text had been
960 // Tested here in cintltst, rather than in intltest, because only cintltst
961 // tests have the ability to reset ICU, which is needed to get the bug
962 // to manifest itself.
964 static UChar japaneseText
[] = {0x30A2, 0x30EC, 0x30EB, 0x30AE, 0x30FC, 0x6027, 0x7D50, 0x819C, 0x708E};
965 int32_t boundaries
[10] = {0};
966 UBreakIterator
*bi
= NULL
;
969 int32_t totalBreaks
= 0;
970 UErrorCode status
= U_ZERO_ERROR
;
973 bi
= ubrk_open(UBRK_WORD
, "en_US", japaneseText
, UPRV_LENGTHOF(japaneseText
), &status
);
974 TEST_ASSERT_SUCCESS(status
);
978 for (brk
=ubrk_first(bi
); brk
!= UBRK_DONE
; brk
=ubrk_next(bi
)) {
979 boundaries
[brkIdx
] = brk
;
980 if (++brkIdx
>= UPRV_LENGTHOF(boundaries
) - 1) {
984 if (brkIdx
<= 2 || brkIdx
>= UPRV_LENGTHOF(boundaries
)) {
985 log_err("%s:%d too few or many breaks found.\n", __FILE__
, __LINE__
);
987 totalBreaks
= brkIdx
;
989 for (brk
=ubrk_first(bi
); brk
!= UBRK_DONE
; brk
=ubrk_next(bi
)) {
990 if (brk
!= boundaries
[brkIdx
]) {
991 log_err("%s:%d Break #%d differs between first and second iteration.\n", __FILE__
, __LINE__
, brkIdx
);
994 if (++brkIdx
>= UPRV_LENGTHOF(boundaries
) - 1) {
995 log_err("%s:%d Too many breaks.\n", __FILE__
, __LINE__
);
999 if (totalBreaks
!= brkIdx
) {
1000 log_err("%s:%d Number of breaks differ between first and second iteration.\n", __FILE__
, __LINE__
);
/*
 * Test data for TestBreakIteratorSuppressions.
 *
 * expOffset is the set of expected offsets, ending with '-1'.
 * "Expected expOffset -1" means "expected the end of the offsets"
 *
 * Forward lists hold the boundaries expected after the start of the text;
 * reverse lists hold the boundaries expected before the end of the text.
 */

static const char testSentenceSuppressionsEn[]  = "Mr. Jones comes home. Dr. Smith Ph.D. is out. In the U.S.A. it is hot.";
static const int32_t testSentSuppFwdOffsetsEn[] = { 22, 46, 70, -1 };        /* With suppressions */
static const int32_t testSentFwdOffsetsEn[]     = { 4, 22, 26, 46, 70, -1 }; /* Without suppressions */
static const int32_t testSentSuppRevOffsetsEn[] = { 46, 22, 0, -1 };         /* With suppressions */
static const int32_t testSentRevOffsetsEn[]     = { 46, 26, 22, 4, 0, -1 };  /* Without suppressions */

static const char testSentenceSuppressionsDe[]  = "Wenn ich schon h\\u00F6re zu Guttenberg kommt evtl. zur\\u00FCck.";
                                               // "Wenn ich schon höre zu Guttenberg kommt evtl. zurück."
static const int32_t testSentSuppFwdOffsetsDe[] = { 53, -1 }; /* With suppressions */
static const int32_t testSentFwdOffsetsDe[]     = { 53, -1 }; /* Without suppressions; no break in evtl. zur due to casing */
static const int32_t testSentSuppRevOffsetsDe[] = { 0, -1 };  /* With suppressions */
static const int32_t testSentRevOffsetsDe[]     = { 0, -1 };  /* Without suppressions */

static const char testSentenceSuppressionsEs[]  = "Te esperamos todos los miercoles en Bravo 416, Col. El Pueblo a las 7 PM.";
static const int32_t testSentSuppFwdOffsetsEs[] = { 73, -1 };     /* With suppressions */
static const int32_t testSentFwdOffsetsEs[]     = { 52, 73, -1 }; /* Without suppressions */
static const int32_t testSentSuppRevOffsetsEs[] = { 0, -1 };      /* With suppressions */
static const int32_t testSentRevOffsetsEs[]     = { 52, 0, -1 };  /* Without suppressions */

static const char testSentenceSuppressionsE1[]  = "Add or detract. The world will little note.";
static const char testSentenceSuppressionsE1u[] = "ADD OR DETRACT. THE WORLD WILL LITTLE NOTE.";
static const int32_t testSentFwdOffsetsE1[]     = { 16, 43, -1 }; /* Suppressions and case should make no difference */
static const int32_t testSentRevOffsetsE1[]     = { 16, 0, -1 };  /* Suppressions and case should make no difference */

static const char testSentenceSuppressionsE2[]  = "Coming up, the sprints at NCAA. Are you watching?";
static const char testSentenceSuppressionsE2u[] = "COMING UP, THE SPRINTS AT NCAA. ARE YOU WATCHING?";
static const int32_t testSentFwdOffsetsE2[]     = { 32, 49, -1 }; /* Suppressions and case should make no difference */
static const int32_t testSentRevOffsetsE2[]     = { 32, 0, -1 };  /* Suppressions and case should make no difference */

static const char testSentenceSuppressionsFr[]  = "Tr\\u00E8s bonne prise de parole de M. Junod, municipal \\u00E0 la culture de Lausanne.";
                                               // "Très bonne prise de parole de M. Junod, municipal à la culture de Lausanne."
static const int32_t testSentFwdOffsetsFr[]     = { 33, 75, -1 }; /* Without suppressions */
static const int32_t testSentSuppFwdOffsetsFr[] = { 75, -1 };     /* With suppressions */
static const int32_t testSentRevOffsetsFr[]     = { 33, 0, -1 };  /* Without suppressions */
static const int32_t testSentSuppRevOffsetsFr[] = { 0, -1 };      /* With suppressions */

static const char testSentenceSuppressionsE3[]  = "G8 countries e.g. U.K., Japan. Sanctions i.e. restrictions. Test E. Xx G. Xx I. Xx.";
static const char testSentenceSuppressionsE3u[] = "G8 COUNTRIES E.G. U.K., JAPAN. SANCTIONS I.E. RESTRICTIONS. TEST E. XX G. XX I. XX.";
static const int32_t testSentSuppFwdOffsetsE3[] = { 31, 60, 83, -1 };                    /* With suppressions */
static const int32_t testSentSuppRevOffsetsE3[] = { 60, 31, 0, -1 };                     /* With suppressions */
static const int32_t testSentFwdOffsetsE3[]     = { 18, 31, 60, 68, 74, 80, 83, -1 };     /* Without suppressions */
static const int32_t testSentRevOffsetsE3[]     = { 80, 74, 68, 60, 31, 18, 0, -1 };      /* Without suppressions */
static const int32_t testSentFwdOffsetsE3u[]    = { 18, 31, 46, 60, 68, 74, 80, 83, -1 }; /* Without suppressions */
static const int32_t testSentRevOffsetsE3u[]    = { 80, 74, 68, 60, 46, 31, 18, 0, -1 };  /* Without suppressions */

/* Buffer capacities used by TestBreakIteratorSuppressions:
 * UChar units for the unescaped text, bytes for its char copy used in messages. */
enum { kTextULenMax = 128, kTextBLenMax = 192 };

/* One sentence-break suppression test case. */
typedef struct {
    const char * locale;            /* locale ID handed to ubrk_open; NULL terminates the table */
    const char * text;              /* text to break, passed through u_unescape before use */
    const int32_t * expFwdOffsets;  /* expected forward boundaries, terminated by -1 */
    const int32_t * expRevOffsets;  /* expected reverse boundaries, terminated by -1 */
} TestBISuppressionsItem;

1065 static const TestBISuppressionsItem testBISuppressionsItems
[] = {
1066 { "en@ss=standard", testSentenceSuppressionsEn
, testSentSuppFwdOffsetsEn
, testSentSuppRevOffsetsEn
},
1067 { "en", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
},
1068 { "en_CA", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
},
1069 { "en_CA@ss=standard", testSentenceSuppressionsEn
, testSentSuppFwdOffsetsEn
, testSentSuppRevOffsetsEn
},
1070 { "fr@ss=standard", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
},
1071 { "af@ss=standard", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
}, /* no brkiter data => nosuppressions? */
1072 { "af_ZA@ss=standard", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
}, /* no brkiter data => nosuppressions? */
1073 { "zh@ss=standard", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
}, /* brkiter data, no suppressions data => no suppressions */
1074 { "zh_Hant@ss=standard", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
}, /* brkiter data, no suppressions data => no suppressions */
1075 { "fi@ss=standard", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
}, /* brkiter data, no suppressions data => no suppressions */
1076 { "ja@ss=standard", testSentenceSuppressionsEn
, testSentFwdOffsetsEn
, testSentRevOffsetsEn
}, /* brkiter data, no suppressions data => no suppressions */
1077 { "de@ss=standard", testSentenceSuppressionsDe
, testSentSuppFwdOffsetsDe
, testSentSuppRevOffsetsDe
},
1078 { "de", testSentenceSuppressionsDe
, testSentFwdOffsetsDe
, testSentRevOffsetsDe
},
1079 { "es@ss=standard", testSentenceSuppressionsEs
, testSentSuppFwdOffsetsEs
, testSentSuppRevOffsetsEs
},
1080 { "es", testSentenceSuppressionsEs
, testSentFwdOffsetsEs
, testSentRevOffsetsEs
},
1081 { "en", testSentenceSuppressionsE1
, testSentFwdOffsetsE1
, testSentRevOffsetsE1
},
1082 { "en@ss=standard", testSentenceSuppressionsE1
, testSentFwdOffsetsE1
, testSentRevOffsetsE1
},
1083 { "en", testSentenceSuppressionsE1u
, testSentFwdOffsetsE1
, testSentRevOffsetsE1
},
1084 { "en@ss=standard", testSentenceSuppressionsE1u
, testSentFwdOffsetsE1
, testSentRevOffsetsE1
},
1085 { "en", testSentenceSuppressionsE2
, testSentFwdOffsetsE2
, testSentRevOffsetsE2
},
1086 { "en@ss=standard", testSentenceSuppressionsE2
, testSentFwdOffsetsE2
, testSentRevOffsetsE2
},
1087 { "en", testSentenceSuppressionsE2u
, testSentFwdOffsetsE2
, testSentRevOffsetsE2
},
1088 { "en@ss=standard", testSentenceSuppressionsE2u
, testSentFwdOffsetsE2
, testSentRevOffsetsE2
},
1089 { "fr", testSentenceSuppressionsFr
, testSentFwdOffsetsFr
, testSentRevOffsetsFr
},
1090 { "fr@ss=standard", testSentenceSuppressionsFr
, testSentSuppFwdOffsetsFr
, testSentSuppRevOffsetsFr
},
1091 { "en@ss=standard", testSentenceSuppressionsE3
, testSentSuppFwdOffsetsE3
, testSentSuppRevOffsetsE3
},
1092 { "en", testSentenceSuppressionsE3
, testSentFwdOffsetsE3
, testSentRevOffsetsE3
},
1093 { "en@ss=standard", testSentenceSuppressionsE3u
, testSentSuppFwdOffsetsE3
, testSentSuppRevOffsetsE3
},
1094 { "en", testSentenceSuppressionsE3u
, testSentFwdOffsetsE3u
, testSentRevOffsetsE3u
},
1095 { NULL
, NULL
, NULL
}
1098 static void TestBreakIteratorSuppressions(void) {
1099 const TestBISuppressionsItem
* itemPtr
;
1101 for (itemPtr
= testBISuppressionsItems
; itemPtr
->locale
!= NULL
; itemPtr
++) {
1102 UChar textU
[kTextULenMax
+ 1];
1103 char textB
[kTextBLenMax
];
1104 int32_t textULen
= u_unescape(itemPtr
->text
, textU
, kTextULenMax
);
1105 textU
[kTextULenMax
] = 0; // ensure zero termination
1106 UErrorCode status
= U_ZERO_ERROR
;
1107 UBreakIterator
*bi
= ubrk_open(UBRK_SENTENCE
, itemPtr
->locale
, textU
, textULen
, &status
);
1108 log_verbose("#%d: %s\n", (itemPtr
-testBISuppressionsItems
), itemPtr
->locale
);
1109 if (U_SUCCESS(status
)) {
1110 int32_t offset
, start
;
1111 const int32_t * expOffsetPtr
;
1112 const int32_t * expOffsetStart
;
1113 u_austrcpy(textB
, textU
);
1115 expOffsetStart
= expOffsetPtr
= itemPtr
->expFwdOffsets
;
1117 for (; (offset
= ubrk_next(bi
)) != UBRK_DONE
&& *expOffsetPtr
>= 0; expOffsetPtr
++) {
1118 if (offset
!= *expOffsetPtr
) {
1119 log_err("FAIL: ubrk_next loc \"%s\", expected %d, got %d, text \"%s\"\n",
1120 itemPtr
->locale
, *expOffsetPtr
, offset
, textB
);
1123 if (offset
!= UBRK_DONE
|| *expOffsetPtr
>= 0) {
1124 log_err("FAIL: ubrk_next loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d, text \"%s\"\n",
1125 itemPtr
->locale
, offset
, *expOffsetPtr
, textB
);
1128 expOffsetStart
= expOffsetPtr
= itemPtr
->expFwdOffsets
;
1129 start
= ubrk_first(bi
) + 1;
1130 for (; (offset
= ubrk_following(bi
, start
)) != UBRK_DONE
&& *expOffsetPtr
>= 0; expOffsetPtr
++) {
1131 if (offset
!= *expOffsetPtr
) {
1132 log_err("FAIL: ubrk_following(%d) loc \"%s\", expected %d, got %d, text \"%s\"\n",
1133 start
, itemPtr
->locale
, *expOffsetPtr
, offset
, textB
);
1135 start
= *expOffsetPtr
+ 1;
1137 if (offset
!= UBRK_DONE
|| *expOffsetPtr
>= 0) {
1138 log_err("FAIL: ubrk_following(%d) loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d, text \"%s\"\n",
1139 start
, itemPtr
->locale
, offset
, *expOffsetPtr
, textB
);
1142 expOffsetStart
= expOffsetPtr
= itemPtr
->expRevOffsets
;
1143 offset
= ubrk_last(bi
);
1144 log_verbose("___ @%d ubrk_last\n", offset
);
1146 log_err("FAIL: ubrk_last loc \"%s\" unexpected %d\n", itemPtr
->locale
, offset
);
1148 for (; (offset
= ubrk_previous(bi
)) != UBRK_DONE
&& *expOffsetPtr
>= 0; expOffsetPtr
++) {
1149 if (offset
!= *expOffsetPtr
) {
1150 log_err("FAIL: ubrk_previous loc \"%s\", expected %d, got %d, text \"%s\"\n",
1151 itemPtr
->locale
, *expOffsetPtr
, offset
, textB
);
1153 log_verbose("[%d] @%d ubrk_previous()\n", (expOffsetPtr
- expOffsetStart
), offset
);
1156 if (offset
!= UBRK_DONE
|| *expOffsetPtr
>= 0) {
1157 log_err("FAIL: ubrk_previous loc \"%s\", expected UBRK_DONE & expOffset[%d] -1, got %d and %d, text \"%s\"\n",
1158 itemPtr
->locale
, expOffsetPtr
- expOffsetStart
, offset
, *expOffsetPtr
, textB
);
1161 expOffsetStart
= expOffsetPtr
= itemPtr
->expRevOffsets
;
1162 start
= ubrk_last(bi
) - 1;
1163 for (; (offset
= ubrk_preceding(bi
, start
)) != UBRK_DONE
&& *expOffsetPtr
>= 0; expOffsetPtr
++) {
1164 if (offset
!= *expOffsetPtr
) {
1165 log_err("FAIL: ubrk_preceding(%d) loc \"%s\", expected %d, got %d, text \"%s\"\n",
1166 start
, itemPtr
->locale
, *expOffsetPtr
, offset
, textB
);
1168 start
= *expOffsetPtr
- 1;
1170 if (start
>=0 && (offset
!= UBRK_DONE
|| *expOffsetPtr
>= 0)) {
1171 log_err("FAIL: ubrk_preceding loc(%d) \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d, text \"%s\"\n",
1172 start
, itemPtr
->locale
, offset
, *expOffsetPtr
, textB
);
1177 log_data_err("FAIL: ubrk_open(UBRK_SENTENCE, \"%s\", ...) status %s (Are you missing data?)\n",
1178 itemPtr
->locale
, u_errorName(status
));
1185 #include "unicode/urbtok.h"
1186 #include "cstring.h"
/* Location of the word-break rules file read by TestRuleBasedTokenizer;
 * the path is relative, so it is resolved against the working directory of the test run. */
static const char testRulesFilePath[] = "../testdata/word_urbTokTest.txt";
1189 static const UChar textToTokenize
[] = {
1191 "Short phrase! Another (with parens); done.\n
1192 At 4:00, tea-time.\n
1193 He wouldn't've wanted y'all to ... come at 3:30pm for $3 coffee @funman :)\n
1196 0x53,0x68,0x6F,0x72,0x74,0x20,0x70,0x68,0x72,0x61,0x73,0x65,0x21,0x20,
1197 0x41,0x6E,0x6F,0x74,0x68,0x65,0x72,0x20,0x28,0x77,0x69,0x74,0x68,0x20,0x70,0x61,0x72,0x65,0x6E,0x73,0x29,0x3B,0x20,0x64,0x6F,0x6E,0x65,0x2E,0x0A,
1198 0x41,0x74,0x20,0x34,0x3A,0x30,0x30,0x2C,0x20,0x74,0x65,0x61,0x2D,0x74,0x69,0x6D,0x65,0x2E,0x0A,
1199 0x48,0x65,0x20,0x77,0x6F,0x75,0x6C,0x64,0x6E,0x27,0x74,0x27,0x76,0x65,0x20,0x77,0x61,0x6E,0x74,0x65,0x64,0x20,
1200 0x79,0x27,0x61,0x6C,0x6C,0x20,0x74,0x6F,0x20,0x2E,0x2E,0x2E,0x20, 0x63,0x6F,0x6D,0x65,0x20,0x61,0x74,0x20,
1201 0x33,0x3A,0x33,0x30,0x70,0x6D,0x20,0x66,0x6F,0x72,0x20,0x24,0x33,0x20,0x63,0x6F,0x66,0x66,0x65,0x65,0x20,
1202 0x40,0x66,0x75,0x6E,0x6D,0x61,0x6E,0x20,0x3A,0x29,0x0A,
1203 0x78,0x33,0x3A,0x33,0x30,0x20,0x2D,0x2D,0x20,0x78,0x31,0x2E,0x30,0
1206 RuleBasedTokenRange token
;
1207 unsigned long flags
;
1209 static const RBTokResult expectedResults
[] = { // 66 tokens
1210 { { 0, 5 }, 0xC8 }, // Short
1211 { { 5, 1 }, 0x01 }, // _sp_
1212 { { 6, 6 }, 0xC8 }, // phrase
1213 { { 12, 1 }, 0x00 }, // !
1214 { { 13, 1 }, 0x01 }, // _sp_
1215 { { 14, 7 }, 0xC8 }, // Another
1216 { { 21, 1 }, 0x01 }, // _sp_
1217 { { 22, 1 }, 0x00 }, // (
1218 { { 23, 4 }, 0xC8 }, // with
1219 { { 27, 1 }, 0x01 }, // _sp_
1220 { { 28, 6 }, 0xC8 }, // parens
1221 { { 34, 1 }, 0x00 }, // )
1222 { { 35, 1 }, 0x00 }, // ;
1223 { { 36, 1 }, 0x01 }, // _sp_
1224 { { 37, 4 }, 0xC8 }, // done
1225 { { 41, 1 }, 0x14 }, // .
1226 { { 42, 1 }, 0x00 }, // _nl_
1228 { { 43, 2 }, 0xC8 }, // At
1229 { { 45, 1 }, 0x01 }, // _sp_
1230 { { 46, 4 }, 0x76 }, // 4:00 ** here RBBI has x64
1231 { { 50, 1 }, 0x00 }, // ,
1232 { { 51, 1 }, 0x01 }, // _sp_
1233 { { 52, 3 }, 0xC8 }, // tea
1234 { { 55, 1 }, 0x15 }, // -
1235 { { 56, 4 }, 0xC8 }, // time
1236 { { 60, 1 }, 0x14 }, // .
1237 { { 61, 1 }, 0x00 }, // _nl_
1239 { { 62, 2 }, 0xC8 }, // He
1240 { { 64, 1 }, 0x01 }, // _sp_
1241 { { 65, 8 }, 0xCA }, // wouldn't
1242 { { 73, 1 }, 0x16 }, // '
1243 { { 74, 2 }, 0xC8 }, // ve
1244 { { 76, 1 }, 0x01 }, // _sp_
1245 { { 77, 6 }, 0xC8 }, // wanted
1246 { { 83, 1 }, 0x01 }, // _sp_
1247 { { 84, 5 }, 0xCA }, // y'all
1248 { { 89, 1 }, 0x01 }, // _sp_
1249 { { 90, 2 }, 0xC8 }, // to
1250 { { 92, 1 }, 0x01 }, // _sp_
1251 { { 93, 3 }, 0x3C }, // ... ** here RBBI has 0x28
1252 { { 96, 1 }, 0x01 }, // _sp_
1253 { { 97, 4 }, 0xC8 }, // come
1254 { { 101, 1 }, 0x01 }, // _sp_
1255 { { 102, 2 }, 0xC8 }, // at
1256 { { 104, 1 }, 0x01 }, // _sp_
1257 { { 105, 6 }, 0xC8 }, // 3:30pm
1258 { { 111, 1 }, 0x01 }, // _sp_
1259 { { 112, 3 }, 0xC8 }, // for
1260 { { 115, 1 }, 0x01 }, // _sp_
1261 { { 116, 1 }, 0x00 }, // $
1262 { { 117, 1 }, 0x64 }, // 3
1263 { { 118, 1 }, 0x01 }, // _sp_
1264 { { 119, 6 }, 0xC8 }, // coffee
1265 { { 125, 1 }, 0x01 }, // _sp_
1266 { { 126, 7 }, 0xDF }, // @funman ** here RBBI has 0xC8
1267 { { 133, 1 }, 0x01 }, // _sp_
1268 { { 134, 2 }, 0x20 }, // :)
1269 { { 136, 1 }, 0x00 }, // _nl_
1271 // ** incorrect ranges (and flags) currently produced by RBTok
1272 { { 137, 2 }, 0xEC }, // x3
1273 { { 139, 1 }, 0x00 }, // :
1274 { { 140, 2 }, 0x64 }, // 30
1275 // ** for the above, RBBI has
1276 //{ { 137, 1 }, 0x64 }, // x
1277 //{ { 138, 4 }, 0x64 }, // 3:30
1279 { { 142, 1 }, 0x01 }, // _sp_
1280 { { 143, 2 }, 0x3D }, // -- ** here RBBI has 0x28
1281 { { 145, 1 }, 0x01 }, // _sp_
1282 { { 146, 2 }, 0xEC }, // x1 ** here RBBI has 0xC8
1283 { { 148, 1 }, 0x14 }, // .
1284 { { 149, 1 }, 0x64 }, // 0
1287 kNumTokensExpected
= UPRV_LENGTHOF(expectedResults
), // 66
1291 static void TestRuleBasedTokenizer(void) {
1292 FILE * testRulesFile
;
1293 char * testRulesUTF8Buf
;
1294 UChar
* testRulesUTF16Buf
= NULL
;
1295 long testRulesFileSize
, testRulesFileRead
= 0;
1296 long testRulesUTF8Offset
= 0;
1297 int32_t testRulesUTF16Size
;
1298 UErrorCode status
= U_ZERO_ERROR
;
1300 testRulesFile
= fopen(testRulesFilePath
, "r");
1301 if (testRulesFile
== NULL
) {
1302 log_data_err("FAIL: fopen fails for: %s\n", testRulesFilePath
);
1305 fseek(testRulesFile
, 0, SEEK_END
);
1306 testRulesFileSize
= ftell(testRulesFile
);
1307 rewind(testRulesFile
);
1309 testRulesUTF8Buf
= (char *)uprv_malloc(testRulesFileSize
);
1310 if (testRulesUTF8Buf
!= NULL
) {
1311 testRulesFileRead
= fread(testRulesUTF8Buf
, 1, testRulesFileSize
, testRulesFile
);
1313 fclose(testRulesFile
);
1314 if (testRulesUTF8Buf
== NULL
) {
1315 log_data_err("FAIL: uprv_malloc fails for testRulesUTF8Buf[%ld]\n", testRulesFileSize
);
1318 if (testRulesFileRead
< testRulesFileSize
) {
1319 log_data_err("FAIL: fread fails for %s, read %ld of %ld\n", testRulesFile
, testRulesFileRead
, testRulesFileSize
);
1320 uprv_free(testRulesUTF8Buf
);
1323 /* done with file, UTF8 rules in testRulesUTF8Buf. Handle UTF8 BOM: */
1324 if (uprv_strncmp(testRulesUTF8Buf
, "\xEF\xBB\xBF", 3) == 0) {
1325 testRulesUTF8Offset
= 3;
1326 testRulesFileSize
-= testRulesUTF8Offset
;
1329 u_strFromUTF8(NULL
, 0, &testRulesUTF16Size
, testRulesUTF8Buf
+testRulesUTF8Offset
, testRulesFileSize
, &status
); /* preflight */
1330 if (status
== U_BUFFER_OVERFLOW_ERROR
) { /* expected for preflight */
1331 status
= U_ZERO_ERROR
;
1333 if (U_FAILURE(status
)) {
1334 log_data_err("FAIL: u_strFromUTF8 preflight fails: %s\n", u_errorName(status
));
1336 testRulesUTF16Buf
= (UChar
*)uprv_malloc(testRulesUTF16Size
*sizeof(UChar
));
1337 if (testRulesUTF16Buf
== NULL
) {
1338 log_data_err("FAIL: uprv_malloc fails for testRulesUTF16Buf[%ld]\n", testRulesUTF16Size
*sizeof(UChar
));
1340 u_strFromUTF8(testRulesUTF16Buf
, testRulesUTF16Size
, &testRulesUTF16Size
, testRulesUTF8Buf
+testRulesUTF8Offset
, testRulesFileSize
, &status
);
1343 uprv_free(testRulesUTF8Buf
);
1344 if (testRulesUTF16Buf
== NULL
) {
1347 if (U_FAILURE(status
)) {
1348 log_data_err("FAIL: u_strFromUTF8 fails: %s\n", u_errorName(status
));
1350 UParseError parseErr
;
1351 UBreakIterator
*brkFromRules
= urbtok_openRules(testRulesUTF16Buf
, testRulesUTF16Size
, &parseErr
, &status
);
1352 if (U_FAILURE(status
)) {
1353 log_err("FAIL: urbtok_openRules status: %s\n", u_errorName(status
));
1355 uint8_t *rulesBinaryBuf
;
1356 uint32_t rulesBinarySize
;
1357 rulesBinarySize
= urbtok_getBinaryRules(brkFromRules
, NULL
, 0, &status
);
1358 if (U_FAILURE(status
)) {
1359 log_err("FAIL: urbtok_getBinaryRules preflight status: %s, rulesBinarySize %u\n", u_errorName(status
), rulesBinarySize
);
1361 rulesBinaryBuf
= (uint8_t *)uprv_malloc(rulesBinarySize
);
1362 if (rulesBinaryBuf
== NULL
) {
1363 log_data_err("FAIL: uprv_malloc fails for rulesBinaryBuf[%ld]\n", rulesBinarySize
);
1365 rulesBinarySize
= urbtok_getBinaryRules(brkFromRules
, rulesBinaryBuf
, rulesBinarySize
, &status
);
1366 if (U_FAILURE(status
)) {
1367 log_err("FAIL: urbtok_getBinaryRules status: %s, rulesBinarySize %u\n", u_errorName(status
), rulesBinarySize
);
1369 UBreakIterator
*brkFromBinary
= urbtok_openBinaryRules(rulesBinaryBuf
, &status
);
1370 if (U_FAILURE(status
)) {
1371 log_err("FAIL: urbtok_openBinaryRules status: %s\n", u_errorName(status
));
1373 RuleBasedTokenRange tokens
[kMaxTokens
];
1374 unsigned long flags
[kMaxTokens
];
1375 int32_t iToken
, numTokens
= 0;
1377 status
= U_ZERO_ERROR
;
1378 ubrk_setText(brkFromRules
, textToTokenize
, -1, &status
);
1379 if (U_FAILURE(status
)) {
1380 log_err("FAIL: ubrk_setText brkFromRules status: %s\n", u_errorName(status
));
1382 numTokens
= urbtok_tokenize(brkFromRules
, kMaxTokens
, tokens
, flags
);
1383 UBool fail
= (numTokens
!= kNumTokensExpected
);
1384 for (iToken
= 0; !fail
&& iToken
< numTokens
; iToken
++) {
1385 if ( tokens
[iToken
].location
!= expectedResults
[iToken
].token
.location
||
1386 tokens
[iToken
].length
!= expectedResults
[iToken
].token
.length
||
1387 flags
[iToken
] != expectedResults
[iToken
].flags
) {
1392 log_err("FAIL: urbtok_tokenize brkFromRules expected %d tokens, got %d\n", kNumTokensExpected
, numTokens
);
1393 printf("# expect get\n");
1394 printf("# loc len flags loc len flags\n");
1395 int32_t maxTokens
= (numTokens
>= kNumTokensExpected
)? numTokens
: kNumTokensExpected
;
1396 for (iToken
= 0; iToken
< maxTokens
; iToken
++) {
1397 if (iToken
< kNumTokensExpected
) {
1398 printf(" %3ld %3ld 0x%03lX", expectedResults
[iToken
].token
.location
,
1399 expectedResults
[iToken
].token
.length
, expectedResults
[iToken
].flags
);
1403 if (iToken
< numTokens
) {
1404 printf(" %3ld %3ld 0x%03lX\n", tokens
[iToken
].location
, tokens
[iToken
].length
, flags
[iToken
] );
1412 status
= U_ZERO_ERROR
;
1413 ubrk_setText(brkFromBinary
, textToTokenize
, -1, &status
);
1414 if (U_FAILURE(status
)) {
1415 log_err("FAIL: ubrk_setText brkFromBinary status: %s\n", u_errorName(status
));
1417 numTokens
= urbtok_tokenize(brkFromBinary
, kMaxTokens
, tokens
, flags
);
1418 UBool fail
= (numTokens
!= kNumTokensExpected
);
1419 for (iToken
= 0; !fail
&& iToken
< numTokens
; iToken
++) {
1420 if ( tokens
[iToken
].location
!= expectedResults
[iToken
].token
.location
||
1421 tokens
[iToken
].length
!= expectedResults
[iToken
].token
.length
||
1422 flags
[iToken
] != expectedResults
[iToken
].flags
) {
1427 log_err("FAIL: urbtok_tokenize brkFromBinary expected %d tokens, got %d\n", kNumTokensExpected
, numTokens
);
1428 printf("# expect get\n");
1429 printf("# loc len flags loc len flags\n");
1430 int32_t maxTokens
= (numTokens
>= kNumTokensExpected
)? numTokens
: kNumTokensExpected
;
1431 for (iToken
= 0; iToken
< maxTokens
; iToken
++) {
1432 if (iToken
< kNumTokensExpected
) {
1433 printf(" %3ld %3ld 0x%03lX", expectedResults
[iToken
].token
.location
,
1434 expectedResults
[iToken
].token
.length
, expectedResults
[iToken
].flags
);
1438 if (iToken
< numTokens
) {
1439 printf(" %3ld %3ld 0x%03lX\n", tokens
[iToken
].location
, tokens
[iToken
].length
, flags
[iToken
] );
1446 ubrk_close(brkFromBinary
);
1449 uprv_free(rulesBinaryBuf
);
1452 ubrk_close(brkFromRules
);
1455 uprv_free(testRulesUTF16Buf
);
1460 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */