icuSources/test/cintltst/cbiapts.c

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 1997-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8 /********************************************************************************
   9 *
  10 * File CBIAPTS.C
  11 *
  12 * Modification History:
  13 *        Name                     Description
  14 *     Madhu Katragadda              Creation
  15 *********************************************************************************/
  16 /*C API TEST FOR BREAKITERATOR */
  17 /**
  18 * This is an API test.  It doesn't test very many cases, and doesn't
  19 * try to test the full functionality.  It just calls each function in the class and
  20 * verifies that it works on a basic level.
  21 **/
  22
  23 #include "unicode/utypes.h"
  24
  25 #if !UCONFIG_NO_BREAK_ITERATION
  26
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include "unicode/uloc.h"
  30 #include "unicode/ubrk.h"
  31 #include "unicode/ustring.h"
  32 #include "unicode/ucnv.h"
  33 #include "unicode/utext.h"
  34 #include "cintltst.h"
  35 #include "cbiapts.h"
  36 #include "cmemory.h"
  37
  38 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  39 log_data_err("Failure at file %s, line %d, error = %s (Are you missing data?)\n", __FILE__, __LINE__, u_errorName(status));}}
  40
  41 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
  42 log_data_err("Test Failure at file %s, line %d (Are you missing data?)\n", __FILE__, __LINE__);}}
  43
  44 #define APPLE_ADDITIONS 1
  45
  46 #if !UCONFIG_NO_FILE_IO
  47 static void TestBreakIteratorSafeClone(void);
  48 #endif
  49 static void TestBreakIteratorRules(void);
  50 static void TestBreakIteratorRuleError(void);
  51 static void TestBreakIteratorStatusVec(void);
  52 static void TestBreakIteratorUText(void);
  53 static void TestBreakIteratorTailoring(void);
  54 static void TestBreakIteratorRefresh(void);
  55 static void TestBug11665(void);
  56 static void TestBreakIteratorSuppressions(void);
  57 #if APPLE_ADDITIONS
  58 static void TestRuleBasedTokenizer(void);
  59 #endif
  60
  61 void addBrkIterAPITest(TestNode** root);
  62
  63 void addBrkIterAPITest(TestNode** root)
  64 {
  65 #if !UCONFIG_NO_FILE_IO
  66     addTest(root, &TestBreakIteratorCAPI, "tstxtbd/cbiapts/TestBreakIteratorCAPI");
  67     addTest(root, &TestBreakIteratorSafeClone, "tstxtbd/cbiapts/TestBreakIteratorSafeClone");
  68     addTest(root, &TestBreakIteratorUText, "tstxtbd/cbiapts/TestBreakIteratorUText");
  69 #endif
  70     addTest(root, &TestBreakIteratorRules, "tstxtbd/cbiapts/TestBreakIteratorRules");
  71     addTest(root, &TestBreakIteratorRuleError, "tstxtbd/cbiapts/TestBreakIteratorRuleError");
  72     addTest(root, &TestBreakIteratorStatusVec, "tstxtbd/cbiapts/TestBreakIteratorStatusVec");
  73     addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
  74     addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
  75     addTest(root, &TestBug11665, "tstxtbd/cbiapts/TestBug11665");
  76 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  77     addTest(root, &TestBreakIteratorSuppressions, "tstxtbd/cbiapts/TestBreakIteratorSuppressions");
  78 #endif
  79 #if APPLE_ADDITIONS
  80     addTest(root, &TestRuleBasedTokenizer, "tstxtbd/cbiapts/TestRuleBasedTokenizer");
  81 #endif
  82 }
  83
  84 #define CLONETEST_ITERATOR_COUNT 2
  85
  86 /*
  87  *   Utility function for converting char * to UChar * strings, to
  88  *     simplify the test code.   Converted strings are put in heap allocated
  89  *     storage.   A hook (probably a local in the caller's code) allows all
  90  *     strings converted with that hook to be freed with a single call.
  91  */
  92 typedef struct StringStruct {
  93         struct StringStruct   *link;
  94         UChar                 str[1];
  95     } StringStruct;
  96
  97
  98 static UChar* toUChar(const char *src, void **freeHook) {
  99     /* Structure of the memory that we allocate on the heap */
 100
 101     int32_t    numUChars;
 102     int32_t    destSize;
 103     UChar      stackBuf[2000 + sizeof(void *)/sizeof(UChar)];
 104     StringStruct  *dest;
 105     UConverter *cnv;
 106
 107     UErrorCode status = U_ZERO_ERROR;
 108     if (src == NULL) {
 109         return NULL;
 110     };
 111
 112     cnv = ucnv_open(NULL, &status);
 113     if(U_FAILURE(status) || cnv == NULL) {
 114         return NULL;
 115     }
 116     ucnv_reset(cnv);
 117     numUChars = ucnv_toUChars(cnv,
 118                   stackBuf,
 119                   2000,
 120                   src, -1,
 121                   &status);
 122
 123     destSize = (numUChars+1) * sizeof(UChar) + sizeof(struct StringStruct);
 124     dest = (StringStruct *)malloc(destSize);
 125     if (dest != NULL) {
 126         if (status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
 127             ucnv_toUChars(cnv, dest->str, numUChars+1, src, -1, &status);
 128         } else if (status == U_ZERO_ERROR) {
 129             u_strcpy(dest->str, stackBuf);
 130         } else {
 131             free(dest);
 132             dest = NULL;
 133         }
 134     }
 135
 136     ucnv_reset(cnv); /* be good citizens */
 137     ucnv_close(cnv);
 138     if (dest == NULL) {
 139         return NULL;
 140     }
 141
 142     dest->link = (StringStruct*)(*freeHook);
 143     *freeHook = dest;
 144     return dest->str;
 145 }
 146
 147 static void freeToUCharStrings(void **hook) {
 148     StringStruct  *s = *(StringStruct **)hook;
 149     while (s != NULL) {
 150         StringStruct *next = s->link;
 151         free(s);
 152         s = next;
 153     }
 154 }
 155
 156
 157 #if !UCONFIG_NO_FILE_IO
 158 static void TestBreakIteratorCAPI()
 159 {
 160     UErrorCode status = U_ZERO_ERROR;
 161     UBreakIterator *word, *sentence, *line, *character, *b, *bogus;
 162     int32_t start,pos,end,to;
 163     int32_t i;
 164     int32_t count = 0;
 165
 166     UChar text[50];
 167
 168     /* Note:  the adjacent "" are concatenating strings, not adding a \" to the
 169        string, which is probably what whoever wrote this intended.  Don't fix,
 170        because it would throw off the hard coded break positions in the following
 171        tests. */
 172     u_uastrcpy(text, "He's from Africa. ""Mr. Livingston, I presume?"" Yeah");
 173
 174
 175 /*test ubrk_open()*/
 176     log_verbose("\nTesting BreakIterator open functions\n");
 177
 178     /* Use french for fun */
 179     word         = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status);
 180     if(status == U_FILE_ACCESS_ERROR) {
 181         log_data_err("Check your data - it doesn't seem to be around\n");
 182         return;
 183     } else if(U_FAILURE(status)){
 184         log_err_status(status, "FAIL: Error in ubrk_open() for word breakiterator: %s\n", myErrorName(status));
 185     }
 186     else{
 187         log_verbose("PASS: Successfully opened  word breakiterator\n");
 188     }
 189
 190     sentence     = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status);
 191     if(U_FAILURE(status)){
 192         log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status));
 193         return;
 194     }
 195     else{
 196         log_verbose("PASS: Successfully opened  sentence breakiterator\n");
 197     }
 198
 199     line         = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status);
 200     if(U_FAILURE(status)){
 201         log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status));
 202         return;
 203     }
 204     else{
 205         log_verbose("PASS: Successfully opened  line breakiterator\n");
 206     }
 207
 208     character     = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status);
 209     if(U_FAILURE(status)){
 210         log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status));
 211         return;
 212     }
 213     else{
 214         log_verbose("PASS: Successfully opened  character breakiterator\n");
 215     }
 216     /*trying to open an illegal iterator*/
 217     bogus     = ubrk_open((UBreakIteratorType)5, "en_US", text, u_strlen(text), &status);
 218     if(bogus != NULL) {
 219         log_err("FAIL: expected NULL from opening an invalid break iterator.\n");
 220     }
 221     if(U_SUCCESS(status)){
 222         log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n");
 223     }
 224     if(U_FAILURE(status)){
 225         if(status != U_ILLEGAL_ARGUMENT_ERROR){
 226             log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n Got %s\n", myErrorName(status));
 227         }
 228     }
 229     status=U_ZERO_ERROR;
 230
 231
 232 /* ======= Test ubrk_countAvialable() and ubrk_getAvialable() */
 233
 234     log_verbose("\nTesting ubrk_countAvailable() and ubrk_getAvailable()\n");
 235     count=ubrk_countAvailable();
 236     /* use something sensible w/o hardcoding the count */
 237     if(count < 0){
 238         log_err("FAIL: Error in ubrk_countAvialable() returned %d\n", count);
 239     }
 240     else{
 241         log_verbose("PASS: ubrk_countAvialable() successful returned %d\n", count);
 242     }
 243     for(i=0;i<count;i++)
 244     {
 245         log_verbose("%s\n", ubrk_getAvailable(i));
 246         if (ubrk_getAvailable(i) == 0)
 247             log_err("No locale for which breakiterator is applicable\n");
 248         else
 249             log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i));
 250     }
 251
 252 /*========Test ubrk_first(), ubrk_last()...... and other functions*/
 253
 254     log_verbose("\nTesting the functions for word\n");
 255     start = ubrk_first(word);
 256     if(start!=0)
 257         log_err("error ubrk_start(word) did not return 0\n");
 258     log_verbose("first (word = %d\n", (int32_t)start);
 259        pos=ubrk_next(word);
 260     if(pos!=4)
 261         log_err("error ubrk_next(word) did not return 4\n");
 262     log_verbose("next (word = %d\n", (int32_t)pos);
 263     pos=ubrk_following(word, 4);
 264     if(pos!=5)
 265         log_err("error ubrl_following(word,4) did not return 6\n");
 266     log_verbose("next (word = %d\n", (int32_t)pos);
 267     end=ubrk_last(word);
 268     if(end!=49)
 269         log_err("error ubrk_last(word) did not return 49\n");
 270     log_verbose("last (word = %d\n", (int32_t)end);
 271
 272     pos=ubrk_previous(word);
 273     log_verbose("%d   %d\n", end, pos);
 274
 275     pos=ubrk_previous(word);
 276     log_verbose("%d \n", pos);
 277
 278     if (ubrk_isBoundary(word, 2) != FALSE) {
 279         log_err("error ubrk_isBoundary(word, 2) did not return FALSE\n");
 280     }
 281     pos=ubrk_current(word);
 282     if (pos != 4) {
 283         log_err("error ubrk_current() != 4 after ubrk_isBoundary(word, 2)\n");
 284     }
 285     if (ubrk_isBoundary(word, 4) != TRUE) {
 286         log_err("error ubrk_isBoundary(word, 4) did not return TRUE\n");
 287     }
 288
 289
 290
 291     log_verbose("\nTesting the functions for character\n");
 292     ubrk_first(character);
 293     pos = ubrk_following(character, 5);
 294     if(pos!=6)
 295        log_err("error ubrk_following(character,5) did not return 6\n");
 296     log_verbose("Following (character,5) = %d\n", (int32_t)pos);
 297     pos=ubrk_following(character, 18);
 298     if(pos!=19)
 299        log_err("error ubrk_following(character,18) did not return 19\n");
 300     log_verbose("Followingcharacter,18) = %d\n", (int32_t)pos);
 301     pos=ubrk_preceding(character, 22);
 302     if(pos!=21)
 303        log_err("error ubrk_preceding(character,22) did not return 21\n");
 304     log_verbose("preceding(character,22) = %d\n", (int32_t)pos);
 305
 306
 307     log_verbose("\nTesting the functions for line\n");
 308     pos=ubrk_first(line);
 309     if(pos != 0)
 310         log_err("error ubrk_first(line) returned %d, expected 0\n", (int32_t)pos);
 311     pos = ubrk_next(line);
 312     pos=ubrk_following(line, 18);
 313     if(pos!=22)
 314         log_err("error ubrk_following(line) did not return 22\n");
 315     log_verbose("following (line) = %d\n", (int32_t)pos);
 316
 317
 318     log_verbose("\nTesting the functions for sentence\n");
 319     ubrk_first(sentence);
 320     pos = ubrk_current(sentence);
 321     log_verbose("Current(sentence) = %d\n", (int32_t)pos);
 322        pos = ubrk_last(sentence);
 323     if(pos!=49)
 324         log_err("error ubrk_last for sentence did not return 49\n");
 325     log_verbose("Last (sentence) = %d\n", (int32_t)pos);
 326     ubrk_first(sentence);
 327     to = ubrk_following( sentence, 0 );
 328     if (to == 0) log_err("ubrk_following returned 0\n");
 329     to = ubrk_preceding( sentence, to );
 330     if (to != 0) log_err("ubrk_preceding didn't return 0\n");
 331     if (ubrk_first(sentence)!=ubrk_current(sentence)) {
 332         log_err("error in ubrk_first() or ubrk_current()\n");
 333     }
 334
 335
 336     /*---- */
 337     /*Testing ubrk_open and ubrk_close()*/
 338    log_verbose("\nTesting open and close for us locale\n");
 339     b = ubrk_open(UBRK_WORD, "fr_FR", text, u_strlen(text), &status);
 340     if (U_FAILURE(status)) {
 341         log_err("ubrk_open for word returned NULL: %s\n", myErrorName(status));
 342     }
 343     ubrk_close(b);
 344
 345     /* Test setText and setUText */
 346     {
 347         UChar s1[] = {0x41, 0x42, 0x20, 0};
 348         UChar s2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0};
 349         UText *ut = NULL;
 350         UBreakIterator *bb;
 351         int j;
 352
 353         log_verbose("\nTesting ubrk_setText() and ubrk_setUText()\n");
 354         status = U_ZERO_ERROR;
 355         bb = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status);
 356         TEST_ASSERT_SUCCESS(status);
 357         ubrk_setText(bb, s1, -1, &status);
 358         TEST_ASSERT_SUCCESS(status);
 359         ubrk_first(bb);
 360         j = ubrk_next(bb);
 361         TEST_ASSERT(j == 2);
 362         ut = utext_openUChars(ut, s2, -1, &status);
 363         ubrk_setUText(bb, ut, &status);
 364         TEST_ASSERT_SUCCESS(status);
 365         j = ubrk_next(bb);
 366         TEST_ASSERT(j == 5);
 367
 368         ubrk_close(bb);
 369         utext_close(ut);
 370     }
 371
 372     ubrk_close(word);
 373     ubrk_close(sentence);
 374     ubrk_close(line);
 375     ubrk_close(character);
 376 }
 377
 378 static void TestBreakIteratorSafeClone(void)
 379 {
 380     UChar text[51];     /* Keep this odd to test for 64-bit memory alignment */
 381                         /*  NOTE:  This doesn't reliably force mis-alignment of following items. */
 382     uint8_t buffer [CLONETEST_ITERATOR_COUNT] [U_BRK_SAFECLONE_BUFFERSIZE];
 383     int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
 384
 385     UBreakIterator * someIterators [CLONETEST_ITERATOR_COUNT];
 386     UBreakIterator * someClonedIterators [CLONETEST_ITERATOR_COUNT];
 387
 388     UBreakIterator * brk;
 389     UErrorCode status = U_ZERO_ERROR;
 390     int32_t start,pos;
 391     int32_t i;
 392
 393     /*Testing ubrk_safeClone */
 394
 395     /* Note:  the adjacent "" are concatenating strings, not adding a \" to the
 396        string, which is probably what whoever wrote this intended.  Don't fix,
 397        because it would throw off the hard coded break positions in the following
 398        tests. */
 399     u_uastrcpy(text, "He's from Africa. ""Mr. Livingston, I presume?"" Yeah");
 400
 401     /* US & Thai - rule-based & dictionary based */
 402     someIterators[0] = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status);
 403     if(!someIterators[0] || U_FAILURE(status)) {
 404       log_data_err("Couldn't open en_US word break iterator - %s\n", u_errorName(status));
 405       return;
 406     }
 407
 408     someIterators[1] = ubrk_open(UBRK_WORD, "th_TH", text, u_strlen(text), &status);
 409     if(!someIterators[1] || U_FAILURE(status)) {
 410       log_data_err("Couldn't open th_TH word break iterator - %s\n", u_errorName(status));
 411       return;
 412     }
 413
 414     /* test each type of iterator */
 415     for (i = 0; i < CLONETEST_ITERATOR_COUNT; i++)
 416     {
 417
 418         /* Check the various error & informational states */
 419
 420         /* Null status - just returns NULL */
 421         if (NULL != ubrk_safeClone(someIterators[i], buffer[i], &bufferSize, NULL))
 422         {
 423             log_err("FAIL: Cloned Iterator failed to deal correctly with null status\n");
 424         }
 425         /* error status - should return 0 & keep error the same */
 426         status = U_MEMORY_ALLOCATION_ERROR;
 427         if (NULL != ubrk_safeClone(someIterators[i], buffer[i], &bufferSize, &status) || status != U_MEMORY_ALLOCATION_ERROR)
 428         {
 429             log_err("FAIL: Cloned Iterator failed to deal correctly with incoming error status\n");
 430         }
 431         status = U_ZERO_ERROR;
 432
 433         /* Null buffer size pointer is ok */
 434         if (NULL == (brk = ubrk_safeClone(someIterators[i], buffer[i], NULL, &status)) || U_FAILURE(status))
 435         {
 436             log_err("FAIL: Cloned Iterator failed to deal correctly with null bufferSize pointer\n");
 437         }
 438         ubrk_close(brk);
 439         status = U_ZERO_ERROR;
 440
 441         /* buffer size pointer is 0 - fill in pbufferSize with a size */
 442         bufferSize = 0;
 443         if (NULL != ubrk_safeClone(someIterators[i], buffer[i], &bufferSize, &status) ||
 444                 U_FAILURE(status) || bufferSize <= 0)
 445         {
 446             log_err("FAIL: Cloned Iterator failed a sizing request ('preflighting')\n");
 447         }
 448         /* Verify our define is large enough  */
 449         if (U_BRK_SAFECLONE_BUFFERSIZE < bufferSize)
 450         {
 451           log_err("FAIL: Pre-calculated buffer size is too small - %d but needed %d\n", U_BRK_SAFECLONE_BUFFERSIZE, bufferSize);
 452         }
 453         /* Verify we can use this run-time calculated size */
 454         if (NULL == (brk = ubrk_safeClone(someIterators[i], buffer[i], &bufferSize, &status)) || U_FAILURE(status))
 455         {
 456             log_err("FAIL: Iterator can't be cloned with run-time size\n");
 457         }
 458         if (brk)
 459             ubrk_close(brk);
 460         /* size one byte too small - should allocate & let us know */
 461         if (bufferSize > 1) {
 462             --bufferSize;
 463         }
 464         if (NULL == (brk = ubrk_safeClone(someIterators[i], NULL, &bufferSize, &status)) || status != U_SAFECLONE_ALLOCATED_WARNING)
 465         {
 466             log_err("FAIL: Cloned Iterator failed to deal correctly with too-small buffer size\n");
 467         }
 468         if (brk)
 469             ubrk_close(brk);
 470         status = U_ZERO_ERROR;
 471         bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
 472
 473         /* Null buffer pointer - return Iterator & set error to U_SAFECLONE_ALLOCATED_ERROR */
 474         if (NULL == (brk = ubrk_safeClone(someIterators[i], NULL, &bufferSize, &status)) || status != U_SAFECLONE_ALLOCATED_WARNING)
 475         {
 476             log_err("FAIL: Cloned Iterator failed to deal correctly with null buffer pointer\n");
 477         }
 478         if (brk)
 479             ubrk_close(brk);
 480         status = U_ZERO_ERROR;
 481
 482         /* Mis-aligned buffer pointer. */
 483         {
 484             char  stackBuf[U_BRK_SAFECLONE_BUFFERSIZE+sizeof(void *)];
 485
 486             brk = ubrk_safeClone(someIterators[i], &stackBuf[1], &bufferSize, &status);
 487             if (U_FAILURE(status) || brk == NULL) {
 488                 log_err("FAIL: Cloned Iterator failed with misaligned buffer pointer\n");
 489             }
 490             if (status == U_SAFECLONE_ALLOCATED_WARNING) {
 491                 log_verbose("Cloned Iterator allocated when using a mis-aligned buffer.\n");
 492             }
 493             if (brk)
 494                 ubrk_close(brk);
 495         }
 496
 497
 498         /* Null Iterator - return NULL & set U_ILLEGAL_ARGUMENT_ERROR */
 499         if (NULL != ubrk_safeClone(NULL, buffer[i], &bufferSize, &status) || status != U_ILLEGAL_ARGUMENT_ERROR)
 500         {
 501             log_err("FAIL: Cloned Iterator failed to deal correctly with null Iterator pointer\n");
 502         }
 503         status = U_ZERO_ERROR;
 504
 505         /* Do these cloned Iterators work at all - make a first & next call */
 506         bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
 507         someClonedIterators[i] = ubrk_safeClone(someIterators[i], buffer[i], &bufferSize, &status);
 508
 509         start = ubrk_first(someClonedIterators[i]);
 510         if(start!=0)
 511             log_err("error ubrk_start(clone) did not return 0\n");
 512         pos=ubrk_next(someClonedIterators[i]);
 513         if(pos!=4)
 514             log_err("error ubrk_next(clone) did not return 4\n");
 515
 516         ubrk_close(someClonedIterators[i]);
 517         ubrk_close(someIterators[i]);
 518     }
 519 }
 520 #endif
 521
 522
 523 /*
 524 //  Open a break iterator from char * rules.  Take care of conversion
 525 //     of the rules and error checking.
 526 */
 527 static UBreakIterator * testOpenRules(char *rules) {
 528     UErrorCode      status       = U_ZERO_ERROR;
 529     UChar          *ruleSourceU  = NULL;
 530     void           *strCleanUp   = NULL;
 531     UParseError     parseErr;
 532     UBreakIterator *bi;
 533
 534     ruleSourceU = toUChar(rules, &strCleanUp);
 535
 536     bi = ubrk_openRules(ruleSourceU,  -1,     /*  The rules  */
 537                         NULL,  -1,            /*  The text to be iterated over. */
 538                         &parseErr, &status);
 539
 540     if (U_FAILURE(status)) {
 541         log_data_err("FAIL: ubrk_openRules: ICU Error \"%s\" (Are you missing data?)\n", u_errorName(status));
 542         bi = 0;
 543     };
 544     freeToUCharStrings(&strCleanUp);
 545     return bi;
 546
 547 }
 548
 549 /*
 550  *  TestBreakIteratorRules - Verify that a break iterator can be created from
 551  *                           a set of source rules.
 552  */
 553 static void TestBreakIteratorRules() {
 554     /*  Rules will keep together any run of letters not including 'a', OR
 555      *             keep together 'abc', but only when followed by 'def', OTHERWISE
 556      *             just return one char at a time.
 557      */
 558     char         rules[]  = "abc/def{666};\n   [\\p{L} - [a]]* {2};  . {1};";
 559     /*                        0123456789012345678 */
 560     char         data[]   =  "abcdex abcdefgh-def";     /* the test data string                     */
 561     char         breaks[] =  "**    **  *    **  *";    /*  * the expected break positions          */
 562     char         tags[]   =  "01    21  6    21  2";    /*  expected tag values at break positions  */
 563     int32_t      tagMap[] = {0, 1, 2, 3, 4, 5, 666};
 564
 565     UChar       *uData;
 566     void        *freeHook = NULL;
 567     UErrorCode   status   = U_ZERO_ERROR;
 568     int32_t      pos;
 569     int          i;
 570
 571     UBreakIterator *bi = testOpenRules(rules);
 572     if (bi == NULL) {return;}
 573     uData = toUChar(data, &freeHook);
 574     ubrk_setText(bi,  uData, -1, &status);
 575
 576     pos = ubrk_first(bi);
 577     for (i=0; i<sizeof(breaks); i++) {
 578         if (pos == i && breaks[i] != '*') {
 579             log_err("FAIL: unexpected break at position %d found\n", pos);
 580             break;
 581         }
 582         if (pos != i && breaks[i] == '*') {
 583             log_err("FAIL: expected break at position %d not found.\n", i);
 584             break;
 585         }
 586         if (pos == i) {
 587             int32_t tag, expectedTag;
 588             tag = ubrk_getRuleStatus(bi);
 589             expectedTag = tagMap[tags[i]&0xf];
 590             if (tag != expectedTag) {
 591                 log_err("FAIL: incorrect tag value.  Position = %d;  expected tag %d, got %d",
 592                     pos, expectedTag, tag);
 593                 break;
 594             }
 595             pos = ubrk_next(bi);
 596         }
 597     }
 598
 599     /* #12914 add basic sanity test for ubrk_getBinaryRules, ubrk_openBinaryRules */
 600     /* Underlying functionality checked in C++ rbbiapts.cpp TestRoundtripRules */
 601     status = U_ZERO_ERROR;
 602     int32_t rulesLength = ubrk_getBinaryRules(bi, NULL, 0, &status); /* preflight */
 603     if (U_FAILURE(status)) {
 604         log_err("FAIL: ubrk_getBinaryRules preflight err: %s", u_errorName(status));
 605     } else {
 606         uint8_t* binaryRules = (uint8_t*)uprv_malloc(rulesLength);
 607         if (binaryRules == NULL) {
 608             log_err("FAIL: unable to malloc rules buffer, size %u", rulesLength);
 609         } else {
 610             rulesLength = ubrk_getBinaryRules(bi, binaryRules, rulesLength, &status);
 611             if (U_FAILURE(status)) {
 612                 log_err("FAIL: ubrk_getBinaryRules err: %s", u_errorName(status));
 613             } else {
 614                 UBreakIterator* bi2 = ubrk_openBinaryRules(binaryRules, rulesLength, uData, -1, &status);
 615                 if (U_FAILURE(status)) {
 616                     log_err("FAIL: ubrk_openBinaryRules err: %s", u_errorName(status));
 617                 } else {
 618                     int32_t maxCount = sizeof(breaks); /* fail-safe test limit */
 619                     int32_t pos2 = ubrk_first(bi2);
 620                     pos = ubrk_first(bi);
 621                     do {
 622                         if (pos2 != pos) {
 623                             log_err("FAIL: interator from ubrk_openBinaryRules does not match original, get pos = %d instead of %d", pos2, pos);
 624                         }
 625                         pos2 = ubrk_next(bi2);
 626                         pos = ubrk_next(bi);
 627                     } while ((pos != UBRK_DONE || pos2 != UBRK_DONE) && maxCount-- > 0);
 628
 629                     ubrk_close(bi2);
 630                 }
 631             }
 632             uprv_free(binaryRules);
 633         }
 634     }
 635
 636     freeToUCharStrings(&freeHook);
 637     ubrk_close(bi);
 638 }
 639
 640 static void TestBreakIteratorRuleError() {
 641 /*
 642  *  TestBreakIteratorRuleError -   Try to create a BI from rules with syntax errors,
 643  *                                 check that the error is reported correctly.
 644  */
 645     char            rules[]  = "           #  This is a rule comment on line 1\n"
 646                                "[:L:];     # this rule is OK.\n"
 647                                "abcdefg);  # Error, mismatched parens\n";
 648     UChar          *uRules;
 649     void           *freeHook = NULL;
 650     UErrorCode      status   = U_ZERO_ERROR;
 651     UParseError     parseErr;
 652     UBreakIterator *bi;
 653
 654     uRules = toUChar(rules, &freeHook);
 655     bi = ubrk_openRules(uRules,  -1,          /*  The rules  */
 656                         NULL,  -1,            /*  The text to be iterated over. */
 657                         &parseErr, &status);
 658     if (U_SUCCESS(status)) {
 659         log_err("FAIL: construction of break iterator succeeded when it should have failed.\n");
 660         ubrk_close(bi);
 661     } else {
 662         if (parseErr.line != 3 || parseErr.offset != 8) {
 663             log_data_err("FAIL: incorrect error position reported. Got line %d, char %d, expected line 3, char 7 (Are you missing data?)\n",
 664                 parseErr.line, parseErr.offset);
 665         }
 666     }
 667     freeToUCharStrings(&freeHook);
 668 }
 669
 670
 671 /*
 672 *   TestsBreakIteratorStatusVals()   Test the ubrk_getRuleStatusVec() funciton
 673 */
 674 static void TestBreakIteratorStatusVec() {
 675     #define RULE_STRING_LENGTH 200
 676     UChar          rules[RULE_STRING_LENGTH];
 677
 678     #define TEST_STRING_LENGTH 25
 679     UChar           testString[TEST_STRING_LENGTH];
 680     UBreakIterator *bi        = NULL;
 681     int32_t         pos       = 0;
 682     int32_t         vals[10];
 683     int32_t         numVals;
 684     UErrorCode      status    = U_ZERO_ERROR;
 685
 686     u_uastrncpy(rules,  "[A-N]{100}; \n"
 687                              "[a-w]{200}; \n"
 688                              "[\\p{L}]{300}; \n"
 689                              "[\\p{N}]{400}; \n"
 690                              "[0-5]{500}; \n"
 691                               "!.*;\n", RULE_STRING_LENGTH);
 692     u_uastrncpy(testString, "ABC", TEST_STRING_LENGTH);
 693
 694
 695     bi = ubrk_openRules(rules, -1, testString, -1, NULL, &status);
 696     TEST_ASSERT_SUCCESS(status);
 697     TEST_ASSERT(bi != NULL);
 698
 699     /* The TEST_ASSERT above should change too... */
 700     if (bi != NULL) {
 701         pos = ubrk_next(bi);
 702         TEST_ASSERT(pos == 1);
 703
 704         memset(vals, -1, sizeof(vals));
 705         numVals = ubrk_getRuleStatusVec(bi, vals, 10, &status);
 706         TEST_ASSERT_SUCCESS(status);
 707         TEST_ASSERT(numVals == 2);
 708         TEST_ASSERT(vals[0] == 100);
 709         TEST_ASSERT(vals[1] == 300);
 710         TEST_ASSERT(vals[2] == -1);
 711
 712         numVals = ubrk_getRuleStatusVec(bi, vals, 0, &status);
 713         TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
 714         TEST_ASSERT(numVals == 2);
 715     }
 716
 717     ubrk_close(bi);
 718 }
 719
 720
 721 /*
 722  *  static void TestBreakIteratorUText(void);
 723  *
 724  *         Test that ubrk_setUText() is present and works for a simple case.
 725  */
 726 static void TestBreakIteratorUText(void) {
 727     const char *UTF8Str = "\x41\xc3\x85\x5A\x20\x41\x52\x69\x6E\x67";  /* c3 85 is utf-8 for A with a ring on top */
 728                       /*   0  1   2 34567890  */
 729
 730     UErrorCode      status = U_ZERO_ERROR;
 731     UBreakIterator *bi     = NULL;
 732     int32_t         pos    = 0;
 733
 734
 735     UText *ut = utext_openUTF8(NULL, UTF8Str, -1, &status);
 736     TEST_ASSERT_SUCCESS(status);
 737
 738     bi = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status);
 739     if (U_FAILURE(status)) {
 740         log_err_status(status, "Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));
 741         return;
 742     }
 743
 744     ubrk_setUText(bi, ut, &status);
 745     if (U_FAILURE(status)) {
 746         log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));
 747         return;
 748     }
 749
 750     pos = ubrk_first(bi);
 751     TEST_ASSERT(pos == 0);
 752
 753     pos = ubrk_next(bi);
 754     TEST_ASSERT(pos == 4);
 755
 756     pos = ubrk_next(bi);
 757     TEST_ASSERT(pos == 5);
 758
 759     pos = ubrk_next(bi);
 760     TEST_ASSERT(pos == 10);
 761
 762     pos = ubrk_next(bi);
 763     TEST_ASSERT(pos == UBRK_DONE);
 764     ubrk_close(bi);
 765     utext_close(ut);
 766 }
 767
 768 /*
 769  *  static void TestBreakIteratorTailoring(void);
 770  *
 771  *         Test break iterator tailorings from CLDR data.
      *
      *         Each NUL-terminated test text below is followed by the boundary
      *         offsets the test expects from ubrk_next() (the ...Fwd lists) and
      *         from ubrk_previous() (the ...Rev lists).  Blank columns inside an
      *         offset list keep its entries aligned with the neighbouring list.
 772  */
 773
 774 /* Thai/Lao grapheme break tailoring */
 775 static const UChar thTest[] = { 0x0020, 0x0E40, 0x0E01, 0x0020,
 776                                 0x0E01, 0x0E30, 0x0020, 0x0E01, 0x0E33, 0x0020, 0 };
 777 /* In Unicode 6.1 "en" should behave just like "th" for this text, so the en-specific lists are commented out. */
 778 /*static const int32_t thTestOffs_enFwd[] = {  1,      3,  4,      6,  7,      9, 10 };*/
 779 static const int32_t thTestOffs_thFwd[] = {  1,  2,  3,  4,  5,  6,  7,      9, 10 };
 780 /*static const int32_t thTestOffs_enRev[] = {  9,      7,  6,      4,  3,      1,  0 };*/
 781 static const int32_t thTestOffs_thRev[] = {  9,      7,  6,  5,  4,  3,  2,  1,  0 };
 782
 783 /* Hebrew line break tailoring, for cldrbug 3028 */
 784 static const UChar heTest[] = { 0x0020, 0x002D, 0x0031, 0x0032, 0x0020,
 785                                 0x0061, 0x002D, 0x006B, 0x0020,
 786                                 0x0061, 0x0300, 0x2010, 0x006B, 0x0020,
 787                                 0x05DE, 0x05D4, 0x002D, 0x0069, 0x0020,
 788                                 0x05D1, 0x05BC, 0x2010, 0x0047, 0x0020, 0 };
 789 /* In Unicode 6.1 "en" should behave just like "he" for this text, so the en-specific lists are commented out. */
 790 /*static const int32_t heTestOffs_enFwd[] = {  1,  5,  7,  9, 12, 14, 17, 19, 22, 24 };*/
 791 static const int32_t heTestOffs_heFwd[] = {  1,  5,  7,  9, 12, 14,     19,     24 };
 792 /*static const int32_t heTestOffs_enRev[] = { 22, 19, 17, 14, 12,  9,  7,  5,  1,  0 };*/
 793 static const int32_t heTestOffs_heRev[] = {     19,     14, 12,  9,  7,  5,  1,  0 };
 794
 795 /* Finnish line break tailoring, for cldrbug 3029 */
 796 static const UChar fiTest[] = { /* 00 */ 0x0020, 0x002D, 0x0031, 0x0032, 0x0020,
 797                                 /* 05 */ 0x0061, 0x002D, 0x006B, 0x0020,
 798                                 /* 09 */ 0x0061, 0x0300, 0x2010, 0x006B, 0x0020,
 799                                 /* 14 */ 0x0061, 0x0020, 0x002D, 0x006B, 0x0020,
 800                                 /* 19 */ 0x0061, 0x0300, 0x0020, 0x2010, 0x006B, 0x0020, 0 };
 801 static const int32_t fiTestOffs_enFwd[] =  {  1,  5,  7,  9, 12, 14, 16, 17, 19, 22, 23, 25 };
 802 static const int32_t fiTestOffs_fiFwd[] =  {  1,  5,  7,  9, 12, 14, 16,     19, 22,     25 };
 803 static const int32_t fiTestOffs_enRev[] =  { 23, 22, 19, 17, 16, 14, 12,  9,  7,  5,  1,  0 };
 804 static const int32_t fiTestOffs_fiRev[] =  {     22, 19,     16, 14, 12,  9,  7,  5,  1,  0 };
 805
 806 /* Khmer dictionary-based word break, for ICU ticket #8329 */
 807 static const UChar kmTest[] = { /* 00 */ 0x179F, 0x17BC, 0x1798, 0x1785, 0x17C6, 0x178E, 0x17B6, 0x1799, 0x1796, 0x17C1,
 808                                 /* 10 */ 0x179B, 0x1794, 0x1793, 0x17D2, 0x178F, 0x17B7, 0x1785, 0x178A, 0x17BE, 0x1798,
 809                                 /* 20 */ 0x17D2, 0x1794, 0x17B8, 0x17A2, 0x1792, 0x17B7, 0x179F, 0x17D2, 0x178B, 0x17B6,
 810                                 /* 30 */ 0x1793, 0x17A2, 0x179A, 0x1796, 0x17D2, 0x179A, 0x17C7, 0x1782, 0x17BB, 0x178E,
 811                                 /* 40 */ 0x178A, 0x179B, 0x17CB, 0x1796, 0x17D2, 0x179A, 0x17C7, 0x17A2, 0x1784, 0x17D2,
 812                                 /* 50 */ 0x1782, 0 };
     /* Offsets 8 and 33 are commented out of both Khmer lists below, so the test does not currently expect breaks there. */
 813 static const int32_t kmTestOffs_kmFwd[] =  {  3, /*8,*/ 11, 17, 23, 31, /*33,*/  40,  43, 51 }; /* TODO: Investigate failure to break at offset 8 */
 814 static const int32_t kmTestOffs_kmRev[] =  { 43,  40,   /*33,*/ 31, 23, 17, 11, /*8,*/ 3,  0 };
 815
 816
 817 /* Korean line break: keepAll vs Normal (selected with the @lw= locale keyword in tailoringTests) */
 818 static const UChar koTest[] = { /* 00 */ 0xBAA8, 0xB4E0, 0x0020, 0xC778, 0xB958, 0x0020, 0xAD6C, 0xC131, 0xC6D0, 0xC758,
 819                                 /* 10 */ 0x0020, 0xCC9C, 0xBD80, 0xC758, 0x0020, 0xC874, 0xC5C4, 0xC131, 0xACFC, 0x0020,
 820                                 /* 20 */ 0xB3D9, 0xB4F1, 0xD558, 0xACE0, 0x0020, 0xC591, 0xB3C4, 0xD560, 0 };
 821 static const int32_t koTestOffs_koKeepFwd[] =  {   3,  6, 11, 15, 20, 25, 28 };
 822 static const int32_t koTestOffs_koKeepRev[] =  {  25, 20, 15, 11,  6,  3,  0 };
 823 static const int32_t koTestOffs_koNormFwd[] =  {  1,  3,  4,  6,  7,  8,  9, 11, 12, 13, 15, 16, 17, 18, 20, 21, 22, 23, 25, 26, 27, 28 };
 824 static const int32_t koTestOffs_koNormRev[] =  { 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 13, 12, 11,  9,  8,  7,  6,  4,  3,  1,  0 };
 825
 826 typedef struct {                     /* one tailoring case: a locale, a break type, a text and its expected boundaries */
 827     const char * locale;             /* locale ID handed to ubrk_open(); NULL marks the end of the table */
 828     UBreakIteratorType type;         /* kind of break iterator to open */
 829     const UChar * test;              /* NUL-terminated text (it is opened with length -1) */
 830     const int32_t * offsFwd;         /* offsets expected from successive ubrk_next() calls */
 831     const int32_t * offsRev;         /* offsets expected from successive ubrk_previous() calls */
 832     int32_t numOffsets;              /* entry count used to bound both offsFwd and offsRev */
 833 } RBBITailoringTest;
 834
 835 static const RBBITailoringTest tailoringTests[] = {   /* cases run by TestBreakIteratorTailoring(); last row is the NULL-locale sentinel */
         /* The "en"/"en_US_POSIX" rows for thTest and the "en" row for heTest reuse the th/he expectations. */
         /* numOffsets is always taken from the Fwd list; each Rev list has the same number of entries. */
 836     { "en",            UBRK_CHARACTER, thTest, thTestOffs_thFwd, thTestOffs_thRev, UPRV_LENGTHOF(thTestOffs_thFwd) },
 837     { "en_US_POSIX",   UBRK_CHARACTER, thTest, thTestOffs_thFwd, thTestOffs_thRev, UPRV_LENGTHOF(thTestOffs_thFwd) },
 838     { "en",            UBRK_LINE,      heTest, heTestOffs_heFwd, heTestOffs_heRev, UPRV_LENGTHOF(heTestOffs_heFwd) },
 839     { "he",            UBRK_LINE,      heTest, heTestOffs_heFwd, heTestOffs_heRev, UPRV_LENGTHOF(heTestOffs_heFwd) },
 840     { "en",            UBRK_LINE,      fiTest, fiTestOffs_enFwd, fiTestOffs_enRev, UPRV_LENGTHOF(fiTestOffs_enFwd) },
 841     { "fi",            UBRK_LINE,      fiTest, fiTestOffs_fiFwd, fiTestOffs_fiRev, UPRV_LENGTHOF(fiTestOffs_fiFwd) },
 842     { "km",            UBRK_WORD,      kmTest, kmTestOffs_kmFwd, kmTestOffs_kmRev, UPRV_LENGTHOF(kmTestOffs_kmFwd) },
         /* Plain "ko" is given the same expectations as "ko@lw=keepall". */
 843     { "ko",            UBRK_LINE,      koTest, koTestOffs_koKeepFwd, koTestOffs_koKeepRev, UPRV_LENGTHOF(koTestOffs_koKeepFwd) },
 844     { "ko@lw=keepall", UBRK_LINE,      koTest, koTestOffs_koKeepFwd, koTestOffs_koKeepRev, UPRV_LENGTHOF(koTestOffs_koKeepFwd) },
 845     { "ko@lw=normal",  UBRK_LINE,      koTest, koTestOffs_koNormFwd, koTestOffs_koNormRev, UPRV_LENGTHOF(koTestOffs_koNormFwd) },
 846     { NULL, 0, NULL, NULL, NULL, 0 },
 847 };
 848
 849 static void TestBreakIteratorTailoring(void) {
         /*
          *  Walks tailoringTests up to its NULL-locale sentinel.  For each row a
          *  break iterator is opened on the row's text; the boundaries returned by
          *  ubrk_next() and then by ubrk_previous() are compared, in order, with
          *  offsFwd and offsRev.  Only the first mismatch in each direction is
          *  logged.  If ubrk_open() fails, the failure is logged and the row skipped.
          */
 850     const RBBITailoringTest * testPtr;
 851     for (testPtr = tailoringTests; testPtr->locale != NULL; ++testPtr) {
 852         UErrorCode status = U_ZERO_ERROR;
 853         UBreakIterator* ubrkiter = ubrk_open(testPtr->type, testPtr->locale, testPtr->test, -1, &status);
 854         if ( U_SUCCESS(status) ) {
 855             int32_t offset, offsindx;
 856             UBool foundError;
 857
 858             foundError = FALSE;
 859             ubrk_first(ubrkiter);   /* forward pass: start from the first boundary */
 860             for (offsindx = 0; (offset = ubrk_next(ubrkiter)) != UBRK_DONE; ++offsindx) {
 861                 if (!foundError && offsindx >= testPtr->numOffsets) {   /* iterator produced more boundaries than listed */
 862                     log_err("FAIL: locale %s, break type %d, ubrk_next expected UBRK_DONE, got %d\n",
 863                             testPtr->locale, testPtr->type, offset);
 864                     foundError = TRUE;
 865                 } else if (!foundError && offset != testPtr->offsFwd[offsindx]) {   /* index is in range here: the branch above did not fire */
 866                     log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got %d\n",
 867                             testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx], offset);
 868                     foundError = TRUE;
 869                 }
 870             }
 871             if (!foundError && offsindx < testPtr->numOffsets) {   /* iterator ran out before the expected list did */
 872                 log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
 873                         testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
 874             }
 875
 876             foundError = FALSE;
 877             ubrk_last(ubrkiter);    /* reverse pass: same checks, walking back from the last boundary */
 878             for (offsindx = 0; (offset = ubrk_previous(ubrkiter)) != UBRK_DONE; ++offsindx) {
 879                 if (!foundError && offsindx >= testPtr->numOffsets) {
 880                     log_err("FAIL: locale %s, break type %d, ubrk_previous expected UBRK_DONE, got %d\n",
 881                             testPtr->locale, testPtr->type, offset);
 882                     foundError = TRUE;
 883                 } else if (!foundError && offset != testPtr->offsRev[offsindx]) {
 884                     log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got %d\n",
 885                             testPtr->locale, testPtr->type, testPtr->offsRev[offsindx], offset);
 886                     foundError = TRUE;
 887                 }
 888             }
 889             if (!foundError && offsindx < testPtr->numOffsets) {
 890                 log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
 891                         testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
 892             }
 893
 894             ubrk_close(ubrkiter);
 895         } else {
 896             log_err_status(status, "FAIL: locale %s, break type %d, ubrk_open status: %s\n", testPtr->locale, testPtr->type, u_errorName(status));
 897         }
 898     }
 899 }
 900
 901
 902 static void TestBreakIteratorRefresh(void) {
 903     /*
 904      *  RefreshInput swaps out the input of a Break Iterator without
 905      *    changing anything else in the iterator's state.  Used with Java JNI,
 906      *    when Java moves the underlying string storage.   This test
 907      *    calls ubrk_next() repeatedly, moving the text in the middle of the sequence.
 908      *    The right set of boundaries should still be found.
 909      */
 910     UChar testStr[]  = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0};  /* = " A B C D"  */
 911     UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,  0};   /* same length as testStr; receives the copy */
 912     UErrorCode status = U_ZERO_ERROR;
 913     UBreakIterator *bi;
 914     UText ut1 = UTEXT_INITIALIZER;   /* stack UText over testStr */
 915     UText ut2 = UTEXT_INITIALIZER;   /* stack UText over movedStr */
 916
 917     bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
 918     TEST_ASSERT_SUCCESS(status);
 919     if (U_FAILURE(status)) {
 920         return;
 921     }
 922
 923     utext_openUChars(&ut1, testStr, -1, &status);
 924     TEST_ASSERT_SUCCESS(status);
 925     ubrk_setUText(bi, &ut1, &status);
 926     TEST_ASSERT_SUCCESS(status);
 927
 928     if (U_SUCCESS(status)) {
 929         /* Line boundaries will occur before each letter in the original string */
 930         TEST_ASSERT(1 == ubrk_next(bi));
 931         TEST_ASSERT(3 == ubrk_next(bi));
 932
 933         /* Move the string, kill the original string.  */
 934         u_strcpy(movedStr, testStr);
 935         u_memset(testStr, 0x20, u_strlen(testStr));   /* overwrite every original code unit with a space */
 936         utext_openUChars(&ut2, movedStr, -1, &status);
 937         TEST_ASSERT_SUCCESS(status);
 938         ubrk_refreshUText(bi, &ut2, &status);
 939         TEST_ASSERT_SUCCESS(status);
 940
 941         /* Find the following matches, now working in the moved string. */
 942         TEST_ASSERT(5 == ubrk_next(bi));
 943         TEST_ASSERT(7 == ubrk_next(bi));
 944         TEST_ASSERT(8 == ubrk_next(bi));   /* end of the 8-unit text */
 945         TEST_ASSERT(UBRK_DONE == ubrk_next(bi));
 946         TEST_ASSERT_SUCCESS(status);
 947
 948         utext_close(&ut1);   /* note: both UTexts are closed only on this success path */
 949         utext_close(&ut2);
 950     }
 951     ubrk_close(bi);
 952 }
 953
 954
 955 static void TestBug11665(void) {
 956     // The problem was with the incorrect breaking of Japanese text beginning
 957     // with Katakana characters when no prior Japanese or Chinese text had been
 958     // encountered.
 959     //
 960     // Tested here in cintltst, rather than in intltest, because only cintltst
 961     // tests have the ability to reset ICU, which is needed to get the bug
 962     // to manifest itself.
         //
         // The test iterates over the same text twice with one iterator, recording
         // the boundaries on the first pass and requiring the second pass to
         // reproduce them exactly.
 963
 964     static UChar japaneseText[] = {0x30A2, 0x30EC, 0x30EB, 0x30AE, 0x30FC, 0x6027, 0x7D50, 0x819C, 0x708E};  // not NUL-terminated; its length is passed explicitly
 965     int32_t boundaries[10] = {0};   // boundaries recorded on the first pass
 966     UBreakIterator *bi = NULL;
 967     int32_t brk;
 968     int32_t brkIdx = 0;
 969     int32_t totalBreaks = 0;
 970     UErrorCode status = U_ZERO_ERROR;
 971
 972     ctest_resetICU();
 973     bi = ubrk_open(UBRK_WORD, "en_US", japaneseText, UPRV_LENGTHOF(japaneseText), &status);
 974     TEST_ASSERT_SUCCESS(status);
 975     if (!bi) {
 976         return;
 977     }
 978     for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) {
 979         boundaries[brkIdx] = brk;
 980         if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) {   // stop recording once 9 boundaries are stored
 981             break;
 982         }
 983     }
         // NOTE(review): the loop above leaves brkIdx at 9 or less, so the
         // "brkIdx >= UPRV_LENGTHOF(boundaries)" half of this check looks unreachable.
 984     if (brkIdx <= 2 || brkIdx >= UPRV_LENGTHOF(boundaries)) {
 985         log_err("%s:%d too few or many breaks found.\n", __FILE__, __LINE__);
 986     } else {
 987         totalBreaks = brkIdx;
 988         brkIdx = 0;
 989         for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) {   // second pass over the same text
 990             if (brk != boundaries[brkIdx]) {
 991                 log_err("%s:%d Break #%d differs between first and second iteration.\n", __FILE__, __LINE__, brkIdx);
 992                 break;
 993             }
 994             if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) {
 995                 log_err("%s:%d Too many breaks.\n", __FILE__, __LINE__);
 996                 break;
 997             }
 998         }
 999         if (totalBreaks != brkIdx) {
1000             log_err("%s:%d Number of breaks differ between first and second iteration.\n", __FILE__, __LINE__);
1001         }
1002     }
1003     ubrk_close(bi);
1004 }
1005
1006 /*
      * Test data for TestBreakIteratorSuppressions().
      *
      * Each text is a char string that is run through u_unescape() before use,
      * which is why non-ASCII letters are written as \\uXXXX escapes.
      *
1007  * expOffset is the set of expected offsets, ending with '-1'.
1008  * "Expected expOffset -1" means "expected the end of the offsets"
      *
      * ...Fwd... lists hold the offsets expected when iterating forward and
      * ...Rev... lists those expected when iterating backward; names containing
      * "Supp" are the variants used with the "@ss=standard" locales.
1009  */
1010
1011 static const char testSentenceSuppressionsEn[]  = "Mr. Jones comes home. Dr. Smith Ph.D. is out. In the U.S.A. it is hot.";
1012 static const int32_t testSentSuppFwdOffsetsEn[] = { 22, 46, 70, -1 };         /* With suppressions */
1013 static const int32_t testSentFwdOffsetsEn[]     = {  4, 22, 26, 46, 70, -1 }; /* Without suppressions */
1014 static const int32_t testSentSuppRevOffsetsEn[] = { 46, 22,  0, -1 };         /* With suppressions */
1015 static const int32_t testSentRevOffsetsEn[]     = { 46, 26, 22,  4,  0, -1 }; /* Without suppressions */
1016
1017 static const char testSentenceSuppressionsDe[]  = "Wenn ich schon h\\u00F6re zu Guttenberg kommt evtl. zur\\u00FCck.";
1018 //                                                "Wenn ich schon höre zu Guttenberg kommt evtl. zurück."
1019 static const int32_t testSentSuppFwdOffsetsDe[] = { 53, -1 };       /* With suppressions */
1020 static const int32_t testSentFwdOffsetsDe[]     = { 53, -1 };       /* Without suppressions; no break in evtl. zur due to casing */
1021 static const int32_t testSentSuppRevOffsetsDe[] = {  0, -1 };       /* With suppressions */
1022 static const int32_t testSentRevOffsetsDe[]     = {  0, -1 };       /* Without suppressions */
1023
1024 static const char testSentenceSuppressionsEs[]  = "Te esperamos todos los miercoles en Bravo 416, Col. El Pueblo a las 7 PM.";
1025 static const int32_t testSentSuppFwdOffsetsEs[] = { 73, -1 };       /* With suppressions */
1026 static const int32_t testSentFwdOffsetsEs[]     = { 52, 73, -1 };   /* Without suppressions */
1027 static const int32_t testSentSuppRevOffsetsEs[] = {  0, -1 };       /* With suppressions */
1028 static const int32_t testSentRevOffsetsEs[]     = { 52,  0, -1 };   /* Without suppressions */
1029
     /* E1 and E2: mixed-case and all-uppercase ("u") forms of a text share one pair of offset lists. */
1030 static const char testSentenceSuppressionsE1[]  = "Add or detract. The world will little note.";
1031 static const char testSentenceSuppressionsE1u[] = "ADD OR DETRACT. THE WORLD WILL LITTLE NOTE.";
1032 static const int32_t testSentFwdOffsetsE1[]     = { 16, 43, -1 };   /* Suppressions and case should make no difference */
1033 static const int32_t testSentRevOffsetsE1[]     = { 16,  0, -1 };   /* Suppressions and case should make no difference */
1034
1035 static const char testSentenceSuppressionsE2[]  = "Coming up, the sprints at NCAA. Are you watching?";
1036 static const char testSentenceSuppressionsE2u[] = "COMING UP, THE SPRINTS AT NCAA. ARE YOU WATCHING?";
1037 static const int32_t testSentFwdOffsetsE2[]     = { 32, 49, -1 };   /* Suppressions and case should make no difference */
1038 static const int32_t testSentRevOffsetsE2[]     = { 32,  0, -1 };   /* Suppressions and case should make no difference */
1039
1040 static const char testSentenceSuppressionsFr[]  = "Tr\\u00E8s bonne prise de parole de M. Junod, municipal \\u00E0 la culture de Lausanne.";
1041 //                                                "Très bonne prise de parole de M. Junod, municipal à la culture de Lausanne."
1042 static const int32_t testSentFwdOffsetsFr[]     = { 33, 75, -1 };   /* Without suppressions */
1043 static const int32_t testSentSuppFwdOffsetsFr[] = { 75, -1 };       /* With suppressions */
1044 static const int32_t testSentRevOffsetsFr[]     = { 33,  0, -1 };   /* Without suppressions */
1045 static const int32_t testSentSuppRevOffsetsFr[] = {  0, -1 };       /* With suppressions */
1046
     /* E3: without suppressions the uppercase form expects one more break (at 46) than the mixed-case form. */
1047 static const char testSentenceSuppressionsE3[]  = "G8 countries e.g. U.K., Japan. Sanctions i.e. restrictions. Test E. Xx G. Xx I. Xx.";
1048 static const char testSentenceSuppressionsE3u[] = "G8 COUNTRIES E.G. U.K., JAPAN. SANCTIONS I.E. RESTRICTIONS. TEST E. XX G. XX I. XX.";
1049 static const int32_t testSentSuppFwdOffsetsE3[] = { 31, 60, 83, -1 };                 /* With suppressions */
1050 static const int32_t testSentSuppRevOffsetsE3[] = { 60, 31,  0, -1 };                 /* With suppressions */
1051 static const int32_t testSentFwdOffsetsE3[]     = { 18, 31, 60, 68, 74, 80, 83, -1 }; /* Without suppressions */
1052 static const int32_t testSentRevOffsetsE3[]     = { 80, 74, 68, 60, 31, 18,  0, -1 }; /* Without suppressions */
1053 static const int32_t testSentFwdOffsetsE3u[]    = { 18, 31, 46, 60, 68, 74, 80, 83, -1 }; /* Without suppressions */
1054 static const int32_t testSentRevOffsetsE3u[]    = { 80, 74, 68, 60, 46, 31, 18,  0, -1 }; /* Without suppressions */
1055
1056 enum { kTextULenMax = 128, kTextBLenMax = 192 };   /* sizes of the textU (UChar) and textB (char) buffers in TestBreakIteratorSuppressions() */
1057
1058 typedef struct {                       /* one sentence-break case for TestBreakIteratorSuppressions() */
1059     const char * locale;               /* locale ID for ubrk_open(); NULL marks the end of the table */
1060     const char * text;                 /* text to break, passed through u_unescape() first */
1061     const int32_t * expFwdOffsets;     /* expected forward offsets, terminated by -1 */
1062     const int32_t * expRevOffsets;     /* expected backward offsets, terminated by -1 */
1063 } TestBISuppressionsItem;
1064
1065 static const TestBISuppressionsItem testBISuppressionsItems[] = {   /* rows: locale, text, expected forward offsets, expected reverse offsets */
1066     { "en@ss=standard", testSentenceSuppressionsEn, testSentSuppFwdOffsetsEn, testSentSuppRevOffsetsEn },
1067     { "en",             testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     },
1068     { "en_CA",             testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     },
1069     { "en_CA@ss=standard", testSentenceSuppressionsEn, testSentSuppFwdOffsetsEn, testSentSuppRevOffsetsEn },
         /* The rows below pair non-English "@ss=standard" locales with the English text and expect the unsuppressed offsets. */
1070     { "fr@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     },
1071     { "af@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* no brkiter data => nosuppressions? */
1072     { "af_ZA@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* no brkiter data => nosuppressions? */
1073     { "zh@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
1074     { "zh_Hant@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn, testSentRevOffsetsEn    }, /* brkiter data, no suppressions data => no suppressions */
1075     { "fi@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
1076     { "ja@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
1077     { "de@ss=standard", testSentenceSuppressionsDe, testSentSuppFwdOffsetsDe, testSentSuppRevOffsetsDe },
1078     { "de",             testSentenceSuppressionsDe, testSentFwdOffsetsDe,     testSentRevOffsetsDe     },
1079     { "es@ss=standard", testSentenceSuppressionsEs, testSentSuppFwdOffsetsEs, testSentSuppRevOffsetsEs },
1080     { "es",             testSentenceSuppressionsEs, testSentFwdOffsetsEs,     testSentRevOffsetsEs     },
1081     { "en",             testSentenceSuppressionsE1,  testSentFwdOffsetsE1,    testSentRevOffsetsE1     },
1082     { "en@ss=standard", testSentenceSuppressionsE1,  testSentFwdOffsetsE1,    testSentRevOffsetsE1     },
1083     { "en",             testSentenceSuppressionsE1u, testSentFwdOffsetsE1,    testSentRevOffsetsE1     },
1084     { "en@ss=standard", testSentenceSuppressionsE1u, testSentFwdOffsetsE1,    testSentRevOffsetsE1     },
1085     { "en",             testSentenceSuppressionsE2,  testSentFwdOffsetsE2,    testSentRevOffsetsE2     },
1086     { "en@ss=standard", testSentenceSuppressionsE2,  testSentFwdOffsetsE2,    testSentRevOffsetsE2     },
1087     { "en",             testSentenceSuppressionsE2u, testSentFwdOffsetsE2,    testSentRevOffsetsE2     },
1088     { "en@ss=standard", testSentenceSuppressionsE2u, testSentFwdOffsetsE2,    testSentRevOffsetsE2     },
1089     { "fr",             testSentenceSuppressionsFr, testSentFwdOffsetsFr,     testSentRevOffsetsFr     },
1090     { "fr@ss=standard", testSentenceSuppressionsFr, testSentSuppFwdOffsetsFr, testSentSuppRevOffsetsFr },
1091     { "en@ss=standard", testSentenceSuppressionsE3, testSentSuppFwdOffsetsE3, testSentSuppRevOffsetsE3 },
1092     { "en",             testSentenceSuppressionsE3, testSentFwdOffsetsE3,     testSentRevOffsetsE3     },
1093     { "en@ss=standard", testSentenceSuppressionsE3u, testSentSuppFwdOffsetsE3, testSentSuppRevOffsetsE3 },
1094     { "en",             testSentenceSuppressionsE3u, testSentFwdOffsetsE3u,    testSentRevOffsetsE3u    },
1095     { NULL, NULL, NULL }   /* sentinel: NULL locale ends the table; the omitted fourth member is zero-initialized */
1096 };
1097
1098 static void TestBreakIteratorSuppressions(void) {
         /*
          *  For every row of testBISuppressionsItems (up to the NULL-locale
          *  sentinel) a sentence break iterator is opened for the row's locale on
          *  the unescaped text, and its boundaries are checked four ways:
          *  ubrk_next(), ubrk_following(), ubrk_previous() and ubrk_preceding().
          *  Each expected-offset list ends with -1.  Pointer differences that are
          *  printed with %d are cast to int, since their type is ptrdiff_t.
          */
1099     const TestBISuppressionsItem * itemPtr;
1100
1101     for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
1102         UChar textU[kTextULenMax + 1];
1103         char  textB[kTextBLenMax];   /* char copy of the text, used only in log messages */
1104         int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);
1105         textU[kTextULenMax] = 0; // ensure zero termination
1106         UErrorCode status = U_ZERO_ERROR;
1107         UBreakIterator *bi = ubrk_open(UBRK_SENTENCE, itemPtr->locale, textU, textULen, &status);
1108         log_verbose("#%d: %s\n", (int)(itemPtr-testBISuppressionsItems), itemPtr->locale);
1109         if (U_SUCCESS(status)) {
1110             int32_t offset, start;
1111             const int32_t * expOffsetPtr;
1112             const int32_t * expOffsetStart;
1113             u_austrcpy(textB, textU);
1114
                 /* 1. Forward with ubrk_next(), stopping at UBRK_DONE or the -1 terminator. */
1115             expOffsetStart = expOffsetPtr = itemPtr->expFwdOffsets;
1116             ubrk_first(bi);
1117             for (; (offset = ubrk_next(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
1118                 if (offset != *expOffsetPtr) {
1119                     log_err("FAIL: ubrk_next loc \"%s\", expected %d, got %d, text \"%s\"\n",
1120                             itemPtr->locale, *expOffsetPtr, offset, textB);
1121                 }
1122             }
1123             if (offset != UBRK_DONE || *expOffsetPtr >= 0) {   /* both sequences must end together */
1124                 log_err("FAIL: ubrk_next loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d, text \"%s\"\n",
1125                         itemPtr->locale, offset, *expOffsetPtr, textB);
1126             }
1127
                 /* 2. Forward with ubrk_following(), each time starting one past the previous expected offset. */
1128             expOffsetStart = expOffsetPtr = itemPtr->expFwdOffsets;
1129             start = ubrk_first(bi) + 1;
1130             for (; (offset = ubrk_following(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
1131                 if (offset != *expOffsetPtr) {
1132                     log_err("FAIL: ubrk_following(%d) loc \"%s\", expected %d, got %d, text \"%s\"\n",
1133                             start, itemPtr->locale, *expOffsetPtr, offset, textB);
1134                 }
1135                 start = *expOffsetPtr + 1;
1136             }
1137             if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
1138                 log_err("FAIL: ubrk_following(%d) loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d, text \"%s\"\n",
1139                         start, itemPtr->locale, offset, *expOffsetPtr, textB);
1140             }
1141
                 /* 3. Backward with ubrk_previous(), starting from the last boundary. */
1142             expOffsetStart = expOffsetPtr = itemPtr->expRevOffsets;
1143             offset = ubrk_last(bi);
1144             log_verbose("___ @%d ubrk_last\n", offset);
1145             if(offset == 0) {
1146               log_err("FAIL: ubrk_last loc \"%s\" unexpected %d\n", itemPtr->locale, offset);
1147             }
1148             for (; (offset = ubrk_previous(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
1149                 if (offset != *expOffsetPtr) {
1150                     log_err("FAIL: ubrk_previous loc \"%s\", expected %d, got %d, text \"%s\"\n",
1151                             itemPtr->locale, *expOffsetPtr, offset, textB);
1152                 } else {
1153                     log_verbose("[%d] @%d ubrk_previous()\n", (int)(expOffsetPtr - expOffsetStart), offset);
1154                 }
1155             }
1156             if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
1157                 log_err("FAIL: ubrk_previous loc \"%s\", expected UBRK_DONE & expOffset[%d] -1, got %d and %d, text \"%s\"\n",
1158                         itemPtr->locale, (int)(expOffsetPtr - expOffsetStart), offset, *expOffsetPtr, textB);
1159             }
1160
                 /* 4. Backward with ubrk_preceding(), each time starting one before the previous expected offset. */
1161             expOffsetStart = expOffsetPtr = itemPtr->expRevOffsets;
1162             start = ubrk_last(bi) - 1;
1163             for (; (offset = ubrk_preceding(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
1164                 if (offset != *expOffsetPtr) {
1165                     log_err("FAIL: ubrk_preceding(%d) loc \"%s\", expected %d, got %d, text \"%s\"\n",
1166                             start, itemPtr->locale, *expOffsetPtr, offset, textB);
1167                 }
1168                 start = *expOffsetPtr - 1;
1169             }
1170             if (start >=0 && (offset != UBRK_DONE || *expOffsetPtr >= 0)) {   /* the end check is skipped once start has gone below 0 */
1171                 log_err("FAIL: ubrk_preceding loc(%d) \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d, text \"%s\"\n",
1172                         start, itemPtr->locale, offset, *expOffsetPtr, textB);
1173             }
1174
1175             ubrk_close(bi);
1176         } else {
1177             log_data_err("FAIL: ubrk_open(UBRK_SENTENCE, \"%s\", ...) status %s (Are you missing data?)\n",
1178                     itemPtr->locale, u_errorName(status));
1179         }
1180     }
1181 }
1182
1183 #if APPLE_ADDITIONS
1184 #include <stdio.h>
1185 #include "unicode/urbtok.h"
1186 #include "cstring.h"
1187
1188 static const char testRulesFilePath[] = "../testdata/word_urbTokTest.txt";   /* rules file that TestRuleBasedTokenizer() opens with fopen() */
1189 static const UChar textToTokenize[] = {
1190 /*
     The NUL-terminated UTF-16 code units below spell out this text
     (each "\n" shown here is the 0x0A ending a row of the data):

1191 "Short phrase! Another (with parens); done.\n
1192 At 4:00, tea-time.\n
1193 He wouldn't've wanted y'all to ... come at 3:30pm for $3 coffee @funman :)\n
1194 x3:30 -- x1.0"
1195 */
1196     0x53,0x68,0x6F,0x72,0x74,0x20,0x70,0x68,0x72,0x61,0x73,0x65,0x21,0x20,
1197     0x41,0x6E,0x6F,0x74,0x68,0x65,0x72,0x20,0x28,0x77,0x69,0x74,0x68,0x20,0x70,0x61,0x72,0x65,0x6E,0x73,0x29,0x3B,0x20,0x64,0x6F,0x6E,0x65,0x2E,0x0A,
1198     0x41,0x74,0x20,0x34,0x3A,0x30,0x30,0x2C,0x20,0x74,0x65,0x61,0x2D,0x74,0x69,0x6D,0x65,0x2E,0x0A,
1199     0x48,0x65,0x20,0x77,0x6F,0x75,0x6C,0x64,0x6E,0x27,0x74,0x27,0x76,0x65,0x20,0x77,0x61,0x6E,0x74,0x65,0x64,0x20,
1200     0x79,0x27,0x61,0x6C,0x6C,0x20,0x74,0x6F,0x20,0x2E,0x2E,0x2E,0x20, 0x63,0x6F,0x6D,0x65,0x20,0x61,0x74,0x20,
1201     0x33,0x3A,0x33,0x30,0x70,0x6D,0x20,0x66,0x6F,0x72,0x20,0x24,0x33,0x20,0x63,0x6F,0x66,0x66,0x65,0x65,0x20,
1202     0x40,0x66,0x75,0x6E,0x6D,0x61,0x6E,0x20,0x3A,0x29,0x0A,
1203     0x78,0x33,0x3A,0x33,0x30,0x20,0x2D,0x2D,0x20,0x78,0x31,0x2E,0x30,0
1204 };
1205 typedef struct {                     /* one expected tokenizer result */
1206     RuleBasedTokenRange token;       /* in expectedResults the two values read as { start offset, length } within textToTokenize */
1207     unsigned long       flags;       /* expected flags value for the token; the meaning of the bits is not visible here */
1208 } RBTokResult;
1209 static const RBTokResult expectedResults[] = { // 67 entries as listed; consecutive ranges tile textToTokenize from offset 0 to 150
1210     { {  0, 5 }, 0xC8 },  // Short
1211     { {  5, 1 }, 0x01 },  // _sp_
1212     { {  6, 6 }, 0xC8 },  // phrase
1213     { { 12, 1 }, 0x00 },  // !
1214     { { 13, 1 }, 0x01 },  // _sp_
1215     { { 14, 7 }, 0xC8 },  // Another
1216     { { 21, 1 }, 0x01 },  // _sp_
1217     { { 22, 1 }, 0x00 },  // (
1218     { { 23, 4 }, 0xC8 },  // with
1219     { { 27, 1 }, 0x01 },  // _sp_
1220     { { 28, 6 }, 0xC8 },  // parens
1221     { { 34, 1 }, 0x00 },  // )
1222     { { 35, 1 }, 0x00 },  // ;
1223     { { 36, 1 }, 0x01 },  // _sp_
1224     { { 37, 4 }, 0xC8 },  // done
1225     { { 41, 1 }, 0x14 },  // .
1226     { { 42, 1 }, 0x00 },  // _nl_
1227
1228     { { 43, 2 }, 0xC8 },  // At
1229     { { 45, 1 }, 0x01 },  // _sp_
1230     { { 46, 4 }, 0x76 },  // 4:00       ** here RBBI has x64
1231     { { 50, 1 }, 0x00 },  // ,
1232     { { 51, 1 }, 0x01 },  // _sp_
1233     { { 52, 3 }, 0xC8 },  // tea
1234     { { 55, 1 }, 0x15 },  // -
1235     { { 56, 4 }, 0xC8 },  // time
1236     { { 60, 1 }, 0x14 },  // .
1237     { { 61, 1 }, 0x00 },  // _nl_
1238
1239     { { 62, 2 }, 0xC8 },  // He
1240     { { 64, 1 }, 0x01 },  // _sp_
1241     { { 65, 8 }, 0xCA },  // wouldn't
1242     { { 73, 1 }, 0x16 },  // '
1243     { { 74, 2 }, 0xC8 },  // ve
1244     { { 76, 1 }, 0x01 },  // _sp_
1245     { { 77, 6 }, 0xC8 },  // wanted
1246     { { 83, 1 }, 0x01 },  // _sp_
1247     { { 84, 5 }, 0xCA },  // y'all
1248     { { 89, 1 }, 0x01 },  // _sp_
1249     { { 90, 2 }, 0xC8 },  // to
1250     { { 92, 1 }, 0x01 },  // _sp_
1251     { { 93, 3 }, 0x3C },  // ...        ** here RBBI has 0x28
1252     { { 96, 1 }, 0x01 },  // _sp_
1253     { { 97, 4 }, 0xC8 },  // come
1254     { { 101, 1 }, 0x01 },  // _sp_
1255     { { 102, 2 }, 0xC8 },  // at
1256     { { 104, 1 }, 0x01 },  // _sp_
1257     { { 105, 6 }, 0xC8 },  // 3:30pm
1258     { { 111, 1 }, 0x01 },  // _sp_
1259     { { 112, 3 }, 0xC8 },  // for
1260     { { 115, 1 }, 0x01 },  // _sp_
1261     { { 116, 1 }, 0x00 },  // $
1262     { { 117, 1 }, 0x64 },  // 3
1263     { { 118, 1 }, 0x01 },  // _sp_
1264     { { 119, 6 }, 0xC8 },  // coffee
1265     { { 125, 1 }, 0x01 },  // _sp_
1266     { { 126, 7 }, 0xDF },  // @funman   ** here RBBI has 0xC8
1267     { { 133, 1 }, 0x01 },  // _sp_
1268     { { 134, 2 }, 0x20 },  // :)
1269     { { 136, 1 }, 0x00 },  // _nl_
1270
1271     // ** incorrect ranges (and flags) currently produced by RBTok
1272     { { 137, 2 }, 0xEC },  // x3
1273     { { 139, 1 }, 0x00 },  // :
1274     { { 140, 2 }, 0x64 },  // 30
1275     // ** for the above, RBBI has
1276     //{ { 137, 1 }, 0x64 },  // x
1277     //{ { 138, 4 }, 0x64 },  // 3:30
1278     //
1279     { { 142, 1 }, 0x01 },  // _sp_
1280     { { 143, 2 }, 0x3D },  // --        ** here RBBI has 0x28
1281     { { 145, 1 }, 0x01 },  //  _sp_
1282     { { 146, 2 }, 0xEC },  // x1        ** here RBBI has 0xC8
1283     { { 148, 1 }, 0x14 },  // .
1284     { { 149, 1 }, 0x64 },  // 0
1285 };
1286 enum {
1287     kNumTokensExpected = UPRV_LENGTHOF(expectedResults), // 67 for the table as listed above
1288     kMaxTokens = 96
1289 };
1290
1291 static void TestRuleBasedTokenizer(void) {
1292     FILE * testRulesFile;
1293     char * testRulesUTF8Buf;
1294     UChar* testRulesUTF16Buf = NULL;
1295     long testRulesFileSize, testRulesFileRead = 0;
1296     long testRulesUTF8Offset = 0;
1297     int32_t testRulesUTF16Size;
1298     UErrorCode status = U_ZERO_ERROR;
1299
1300     testRulesFile = fopen(testRulesFilePath, "r");
1301     if (testRulesFile == NULL) {
1302         log_data_err("FAIL: fopen fails for: %s\n", testRulesFilePath);
1303         return;
1304     }
1305     fseek(testRulesFile, 0, SEEK_END);
1306     testRulesFileSize = ftell(testRulesFile);
1307     rewind(testRulesFile);
1308
1309     testRulesUTF8Buf = (char *)uprv_malloc(testRulesFileSize);
1310     if (testRulesUTF8Buf != NULL) {
1311         testRulesFileRead = fread(testRulesUTF8Buf, 1, testRulesFileSize, testRulesFile);
1312     }
1313     fclose(testRulesFile);
1314     if (testRulesUTF8Buf == NULL) {
1315         log_data_err("FAIL: uprv_malloc fails for testRulesUTF8Buf[%ld]\n", testRulesFileSize);
1316         return;
1317     }
1318     if (testRulesFileRead < testRulesFileSize) {
1319         log_data_err("FAIL: fread fails for %s, read %ld of %ld\n", testRulesFile, testRulesFileRead, testRulesFileSize);
1320         uprv_free(testRulesUTF8Buf);
1321         return;
1322     }
1323     /* done with file, UTF8 rules in testRulesUTF8Buf. Handle UTF8 BOM: */
1324     if (uprv_strncmp(testRulesUTF8Buf, "\xEF\xBB\xBF", 3) == 0) {
1325         testRulesUTF8Offset = 3;
1326         testRulesFileSize -= testRulesUTF8Offset;
1327     }
1328
1329     u_strFromUTF8(NULL, 0, &testRulesUTF16Size, testRulesUTF8Buf+testRulesUTF8Offset, testRulesFileSize, &status); /* preflight */
1330     if (status == U_BUFFER_OVERFLOW_ERROR) { /* expected for preflight */
1331         status = U_ZERO_ERROR;
1332     }
1333     if (U_FAILURE(status)) {
1334         log_data_err("FAIL: u_strFromUTF8 preflight fails: %s\n", u_errorName(status));
1335     } else {
1336         testRulesUTF16Buf = (UChar *)uprv_malloc(testRulesUTF16Size*sizeof(UChar));
1337         if (testRulesUTF16Buf == NULL) {
1338             log_data_err("FAIL: uprv_malloc fails for testRulesUTF16Buf[%ld]\n", testRulesUTF16Size*sizeof(UChar));
1339         } else {
1340             u_strFromUTF8(testRulesUTF16Buf, testRulesUTF16Size, &testRulesUTF16Size, testRulesUTF8Buf+testRulesUTF8Offset, testRulesFileSize, &status);
1341         }
1342     }
1343     uprv_free(testRulesUTF8Buf);
1344     if (testRulesUTF16Buf == NULL) {
1345         return;
1346     }
1347     if (U_FAILURE(status)) {
1348         log_data_err("FAIL: u_strFromUTF8 fails: %s\n", u_errorName(status));
1349     } else {
1350         UParseError parseErr;
1351         UBreakIterator *brkFromRules = urbtok_openRules(testRulesUTF16Buf, testRulesUTF16Size, &parseErr, &status);
1352         if (U_FAILURE(status)) {
1353             log_err("FAIL: urbtok_openRules status: %s\n", u_errorName(status));
1354         } else {
1355             uint8_t *rulesBinaryBuf;
1356             uint32_t rulesBinarySize;
1357             rulesBinarySize = urbtok_getBinaryRules(brkFromRules, NULL, 0, &status);
1358             if (U_FAILURE(status)) {
1359                 log_err("FAIL: urbtok_getBinaryRules preflight status: %s, rulesBinarySize %u\n", u_errorName(status), rulesBinarySize);
1360             } else {
1361                 rulesBinaryBuf = (uint8_t *)uprv_malloc(rulesBinarySize);
1362                 if (rulesBinaryBuf == NULL) {
1363                     log_data_err("FAIL: uprv_malloc fails for rulesBinaryBuf[%ld]\n", rulesBinarySize);
1364                 } else {
1365                     rulesBinarySize = urbtok_getBinaryRules(brkFromRules, rulesBinaryBuf, rulesBinarySize, &status);
1366                     if (U_FAILURE(status)) {
1367                         log_err("FAIL: urbtok_getBinaryRules status: %s, rulesBinarySize %u\n", u_errorName(status), rulesBinarySize);
1368                     } else {
1369                         UBreakIterator *brkFromBinary = urbtok_openBinaryRules(rulesBinaryBuf, &status);
1370                         if (U_FAILURE(status)) {
1371                             log_err("FAIL: urbtok_openBinaryRules status: %s\n", u_errorName(status));
1372                         } else {
1373                             RuleBasedTokenRange tokens[kMaxTokens];
1374                             unsigned long       flags[kMaxTokens];
1375                             int32_t iToken, numTokens = 0;
1376
1377                             status = U_ZERO_ERROR;
1378                             ubrk_setText(brkFromRules, textToTokenize, -1, &status);
1379                             if (U_FAILURE(status)) {
1380                                 log_err("FAIL: ubrk_setText brkFromRules status: %s\n", u_errorName(status));
1381                             } else {
1382                                 numTokens = urbtok_tokenize(brkFromRules, kMaxTokens, tokens, flags);
1383                                 UBool fail = (numTokens != kNumTokensExpected);
1384                                 for (iToken = 0; !fail && iToken < numTokens; iToken++) {
1385                                     if (  tokens[iToken].location != expectedResults[iToken].token.location ||
1386                                           tokens[iToken].length   != expectedResults[iToken].token.length   ||
1387                                           flags[iToken]           != expectedResults[iToken].flags ) {
1388                                         fail = TRUE;
1389                                     }
1390                                 }
1391                                 if (fail) {
1392                                     log_err("FAIL: urbtok_tokenize brkFromRules expected %d tokens, got %d\n", kNumTokensExpected, numTokens);
1393                                     printf("# expect          get\n");
1394                                     printf("# loc len flags   loc len flags\n");
1395                                     int32_t maxTokens = (numTokens >= kNumTokensExpected)? numTokens: kNumTokensExpected;
1396                                     for (iToken = 0; iToken < maxTokens; iToken++) {
1397                                         if (iToken < kNumTokensExpected) {
1398                                             printf("  %3ld %3ld 0x%03lX", expectedResults[iToken].token.location,
1399                                                 expectedResults[iToken].token.length, expectedResults[iToken].flags);
1400                                         } else {
1401                                             printf("             ");
1402                                         }
1403                                         if (iToken < numTokens) {
1404                                             printf("   %3ld %3ld 0x%03lX\n",  tokens[iToken].location, tokens[iToken].length, flags[iToken] );
1405                                         } else {
1406                                             printf("\n");
1407                                         }
1408                                     }
1409                                 }
1410                             }
1411
1412                             status = U_ZERO_ERROR;
1413                             ubrk_setText(brkFromBinary, textToTokenize, -1, &status);
1414                             if (U_FAILURE(status)) {
1415                                 log_err("FAIL: ubrk_setText brkFromBinary status: %s\n", u_errorName(status));
1416                             } else {
1417                                 numTokens = urbtok_tokenize(brkFromBinary, kMaxTokens, tokens, flags);
1418                                 UBool fail = (numTokens != kNumTokensExpected);
1419                                 for (iToken = 0; !fail && iToken < numTokens; iToken++) {
1420                                     if (  tokens[iToken].location != expectedResults[iToken].token.location ||
1421                                           tokens[iToken].length   != expectedResults[iToken].token.length   ||
1422                                           flags[iToken]           != expectedResults[iToken].flags ) {
1423                                         fail = TRUE;
1424                                     }
1425                                 }
1426                                 if (fail) {
1427                                     log_err("FAIL: urbtok_tokenize brkFromBinary expected %d tokens, got %d\n", kNumTokensExpected, numTokens);
1428                                     printf("# expect          get\n");
1429                                     printf("# loc len flags   loc len flags\n");
1430                                     int32_t maxTokens = (numTokens >= kNumTokensExpected)? numTokens: kNumTokensExpected;
1431                                     for (iToken = 0; iToken < maxTokens; iToken++) {
1432                                         if (iToken < kNumTokensExpected) {
1433                                             printf("  %3ld %3ld 0x%03lX", expectedResults[iToken].token.location,
1434                                                 expectedResults[iToken].token.length, expectedResults[iToken].flags);
1435                                         } else {
1436                                             printf("             ");
1437                                         }
1438                                         if (iToken < numTokens) {
1439                                             printf("   %3ld %3ld 0x%03lX\n",  tokens[iToken].location, tokens[iToken].length, flags[iToken] );
1440                                         } else {
1441                                             printf("\n");
1442                                         }
1443                                     }
1444                                 }
1445                             }
1446                             ubrk_close(brkFromBinary);
1447                         }
1448                     }
1449                     uprv_free(rulesBinaryBuf);
1450                 }
1451             }
1452             ubrk_close(brkFromRules);
1453         }
1454     }
1455     uprv_free(testRulesUTF16Buf);
1456 }
1457 #endif
1458
1459
1460 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */