icuSources/test/intltest/wbnf.cpp

   1 /*
   2  ******************************************************************************
   3  * Copyright (C) 2005-2007, International Business Machines Corporation and   *
   4  * others. All Rights Reserved.                                               *
   5  ******************************************************************************
   6  */
   7
   8 #include <stdio.h>
   9 #include <string.h>
  10 #include <stdlib.h>
  11 #include <time.h>
  12
  13 #include "wbnf.h"
  14
  15 // Most of this code is meant to test the test code. It's a self test.
  16 // Normally this isn't run.
  17 #define TEST_WBNF_TEST 0
  18
  19 ///////////////////////////////////////////////////////////
  20 //
  21 // Constants and the most basic helper classes
  22 //
  23
  24 static const char DIGIT_CHAR[] = "0123456789";
  25 static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0};
  26 static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  27 static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
  28
  29 static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){
  30     const char * p = list;
  31     for (;*p != 0 && *p != c; p++);
  32     return *p?TRUE:FALSE;
  33 }
  34 static inline UBool isDigit(char c) {return isInList(c, DIGIT_CHAR);}
  35 static inline UBool isWhiteSpace(char c) {return isInList(c, WHITE_SPACE);}
  36 static inline UBool isAlphabet(char c) {return isInList(c, ALPHABET);}
  37 static inline UBool isSpecialAsciiChar(char c) {return isInList(c,SPECIAL);}
  38
  39
  40
  41 ///////////////////////////////////////////////////////////
  42 //
  43 // Helper classes
  44 //
  45
  46 class Buffer_byte{
  47 // Utility class, can be treated as an auto expanded array. no boundary check.
  48
  49     typedef char byte;
  50     byte * start;
  51     byte * current;
  52     int buffer_size; // size unit is byte
  53 public:
  54     inline int content_size(){return current - start;} // size unit is byte
  55
  56 private:
  57     inline void expand(int add_size = 100){ // size unit is byte
  58         int new_size = buffer_size + add_size;
  59
  60         int cs_snap = content_size();
  61         start = (byte *) realloc(start, new_size);   // may change the value of start
  62         current = start + cs_snap;
  63
  64         memset(current, 0, add_size);
  65         buffer_size = new_size;
  66     }
  67
  68     inline void expand_to(int size){
  69         int r = size - buffer_size;
  70         if (r > 0) {
  71             expand(r);  // simply expand, no block alignment
  72         }
  73     }
  74     Buffer_byte(const Buffer_byte &);
  75     Buffer_byte & operator = (const Buffer_byte &);
  76 public:
  77     Buffer_byte():start(NULL),current(start),buffer_size(0){
  78         expand();
  79     }
  80     ~Buffer_byte(){
  81         free(start);
  82     }
  83
  84     inline void reset(){
  85         start != NULL ? memset(start, 0, buffer_size) : 0;
  86         current = start;
  87     }
  88
  89     // Using memory copy method to append a C array to buffer,
  90     inline void append(const void * c, int size){ // size unit is byte
  91         expand_to(content_size() + size) ;
  92         memcpy(current, c, size);
  93         current = current + size;
  94     }
  95
  96     byte * buffer(){
  97         return start;
  98     }
  99 };
 100
 101 /*
 102   The class(es) try to work as bulid-in array, so it overloads these two operators
 103     operator type *();
 104     type & operator[];
 105   The first is used to auto type convert, the latter is used to select member.
 106
 107   A small trick is the class does not overload the address-of operator. This
 108   behavior is different from bulid-in array, but it give us the opportunity
 109   to get the address of the class itself.
 110 */
 111 //template<typename type>
 112 //    class BUFFER{
 113 //       typedef BUFFER name;
 114 #define BUFFER(type, name)\
 115     class name {\
 116     private:\
 117        Buffer_byte buf;\
 118     public:\
 119         name & reset() {buf.reset(); return *this;}\
 120         name & append(type c) {buf.append(&c, sizeof(type)); return *this;}\
 121         name & append_array(const type * p, int size) {buf.append(p, sizeof(type)*size); return *this;}\
 122         type & operator [] (int i) { return ((type *) buf.buffer())[i];}\
 123         operator type *(){return (type *) buf.buffer();} \
 124         int content_size(){return buf.content_size() / sizeof(type);}\
 125     }
 126
 127
 128 class Pick{
 129 /* The Pick is the basic language generator element*/
 130 public:
 131     // generate a string accroding the syntax
 132     // Return a null-terminated c-string. The buffer is owned by callee.
 133     virtual const char* next() = 0;
 134     virtual ~Pick(){};
 135 };
 136
 137 //typedef BUFFER<char> Buffer_char;
 138 //typedef BUFFER<int> Buffer_int;
 139 //typedef BUFFER<Pick *> Buffer_pPick;
 140 BUFFER(char, Buffer_char);
 141 BUFFER(int, Buffer_int);
 142 BUFFER(Pick *, Buffer_pPick);
 143
 144 class SymbolTable{
 145 /* Helper class.
 146 * It's a mapping table between 'variable name' and its 'active Pick object'
 147 */
 148 private:
 149     Buffer_char  name_buffer;   // var names storage space
 150
 151     Buffer_int   names;         // points to name (offset in name_buffer)
 152     Buffer_pPick refs;          // points to Pick
 153
 154     int get_index(const char *const var_name){
 155         int len = names.content_size();
 156         for (int i=0; i< len; i++){
 157             if (strcmp(var_name, name_buffer + names[i]) == 0){
 158                 return i;
 159             }
 160         }
 161         return -1;
 162     }
 163
 164 public:
 165     enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF};
 166
 167     RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NULL /*[out] Pick* */){
 168         if (!var_name) return EMPTY; // NULL name
 169
 170         int i = get_index(var_name);
 171         if (i == -1){
 172             return NO_VAR;   // new name
 173         }
 174         if (!refs[i]){  // exist name, no ref
 175             return NO_REF;
 176         } else {
 177             if (ref) {
 178                 *ref = refs[i];
 179             }
 180             return HAS_REF;   // exist name, has ref
 181         }
 182     }
 183
 184     void put(const char *const var_name, Pick *const var_ref = NULL){
 185         int i = get_index(var_name);
 186         switch(find(var_name)){
 187             case EMPTY:    // NULL name
 188                 break;
 189             case NO_VAR:    // new name
 190                 int offset;
 191                 offset = name_buffer.content_size();
 192                 name_buffer.append_array(var_name, strlen(var_name) + 1);
 193                 names.append(offset);
 194                 refs.append(var_ref);
 195                 break;
 196             case NO_REF:    // exist name, no ref
 197                 refs[i] = var_ref;    // link definition with variable
 198                 break;
 199             case HAS_REF:    // exist name, has ref
 200                 if (var_ref){
 201                     refs[i] = var_ref;
 202                 }
 203                 break;
 204             default:
 205                 ; // ASSERT(FALSE);
 206         }
 207         return;
 208     }
 209
 210     UBool is_complete(){
 211         int n = names.content_size();
 212         for (int i=0; i<n; ++i){
 213             if (refs[i] == NULL){
 214                 return FALSE;
 215             }
 216         }
 217         return TRUE;
 218     }
 219
 220     void reset(){
 221         names.reset();
 222         name_buffer.reset();
 223
 224         // release memory here
 225         int s = refs.content_size();
 226         for (int i=0; i < s; i++){
 227             delete refs[i]; // TOFIX: point alias/recursion problem
 228         }
 229         refs.reset();
 230     }
 231
 232     ~SymbolTable(){
 233         reset();
 234     }
 235 };
 236
 237
 238 /*
 239 // Document of class Escaper
 240 //
 241 // ATTENTION:
 242 // From http://icu-project.org/userguide/Collate_Customization.html.
 243 // We get the precedence of escape/quote operations
 244 //
 245 //     (highest) 1. backslash               \
 246 //               2. two single quotes       ''
 247 //               3. quoting                 ' '
 248 //
 249 // ICU Collation should accept following as the same string.
 250 //
 251 // 1)  'ab'c        _
 252 // 2)  a\bc          \
 253 // 3)  a'b'\c        |- They are equal.
 254 // 4)  abc          _/
 255 //
 256 // From "two single quotes", we have following deductions
 257 //    D1. empty quoting is illgal. (obviously)
 258 //    D2. no contact operation between two quotings
 259 //              '.''.'      is not ..   it is .'.
 260 //    D3. "two single quotes" cannot contact two quoting simultaneously
 261 //              '..''''.'   is not ..'. it is ..''.
 262 //       NOTICE:
 263 //        "two single quotes" can contact before one quoting
 264 //              '''.'       is '.
 265 //        "two single quotes" can literally contact after one quoting
 266 //        But, from syntax, it's one quoting including a "two single quotes"
 267 //              '.'''       is .'
 268 //    D4. "two single quotes" cannot solely be included in quoting
 269 //              ''''        is not '    it is ''
 270 //       NOTICE:  These are legal
 271 //              '.''.'      is .'.
 272 //              '.'''       is .'
 273 //
 274 //                 dicision
 275 //                    /\
 276 //                   /__\
 277 //      output buffer    input buffer
 278 //
 279 // To make our dicision (within an atom operation) without caring input and output buffer,
 280 // following calling pattern (within an atom operation) shall be avoided
 281 //
 282 //    P1 open_quoting()  then close_quoting()    (direct violation)   D1
 283 //    P2 close_quoting() then open_quoting()     (direct violation)   D2
 284 //    P3 empty open_quoting()                    (indirect violation) D1, D4
 285 //    P4 empty close_quoting()                   (indirect violation) D2, D3
 286 //    P5 open_quoting()  then two single quotes  (indirect violation) D4
 287 //    P6 close_quoting() then two single quotes  (indirect violation) D3
 288 //
 289 // two single quotes escaping will not open_ or close_ quoting()
 290 // The choice will not lose some quoing forms.
 291 //
 292 // For open_quoting(),
 293 // we may get this form quoting     '''         P5
 294 // It may raise a bug               ''''x
 295 // If we expect
 296 //      '''.'       let the next char open the quoting
 297 //      '.''.'      the quoting is already opened by preceding char
 298 //
 299 // For close_quoting()
 300 // we will get this form quoting    '.'''       P6
 301 // It may raise a bug               '.''''.'
 302 // If we expect
 303 //      '.'''\.     let the next char close the quoting
 304 //      '.''''.'    the expectation is wrong!  using  '.'\''.' instead
 305 //
 306 // It's a hard work to re-adjust generation opportunity for various escaping form.
 307 // We just simply ignore it.
 308 */
 309 class Escaper{
 310 public:
 311     enum CHOICE {YES, NO, RAND};
 312     enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC};
 313 private:
 314     class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class
 315         private:
 316             const CHOICE tag;
 317         public:
 318             Bool(CHOICE flag=RAND):tag(flag){}
 319             operator UBool() {   // conversion operator
 320                 return tag == RAND ? rand()%2 : tag == YES;
 321                 //if (tag == RAND){
 322                 //    return rand()%2 == 1;
 323                 //} else {
 324                 //    return tag == YES ? TRUE : FALSE;
 325                 //}
 326             }
 327     };
 328 public:
 329     Escaper(CHOICE escapeLiteral = RAND,
 330         CHOICE twoQuotesEscape = RAND,
 331         ESCAPE_FORM escapeForm = RAND_ESC):
 332         escape_form(escapeForm),
 333         escape_literal(escapeLiteral),
 334         two_quotes_escape(twoQuotesEscape),
 335         is_quoting(FALSE){}
 336 private:
 337     Buffer_char str;
 338     ESCAPE_FORM escape_form;
 339     Bool escape_literal;
 340     Bool two_quotes_escape;
 341     UBool quote_escape;
 342     UBool bslash_escape;
 343     UBool is_quoting;
 344
 345     void set_options(){
 346         ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : escape_form;
 347         switch (t){
 348                 case BSLASH_ONLY :
 349                     bslash_escape = TRUE; quote_escape = FALSE; break;
 350                 case QUOTE_ONLY:
 351                     bslash_escape = FALSE;quote_escape = TRUE;  break;
 352                 case QUOTE_AND_BSLAH:
 353                     bslash_escape = TRUE; quote_escape = TRUE;  break;
 354                 default:
 355                     ;// error
 356         }
 357     }
 358
 359     void reset(){
 360         str.reset();
 361         is_quoting = FALSE;
 362     }
 363
 364     inline void open_quoting(){
 365         if(is_quoting){
 366             // do nothing
 367         } else {
 368             str.append('\'');
 369             is_quoting = TRUE;
 370         }
 371     }
 372     inline void close_quoting(){
 373         if(is_quoting){
 374             str.append('\'');
 375             is_quoting = FALSE;
 376         } else {
 377             // do nothing
 378         }
 379     }
 380
 381     // str  [in]    null-terminated c-string
 382     void append(const char * strToAppend){
 383         for(;*strToAppend != 0; strToAppend++){
 384             append(*strToAppend);
 385         }
 386     }
 387
 388     inline void append(const char c){
 389         set_options();
 390
 391         if (c == '\\'){
 392             quote_escape ? open_quoting() : close_quoting();
 393             //bslash_escape always true here
 394             str.append('\\');
 395             str.append('\\');
 396         } else if (c == '\''){
 397             if (two_quotes_escape){     // quoted using two single quotes
 398                 // See documents in anonymous.design
 399                 str.append('\'');
 400                 str.append('\'');
 401             } else{
 402                 quote_escape ? open_quoting() : close_quoting();
 403                 //bslash_escape always true here
 404                 str.append('\\');
 405                 str.append('\'');
 406             }
 407         } else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){
 408             quote_escape  ? open_quoting()   : close_quoting();
 409             if (bslash_escape) str.append('\\');
 410             str.append(c);
 411         } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal
 412             if (escape_literal){
 413                 quote_escape  ? open_quoting()   : close_quoting();
 414                 if (bslash_escape)  str.append('\\');
 415                 str.append(c);
 416             } else {
 417                 close_quoting();
 418                 str.append(c);
 419             }
 420         }
 421     }
 422
 423 public:
 424     // Return a null-terminate c-string. The buffer is owned by callee.
 425     char * operator()(const char * literal /*c-string*/){
 426         str.reset();
 427         for(;*literal != 0; literal++){
 428             append(*literal);
 429         }
 430         close_quoting();    // P4 exception, to close whole quoting
 431         return str;
 432     }
 433 };
 434
 435 class WeightedRand{
 436 // Return a random number in [0, size)
 437 // Every number has different chance (aka weight) to be selected.
 438 private:
 439     Buffer_int weights;
 440     double total;
 441     WeightedRand(const WeightedRand &);
 442     WeightedRand & operator = (const WeightedRand &);
 443 public:
 444     WeightedRand(Buffer_int * weight_list = NULL, int size = 0){
 445         if ( weight_list == NULL){
 446             for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT);
 447         } else {
 448             int s = weight_list->content_size();
 449             if (s < size){
 450                 weights.append_array( (*weight_list),s);
 451                 for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT);
 452             } else { // s >= size
 453                 weights.append_array( (*weight_list),size);
 454             }
 455         }
 456         total = 0;
 457         int c = weights.content_size();
 458         for (int i=0; i<c; ++i){
 459             total += weights[i];
 460         }
 461     }
 462
 463     void append(int weight){
 464         weights.append(weight);
 465         total += weight;
 466     }
 467
 468     // Give a random number with the consideration of weight.
 469     // Every random number is associated with a weight.
 470     // It identifies the chance to be selected,
 471     // larger weight has more chance to be selected.
 472     //
 473     //
 474     //  ______________________   every slot has equal chance
 475     //
 476     //  [____][_][___][______]   each item has different slots, hence different chance
 477     //
 478     //
 479     //  The algorithms to generate the number is illustrated by preceding figure.
 480     //  First, a slot is selected by rand(). Then we translate the slot to corresponding item.
 481     //
 482     int next(){
 483         // get a random in [0,1]
 484         double reference_mark = (double)rand() / (double)RAND_MAX;
 485
 486         // get the slot's index, 0 <= mark <= total;
 487         double mark = total * reference_mark;
 488
 489         // translate the slot to corresponding item
 490         int i=0;
 491         for (;;){
 492             mark -= weights[i];  // 0 <= mark <= total
 493             if (mark <= 0)
 494                 break;
 495             i++;
 496         }
 497         return i;
 498     }
 499 };
 500
 501 ///////////////////////////////////////////////////////////
 502 //
 503 // The parser result nodes
 504 //
 505
 506 class Literal : public Pick {
 507 public:
 508     virtual const char* next(){
 509         return str;
 510     }
 511     Literal(const char * s /*c-string*/){
 512         str.append_array(s, strlen(s) + 1);
 513     }
 514 private:
 515     Buffer_char str; //null-terminated c-string
 516 };
 517
 518 class Variable : public Pick {
 519 public:
 520     Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){
 521         this->var_name.append_array(varName, strlen(varName) + 1);
 522         if ((symbol_table = symbols)){
 523             symbol_table->put(varName, varRef);
 524         }
 525     }
 526
 527     operator const char *(){
 528         return var_name;
 529     }
 530
 531     virtual const char* next(){
 532         if (symbol_table){
 533             Pick * var_ref = NULL;
 534             symbol_table->find(var_name, &var_ref);
 535             if (var_ref) {
 536                 return var_ref->next();
 537             }
 538         }
 539         return "";  // dumb string
 540     }
 541 private:
 542     Buffer_char var_name;
 543     SymbolTable * symbol_table;
 544 };
 545
 546 class Quote : public Pick{
 547 public:
 548     Quote(Pick & base):item(base),e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ONLY){
 549     }
 550     virtual const char* next(){
 551         return e(item.next());
 552     }
 553 private:
 554     Pick & item;
 555     Buffer_char str;
 556     Escaper e;
 557 };
 558
 559
 560 class Morph : public Pick{
 561 /*
 562 The difference between morph and an arbitrary random string is that
 563 a morph changes slowly. When we build collation rules, for example,
 564 it is a much better test if the strings we use are all in the same
 565 'neighborhood'; they share many common characters.
 566 */
 567 public:
 568     Morph(Pick & base):item(base){}
 569
 570     virtual const char* next(){
 571         current.reset();
 572         const char * s = item.next();
 573         current.append_array(s, strlen(s) + 1);
 574         if  (last.content_size() == 0) {
 575             str.reset();
 576             last.reset();
 577             str.append_array(current, current.content_size());
 578             last.append_array(current, current.content_size());
 579         } else {
 580             morph();
 581         }
 582         return str;
 583     }
 584 private:
 585     Pick & item;
 586     Buffer_char str;
 587     Buffer_char last;
 588     Buffer_char current;
 589
 590     char * p_last;
 591     char * p_curr;
 592
 593     void copy_curr(){
 594         if (*p_curr) {
 595             str.append(*p_curr);
 596             p_curr++;
 597         }
 598     }
 599
 600     void copy_last(){
 601         if (*p_last) {
 602             str.append(*p_last);
 603             p_last++;
 604         }
 605     }
 606
 607     // copy 0, 1, or 2 character(s) to str
 608     void copy(){
 609         static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5);
 610
 611         switch (wr.next()){
 612             case 0: // copy last  -- has 10 times chance than others
 613                 copy_last();
 614                 break;
 615             case 1: // copy both
 616                 copy_curr();
 617                 copy_last();
 618                 break;
 619             case 2: // copy both
 620                 copy_last();
 621                 copy_curr();
 622                 break;
 623             case 3:
 624                 copy_curr();
 625                 break;
 626             case 4:  // copy nothing
 627                 break;
 628             default:
 629                 // ASSERT(FALSE);
 630                 ;
 631         }
 632     }
 633
 634     void morph(void){
 635         int min = strlen(last);
 636         int max = strlen(current);
 637         if (min > max){
 638             int temp  = min;
 639             min = max;
 640             max = temp;
 641         }
 642
 643         int len = min + rand()%(max - min + 1); // min + [0, diff]
 644         p_curr = current;
 645         p_last = last;
 646         str.reset();
 647
 648         for (; str.content_size()<len && *p_curr && *p_last;){
 649             copy(); // copy 0, 1, or 2 character(s) to str
 650         }
 651
 652         if (str.content_size() == len) {
 653             str.append(0);
 654             final();
 655             return;
 656         }
 657
 658         if (str.content_size() > len) { // if the last copy copied two characters
 659             str[len]=0;
 660             final();
 661             return;
 662         }
 663
 664         // str.content_size() < len
 665         if (*p_last) {
 666             for (; str.content_size() < len; copy_last());
 667         } else if (*p_curr){
 668             for (; str.content_size() < len; copy_curr());
 669         }
 670
 671         int last_len = last.content_size();
 672         for (;str.content_size() < len;){
 673             str.append(last[rand()%last_len]);
 674         }
 675         str.append(0);
 676         final();
 677     }
 678
 679     void final(){
 680         last.reset();
 681         last.append_array(current, current.content_size());
 682     }
 683 };
 684
 685 class Sequence : public Pick {
 686 public:
 687     virtual const char* next(){
 688         str.reset();
 689         int s = items.content_size();
 690         for(int i=0; i < s; i++){
 691             const char * t = items[i]->next();
 692             str.append_array(t, strlen(t));
 693         }
 694         str.append(0); // terminal null
 695         return str;
 696     }
 697
 698     void append (Pick * node){
 699         items.append(node);
 700     }
 701
 702     virtual ~Sequence(){
 703         int s = items.content_size();
 704         for(int i=0; i < s; i++){
 705             //How can assure the item is got from heap?
 706             //Let's assume it.
 707             delete items[i]; // TOFIX: point alias/recursion problem
 708             items[i] = NULL;
 709         }
 710     }
 711 private:
 712     Buffer_pPick items;
 713     Buffer_char  str; //null-terminated c-string
 714 };
 715
 716 class Repeat : public Pick {
 717 private:
 718     Pick * item;
 719     Buffer_char str;
 720     WeightedRand wr;
 721     int min;
 722     int max;
 723     int select_a_count(){
 724         return min + wr.next();
 725     }
 726 public:
 727     virtual const char* next(){
 728         str.reset();
 729         int c = select_a_count();
 730         for(int i=0; i< c; i++){
 731             const char * t = item->next();
 732             str.append_array(t, strlen(t));
 733         }
 734         str.append(0);
 735         return str;
 736     }
 737
 738     Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights = NULL):
 739         wr(weights, maxCount-minCount +1) {
 740         this->item = base;
 741         this->min = minCount;
 742         this->max = maxCount;
 743     }
 744     virtual ~Repeat(){
 745         delete item;  // TOFIX: point alias/recursion problem
 746         item = NULL;
 747     }
 748 };
 749
 750
 751 class Alternation : public Pick {
 752 public:
 753     virtual const char* next(){
 754         str.reset();
 755         int i = wr.next();
 756         const char * t = items[i]->next();
 757         str.append_array(t, strlen(t) + 1);
 758         return str;
 759     }
 760     virtual ~Alternation(){
 761         int s = items.content_size();
 762         for(int i=0; i < s; i++){
 763             delete items[i];  // TOFIX: point alias/recursion problem
 764             items[i] = NULL;
 765         }
 766     }
 767
 768     Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){
 769         items.append(node);
 770         wr.append(weight);
 771         return *this;
 772     }
 773 private:
 774     Buffer_pPick items;
 775     Buffer_char str; // null-terminated c-string
 776     WeightedRand wr;
 777 };
 778
 779 ///////////////////////////////////////////////////////////
 780 //
 781 // The parser
 782 //
 783
 784 enum TokenType {STRING, VAR, NUMBER, STREAM_END, ERROR, QUESTION, STAR, PLUS, LBRACE, RBRACE, LPAR, RPAR, SEMI, EQ, COMMA, BAR, AT, WAVE, PERCENT};
 785
 786 class Scanner{
 787 friend int DumpScanner(Scanner & s, UBool dumb);
 788 private:
 789     const char * source;
 790     const char * working;
 791     const char * history; // for debug
 792     enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLASH, IN_BSLASH, IN_STRING, DONE};
 793     StateType state;
 794     void terminated(TokenType t){
 795         working--;       // return the peeked character
 796         tokenType = t;
 797         token.append(0); // close buffer
 798         state = DONE;
 799     }
 800 public:
 801     // the buffer of "source" is owned by caller
 802     Scanner(const char *src/*[in] c-string*/ = NULL):source(src){
 803         working = src;
 804         history = working;
 805         state = DONE;
 806         tokenType = ERROR;
 807     }
 808
 809     //void setSource(const char *const src /*[in] c-string*/){
 810     //    *(&const_cast<const char *>(source)) = src;
 811     //}
 812
 813     Buffer_char token;
 814     TokenType tokenType;
 815
 816     TokenType getNextToken(){
 817         token.reset();
 818         state = START;
 819         history = working; // for debug
 820         while (state != DONE){
 821             char c = *working++;
 822             if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE
 823                 terminated(ERROR);
 824                 break; // while
 825             }
 826             switch(state){
 827                 case START:
 828                     tokenType = ERROR;
 829                     switch(c){
 830                         case '?'  : tokenType = QUESTION; break;
 831                         case '*'  : tokenType = STAR; break;
 832                         case '+'  : tokenType = PLUS; break;
 833                         case '{'  : tokenType = LBRACE; break;
 834                         case '}'  : tokenType = RBRACE; break;
 835                         case '('  : tokenType = LPAR; break;
 836                         case ')'  : tokenType = RPAR; break;
 837                         case ';'  : tokenType = SEMI; break;
 838                         case '='  : tokenType = EQ; break;
 839                         case ','  : tokenType = COMMA; break;
 840                         case '|'  : tokenType = BAR; break;
 841                         case '@'  : tokenType = AT; break;
 842                         case '~'  : tokenType = WAVE; break;
 843                         case '%'  : tokenType = PERCENT; break;
 844                         case 0    : tokenType = STREAM_END; working-- /*avoid buffer overflow*/; break;
 845                     }
 846                     if (tokenType != ERROR){
 847                         token.append(c);
 848                         token.append(0);
 849                         state = DONE;
 850                         break; // START
 851                     }
 852                     switch(c){
 853                         case '$'  : state = IN_VAR_FIRST; token.append(c); break;
 854                         case '\'' : state = IN_QUOTE;     break;
 855                         case '\\' : state = IN_BSLASH;    break;
 856                         default:
 857                             if (isWhiteSpace(c)){    // state = START;   //do nothing
 858                             } else if (isDigit(c)){     state = IN_NUM;    token.append(c);
 859                             } else if (isAlphabet(c)){  state = IN_STRING; token.append(c);
 860                             } else {terminated(ERROR);}
 861                     }
 862                     break;//START
 863                 case IN_NUM:
 864                     if (isDigit(c)){
 865                         token.append(c);
 866                     } else {
 867                         terminated(NUMBER);
 868                     }
 869                     break;//IN_NUM
 870                 case IN_VAR_FIRST:
 871                     if (isAlphabet(c)){
 872                         token.append(c);
 873                         state = IN_VAR;
 874                     } else {
 875                         terminated(ERROR);
 876                     }
 877                     break; // IN_VAR_FISRT
 878                 case IN_VAR:
 879                     if (isAlphabet(c) || isDigit(c)){
 880                         token.append(c);
 881                     } else {
 882                         terminated(VAR);
 883                     }
 884                     break;//IN_VAR
 885                 case IN_STRING:
 886                     // About the scanner's behavior for STRING, AT, and ESCAPE:
 887                     // All of them can be contacted with each other.
 888                     // This means the scanner will eat up as much as possible strings
 889                     //   (STRING, AT, and ESCAPE) at one time, with no regard of their
 890                     //   combining sequence.
 891                     //
 892                     if (c == '\''){
 893                         state = IN_QUOTE; // the first time we see single quote
 894                     } else if (c =='\\'){ // back slash character
 895                         state = IN_BSLASH;
 896                     } else if (isAlphabet(c) || isDigit(c)){
 897                         token.append(c);
 898                     } else{
 899                         terminated(STRING);
 900                     }
 901                     break;//IN_STRING
 902                 case IN_QUOTE:
 903                     if (c == '\''){ // the second time we see single quote
 904                         state = IN_STRING; // see document in IN_STRING
 905                     } else if ( c== '\\') { // backslah escape in quote
 906                         state = IN_QUOTE_BSLASH;
 907                     } else {
 908                         token.append(c);  // eat up everything, includes back slash
 909                     }
 910                     break;//IN_QUOTE
 911                 case IN_QUOTE_BSLASH:
 912                 case IN_BSLASH:
 913                     switch (c){
 914                         case 'n'  : token.append('\n'); break;
 915                         case 'r'  : token.append('\r'); break;
 916                         case 't'  : token.append('\t'); break;
 917                         case '\'' : token.append('\''); break;
 918                         case '\\' : token.append('\\'); break;
 919                         default: token.append(c); // unknown escaping, treat it as literal
 920                     }
 921                     if (state == IN_BSLASH){
 922                         state = IN_STRING; // see document in IN_STRING
 923                     } else { // state == IN_QUOTE_BSLASH
 924                         state = IN_QUOTE;
 925                     }
 926                     break;//IN_BSLASH
 927                 case DONE:  /* should never happen */
 928                 default:
 929                     working--;
 930                     tokenType = ERROR;
 931                     state = DONE;
 932                     break;
 933             }//switch(state)
 934         }//while (state != DONE)
 935
 936         return tokenType;
 937     }
 938 };//class Scanner
 939
 940 class Parser{
 941 friend UBool TestParser();
 942 friend class TestParserT;
 943 friend class LanguageGenerator_impl;
 944 private:
 945     Scanner s;
 946     TokenType & token;
 947     int min_max;   // for the evil infinite
 948
 949     UBool match(TokenType expected){
 950         if (token == expected) {
 951             token = s.getNextToken();
 952             return TRUE;
 953         } else {
 954             //s.dumpCurrentPoint();
 955             return FALSE;
 956         }
 957     }
 958
 959     UBool weight(int & value){
 960         if (token == NUMBER){
 961             int temp = atoi(s.token);
 962             match(NUMBER);
 963             if (match(PERCENT)){
 964                 value = temp;
 965                 return TRUE;
 966             }
 967         }
 968         return FALSE;
 969     }
 970
 971     UBool repeat (Pick* &node /*in,out*/){
 972         if (node == NULL) return FALSE;
 973
 974         int count = -2;
 975         int min = -2;
 976         int max = -2;
 977         UBool question = FALSE;
 978         switch (token){
 979             case QUESTION:
 980                 match(QUESTION);
 981                 min = 0;
 982                 max = 1;
 983                 count = 2;
 984                 question = TRUE;
 985                 break;
 986             case STAR:
 987                 match(STAR);
 988                 min = 0;
 989                 max = -1;
 990                 count = -1;
 991                 break;
 992             case PLUS:
 993                 match(PLUS);
 994                 min = 1;
 995                 max = -1;
 996                 count = -1;
 997                 break;
 998             case LBRACE:
 999                 match(LBRACE);
1000                 if (token != NUMBER){
1001                     return FALSE;
1002                 }else {
1003                     min = atoi(s.token);
1004                     match(NUMBER);
1005                     if (token == RBRACE){
1006                         match(RBRACE);
1007                         max = min;
1008                         count = 1;
1009                     } else if (token == COMMA) {
1010                         match(COMMA);
1011                         if (token == RBRACE){
1012                             match(RBRACE);
1013                             max = -1;
1014                             count = -1;
1015                         } else if (token == NUMBER) {
1016                             max = atoi(s.token);
1017                             match(NUMBER);
1018                             count = max - min + 1;
1019                             if (!match(RBRACE)) {
1020                                 return FALSE;
1021                             }
1022                         } else {
1023                             return FALSE;
1024                         }
1025                     } else {
1026                         return FALSE;
1027                     }
1028                 }
1029                 break;
1030             default:
1031                 return FALSE;
1032         }
1033
1034         if (count == -2 || min == -2 || max == -2){
1035             //ASSERT(FALSE);
1036             return FALSE;
1037         }
1038
1039         // eat up following weights
1040         Buffer_int weights;
1041         int w;
1042         while (weight(w)){
1043             weights.append(w);
1044         }
1045
1046         // for the evil infinite
1047         min_max = min_max > min ? min_max : min;
1048         min_max = min_max > max ? min_max : max;
1049         if (min_max > PSEUDO_INFINIT){
1050             return FALSE; // PSEUDO_INFINIT is less than the real maximum
1051         }
1052         if (max == -1){ // the evil infinite
1053             max = PSEUDO_INFINIT;
1054         }
1055         // for the strange question mark
1056         if (question && weights.content_size() > 0){
1057             Buffer_int w2;
1058             w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]);
1059             node = new Repeat(node,min,max,&w2);
1060             return TRUE;
1061         }
1062         node = new Repeat(node,min,max,&weights);
1063         return TRUE;
1064     }
1065
1066     UBool core(Pick* &node /*out*/){
1067         if (node != NULL) return FALSE; //assert node == NULL
1068
1069         switch(token){
1070             case LPAR:
1071                 match(LPAR);
1072                 if(defination(node) && match(RPAR)){
1073                     return TRUE;
1074                 }
1075                 return FALSE;
1076             case VAR:
1077                 node = new Variable(&symbols, s.token);
1078                 match(VAR);
1079                 return TRUE;
1080             case STRING:
1081                 node = new Literal(s.token);
1082                 match(STRING);
1083                 return TRUE;
1084             default:
1085                 return FALSE;
1086         }
1087     }
1088     UBool modified(Pick* &node /*out*/){
1089         if (node != NULL) return FALSE; //assert node == NULL
1090
1091         if (!core(node)) {
1092             return FALSE;
1093         }
1094
1095         for (;;){
1096             switch(token){
1097                 case WAVE:
1098                     match(WAVE);
1099                     node = new Morph(*node);
1100                     break;
1101                 case AT:
1102                     match(AT);
1103                     node = new Quote(*node);
1104                     break;
1105                 case QUESTION:
1106                 case STAR:
1107                 case PLUS:
1108                 case LBRACE:
1109                     if (!repeat(node)) return FALSE;
1110                     break;
1111                 case SEMI:      // rule definiation closed
1112                 case RPAR:      // within parenthesis (core closed)
1113                 case BAR:       // in alternation
1114                 case NUMBER:    // in alternation, with weight
1115                 case LPAR:      // in sequence
1116                 case VAR:       // in sequence
1117                 case STRING:    // in sequence
1118                     return TRUE;
1119                 default:
1120                     return FALSE;
1121             }
1122         }
1123     }
1124
1125
1126     UBool sequence_list(Pick* &node /*in,out*/){
1127         if (node == NULL) return FALSE; // assert node != NULL
1128
1129         Sequence* seq = new Sequence();
1130         Pick * n = node;
1131
1132         while (token == VAR || token == STRING || token == LPAR){
1133             seq->append(n);
1134             n = NULL;
1135             if (modified(n)){
1136                 // go on
1137             } else {
1138                 goto FAIL;
1139             }
1140         }
1141
1142         if (token == SEMI || token == RPAR || token == BAR){
1143             seq->append(n);
1144             node = seq;
1145             return TRUE;
1146         }
1147 FAIL:
1148         delete seq;
1149         return FALSE;
1150
1151     }
1152
1153     UBool sequence(Pick* &node /*out*/){
1154         if (node != NULL) return FALSE; //assert node == NULL
1155
1156         if (!modified(node)) {
1157             return FALSE;
1158         }
1159
1160         if (token == VAR || token == STRING || token == LPAR){
1161             return sequence_list(node);
1162         } else {
1163             return TRUE; // just a modified
1164         }
1165     }
1166
1167     UBool alternation_list(Pick* &node /*in,out*/){
1168         if (node == NULL) return FALSE; // assert node != NULL
1169
1170         Alternation * alt = new Alternation();
1171         Pick * n = node;
1172         int w = DEFAULT_WEIGHT;
1173
1174         while (token == NUMBER || token == BAR){
1175             if(token == NUMBER) {
1176                 if (weight(w)){
1177                     if (token == BAR){
1178                         // the middle item, go on
1179                     } else {
1180                         // the last item or encounter error
1181                         break; //while
1182                     }
1183                 } else {
1184                     goto FAIL;
1185                 }
1186             } // else token == BAR
1187             match(BAR);
1188             alt->append(n,w);
1189
1190             n = NULL;
1191             w = DEFAULT_WEIGHT;
1192             if (sequence(n)){
1193                 // go on
1194             } else {
1195                 goto FAIL;
1196             }
1197         }
1198
1199         if (token == SEMI || token == RPAR) {
1200             alt->append(n,w);
1201             node = alt;
1202             return TRUE;
1203         }
1204 FAIL:
1205         delete alt;
1206         return FALSE;
1207     }
1208
1209     UBool alternation(Pick* &node /*out*/){
1210         if (node != NULL) return FALSE; //assert node == NULL
1211
1212         // 'sequence' has higher precedence than 'alternation'
1213         if (!sequence(node)){
1214             return FALSE;
1215         }
1216
1217         if (token == BAR || token == NUMBER){ // find a real alternation1, create it.
1218             return alternation_list(node);
1219         } else {
1220             return TRUE;    // just a sequence_old
1221         }
1222     }
1223
1224
1225     UBool defination(Pick* &node /*out*/){
1226         if (node != NULL) return FALSE; //assert node == NULL
1227         return alternation(node);
1228     }
1229
1230     UBool rule(){
1231         if (token == VAR){
1232             Buffer_char name;
1233             name.append_array(s.token, strlen(s.token) + 1);
1234             match(VAR);
1235
1236             if (match(EQ)){
1237                 Pick * t = NULL;
1238                 if(defination(t)){
1239                     symbols.put(name, t);
1240                     return match(SEMI);
1241                 }
1242             }
1243         }
1244         return FALSE;
1245     }
1246 public:
1247     UBool rules(){
1248         symbols.reset();
1249         token = s.getNextToken();
1250         while (rule()){
1251         }
1252         if (token == STREAM_END){
1253             return TRUE;
1254         } else {
1255             //s.dumpCurrentPoint();
1256             return FALSE;
1257         }
1258     }
1259
1260 public:
1261     SymbolTable symbols;
1262
1263     Parser(const char *const source):s(source), token(s.tokenType){
1264         min_max = -2;
1265     }
1266     UBool parse(){
1267         return rules();
1268     }
1269
1270 }; // class Parser
1271
1272
1273 ///////////////////////////////////////////////////////////
1274 //
1275 //
1276 //
1277
1278 int DumpScanner(Scanner & s, UBool dump = TRUE){
1279     int len = strlen(s.source);
1280     int error_start_offset = s.history - s.source;
1281     if (dump){
1282         printf("\n=================== DumpScanner ================\n");
1283         fwrite(s.source, len, 1, stdout);
1284         printf("\n-----parsed-------------------------------------\n");
1285         fwrite(s.source, s.history - s.source, 1, stdout);
1286         printf("\n-----current------------------------------------\n");
1287         fwrite(s.history, s.working - s.history, 1, stdout);
1288         printf("\n-----unparsed-----------------------------------\n");
1289         fwrite(s.working, (s.source + len - s.working), 1, stdout);
1290         printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");
1291     }
1292     return error_start_offset;
1293 }
1294
1295 class LanguageGenerator_impl{
1296 public:
1297     LanguageGenerator_impl(const char *const bnf_definition, const char *const top_node)
1298         :par(bnf_definition), top_node_name(top_node){
1299         srand((unsigned)time( NULL ));
1300     }
1301
1302     LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){
1303         if (par.parse()){
1304             if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::HAS_REF) {
1305                 if (par.symbols.is_complete()) {
1306                     return LanguageGenerator::OK;
1307                 } else {
1308                     if (debug) printf("The bnf definition is incomplete.\n");
1309                     return LanguageGenerator::INCOMPLETE;
1310                 }
1311             } else {
1312                 if (debug) printf("No top node is found.\n");
1313                 return LanguageGenerator::NO_TOP_NODE;
1314             }
1315         } else {
1316             if(debug) {
1317                 printf("The bnf definition is wrong\n");
1318                 DumpScanner(par.s, TRUE);
1319             }
1320             return LanguageGenerator::BNF_DEF_WRONG;
1321         }
1322     }
1323     const char * next(){
1324         return top_node_ref->next();
1325     }
1326
1327 private:
1328     Parser par;
1329     const char *const top_node_name;
1330     Pick * top_node_ref;
1331 };
1332
1333 LanguageGenerator::LanguageGenerator():lang_gen(NULL){
1334 }
1335
1336 LanguageGenerator::~LanguageGenerator(){
1337     delete lang_gen;
1338 }
1339
1340 LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug){
1341     if (lang_gen){
1342         delete lang_gen;
1343     }
1344     lang_gen = new LanguageGenerator_impl(bnf_definition, top_node);
1345     PARSE_RESULT r = lang_gen->parseBNF(debug);
1346     if (r != OK){
1347         delete lang_gen;
1348         lang_gen = NULL;
1349         return r;
1350     } else {
1351         return r;
1352     }
1353 }
1354 const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The buffer is owned by callee.
1355     if (lang_gen){
1356         return lang_gen->next();
1357     }else {
1358         return "";
1359     }
1360 }
1361
1362 ///////////////////////////////////////////////////////////
1363 //
1364 // The test code for WBNF
1365 //
1366
1367 #define CALL(fun) \
1368     if (fun()){ \
1369         printf("Pass: " #fun "\n");\
1370     } else { \
1371         printf("FAILED: !!! " #fun " !!!\n"); \
1372     }
1373
1374 #define DUMP_R(fun, var, times) \
1375     {printf("\n========= " #fun " =============\n"); \
1376     for (int i=0; i<times; i++) { \
1377         const char * t = var.next();\
1378         fwrite(t,strlen(t),1,stdout); \
1379         printf("\n");   \
1380     }   \
1381     printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");}
1382
1383
1384
1385 #if TEST_WBNF_TEST
1386 static UBool TestQuote(){
1387     const char *const str = "This ' A !,z| qq [] .new\tline";
1388     //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline";
1389     ////
1390     //// :(  we must quote our string to following C syntax
1391     ////     cannot type the literal here, it makes our code rather human unreadable
1392     ////     very very unconformable!
1393     ////
1394     ///*
1395     //*/
1396
1397     //const char *const s1    =   "ab'c";
1398     //const char (* s1_r1) [] = { "ab''c",    // ab''c
1399     //                            "ab\\'c",   // ab\'c
1400     //                           };//
1401     ///*
1402     // .      '.'     \.
1403     // ..     \.\.    '.'\.   '.'\.   '..'    // '.''.'  wrong
1404     //*/
1405
1406     //const char *const s2    =   "a..'.b";       // a..'.b
1407     //const char (*s2_r) []   = { "a'..''.'b"     // a'..''.'b
1408     //                           ,"a'..\\'.'b"    // a'..\'.'b
1409     //                           ,"a'..'\\''.'b"  // a'..'\''.'b
1410     //                          };//
1411
1412     //const char *const s3    =   "a..\\.b";      // a..\.b
1413     //const char (*s3_r) []   = { "a'..\\\\.'b"   // a'..\\.'b
1414     //                           ,"a'..'\\\\'.'b" // a'..'\\'.'b
1415     //                          };//
1416
1417     //                            // no catact operation, no choice, must be compact
1418
1419     srand((unsigned)time( NULL ));
1420
1421     //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC);
1422     Pick *p = new Literal(str);
1423     Quote q(*p);
1424
1425     DUMP_R(TestQuote, (*p), 1);
1426     DUMP_R(TestQuote, q, 20);
1427     return FALSE;
1428 }
1429 static UBool TestLiteral(){
1430     const char * s = "test string99.";
1431     Literal n(s);
1432     const char * r = n.next();
1433     return strcmp(s,r) == 0;
1434 }
1435
1436 static UBool TestSequence(){
1437     Sequence seq;
1438     seq.append(new Literal("abc "));
1439     seq.append(new Literal(", s"));
1440
1441     return strcmp(seq.next(), "abc , s") == 0;
1442 }
1443 static UBool TestAlternation(){
1444     srand((unsigned)time( NULL ));
1445     Alternation alt;
1446     alt.append(new Literal("aaa_10%"),10);
1447     alt.append(new Literal("bbb_0%"),0);
1448     alt.append(new Literal("ccc_10%"),10);
1449     alt.append(new Literal("ddddddd_50%"),50);
1450
1451     DUMP_R(TestAlternation, alt, 50);
1452
1453     return FALSE;
1454 }
1455
1456 static UBool TestBuffer(){
1457     Buffer_int t;
1458     t.append(1).append(0).append(5);
1459     int s = t.content_size();
1460     for (int i=0; i<s; ++i){
1461         printf("%d\n", t[i]);
1462     }
1463     return FALSE;
1464 }
1465
1466 static UBool TestWeightedRand(){
1467     srand((unsigned)time( NULL ));
1468     Buffer_int t;
1469     t.append(1).append(0).append(5);
1470     WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4);
1471 //    WeightedRand wr(&t,3);
1472     for (int i=0; i< 50; ++i){
1473         printf("%d\n", wr.next());
1474     }
1475     return FALSE;
1476 }
1477
1478 static UBool TestRepeat(){
1479     srand((unsigned)time( NULL ));
1480     Repeat rep(new Literal("aaa1-5 "), 1, 5);
1481     DUMP_R(TestRepeat, rep, 50);
1482
1483     Repeat r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append(0).append(5));
1484     DUMP_R(TestRepeat, r2, 50);
1485
1486     Repeat r3(new Literal("aaa5-5 "), 5, 5);
1487     DUMP_R(TestRepeat, r3, 50);
1488
1489     return FALSE;
1490 }
1491
1492 static UBool TestVariable(){
1493     SymbolTable tab;
1494     Pick * value = new Literal("string1");
1495     Variable var1(&tab, "x", value);
1496
1497     Variable var2(&tab, "y");
1498 //    tab.put(var2, value); // TOFIX: point alias/recursion problem
1499     Pick * value2 = new Literal("string2");
1500     tab.put(var2, value2);
1501
1502     Pick * value3 = new Literal("string3");
1503     Variable var3(&tab, "z");
1504     tab.put("z", value3);
1505
1506     UBool pass;
1507     pass = strcmp(var1.next(), value->next()) == 0;
1508     pass = pass && strcmp(var2.next(), value2->next()) == 0;
1509     pass = pass && strcmp(var3.next(), value3->next()) == 0;
1510     return pass;
1511 }
1512
1513 static UBool TestSymbolTable(){
1514     Literal * n1 = new Literal("string1");
1515     Literal * n2 = new Literal("string2");
1516     SymbolTable t;
1517     t.put("abc", n1);
1518     t.put("$aaa", n2);
1519 //    t.put("alias", n1);  // TOFIX: point alias/recursion problem
1520     t.put("bbb");
1521
1522     UBool pass;
1523     pass = t.find(NULL) == SymbolTable::EMPTY;
1524     pass = pass && t.find("ccc") == SymbolTable::NO_VAR;
1525     pass = pass && t.find("bbb") == SymbolTable::NO_REF;
1526     pass = pass && t.find("abc") == SymbolTable::HAS_REF;
1527     pass = pass && t.find("$aaa") == SymbolTable::HAS_REF;
1528
1529     t.reset();
1530     pass = pass && t.find("abc") == SymbolTable::NO_VAR;
1531     return pass;
1532 }
1533
1534
1535 static UBool TestScanner(void){
1536     //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};";
1537     //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}",
1538     //    "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"};
1539
1540     const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;";
1541     const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25", "%", ";"};
1542
1543     const char *str = str2;
1544     const char (*str_r)[20] = str2_r;
1545     int tokenNum = sizeof(str2_r)/sizeof(char[20]);
1546
1547     Scanner t(str);
1548     UBool pass = TRUE;
1549     t.getNextToken();
1550     int i = 0;
1551     while (pass){
1552         if (t.tokenType == STREAM_END){
1553             pass = pass? i == tokenNum : FALSE;
1554             break;//while
1555         } else if (t.tokenType == ERROR){
1556             pass = FALSE;
1557             break;//while
1558         } else {
1559             pass = strcmp( &(t.token[0]), str_r[i++]) == 0;
1560             t.getNextToken();
1561         }
1562     }
1563
1564     //const char ts[] = "$commandList = '['"
1565     //" ( alternate ' ' $alternateOptions"
1566     //" | backwards ' 2'"
1567     //" | normalization ' ' $onoff "
1568     //" | caseLevel ' ' $onoff "
1569     //" | hiraganaQ ' ' $onoff"
1570     //" | caseFirst ' ' $caseFirstOptions"
1571     //" | strength ' ' $strengthOptions"
1572     //" ) ']';" ;
1573
1574     //Scanner t2(ts);
1575     //pass = TRUE;
1576     //do {
1577     //    t2.getNextToken();
1578     //    if (t2.tokenType == ERROR){
1579     //        DumpScanner(t2);
1580     //        return FALSE;
1581     //    }
1582     //}while (t.tokenType != STREAM_END);
1583
1584     return pass;
1585 }
1586
1587 class TestParserT {
1588 public:
1589 UBool operator () (const char *const str, const int exp_error_offset = -1, const UBool dump = TRUE){
1590     Parser par(str);
1591     if (par.rules()){
1592         if ( exp_error_offset == -1){
1593             return TRUE;
1594         }else {
1595             DumpScanner(par.s,dump);
1596             return FALSE;
1597         }
1598     }else {
1599         return DumpScanner(par.s, dump) == exp_error_offset;
1600     }
1601 }
1602 };
1603
1604 UBool TestParser(){
1605     TestParserT test;
1606
1607     UBool pass = TRUE;
1608     pass = pass && test ("$s = ' ' ? 50%;");
1609     pass = pass && test("$x = ($var {1,2}) 3%;");         // legal
1610     pass = pass && test("$x = $var {1,2} 3% | b 4%;");    // legal
1611     pass = pass && test("$x = $var {1,2} 3%;");           // legal
1612     pass = pass && test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal
1613     pass = pass && test("$a = b ? 2% | c 5%;");               // legal
1614     pass = pass && test("$x = A B 5% C 10% | D;", 8, FALSE);  // illegal 5%
1615     pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc
1616     pass = pass && test("$x = (b 5%) (c 6%);");               // legal
1617     pass = pass && test("$x = (b 5%) c 6%;", 13, FALSE);      // illegal 6%
1618     pass = pass && test("$x = b 5% (c 6%);", 9, FALSE);       // illegal (c 6%)
1619     pass = pass && test("$x = b 5% c 6%;", 9, FALSE);         // illegal c 6%
1620     pass = pass && test("$x = b 5%;");                        // legal
1621     pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc
1622     pass = pass && test("$x = a | b  | c 4% | d 5%;");        // legal
1623     pass = pass && test("$s = ' ' ? 50% abc;");               // legal
1624     pass = pass && test("$s =  a | c d | e f;");              // legal
1625     pass = pass && test( "$z = q 0% | p 1% | r 100%;");         // legal How to check parsed tree??
1626
1627     pass = pass && test("$s = ' ' ? 50%;");
1628     pass = pass && test("$relationList = '<' | '<<' |  ';' | '<<<' | ',' | '=';");
1629     pass = pass && test("$p1 = ($string $s '|' $s)? 25%;");
1630     pass = pass && test("$p2 = (\\\\ $s $string $s)? 25%;");
1631     pass = pass && test("$rel2 = $p1 $string $s $p2;");
1632     pass = pass && test("$relation = $relationList $s ($rel1 | $rel2) $crlf;");
1633     pass = pass && test("$command = $commandList $crlf;");
1634     pass = pass && test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;");
1635     pass = pass && test("$mostRules = $command 1% | $reset 5% | $relation 25%;");
1636     pass = pass && test("$root = $command{0,5} $reset $mostRules{1,20};");
1637
1638     const char collationBNF[] =
1639     "$s = ' '? 50%;"
1640     "$crlf = '\r\n';"
1641
1642     "$alternateOptions = non'-'ignorable | shifted;"
1643     "$onoff = on | off;"
1644     "$caseFirstOptions = off | upper | lower;"
1645     "$strengthOptions = '1' | '2' | '3' | '4' | 'I';"
1646     "$commandList = '['"
1647     " ( alternate ' ' $alternateOptions"
1648     " | backwards ' 2'"
1649     " | normalization ' ' $onoff "
1650     " | caseLevel ' ' $onoff "
1651     " | hiraganaQ ' ' $onoff"
1652     " | caseFirst ' ' $caseFirstOptions"
1653     " | strength ' ' $strengthOptions"
1654     " ) ']';"
1655     "$command = $commandList $crlf;"
1656
1657     "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;"
1658     "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;"
1659     "$positionList = '[' (first | last) ' ' $allTypes ']';"
1660
1661     "$beforeList = '[before ' ('1' | '2' | '3') ']';"
1662
1663     "$relationList = ("
1664     "   '<'"
1665     " | '<<'"
1666     " | ';'"
1667     " | '<<<'"
1668     " | ','"
1669     " | '='"
1670     ");"
1671     "$string = $magic;"
1672     "$rel1 = '[variable top]' $s;"
1673     "$p1 = ($string $s '|' $s)? 25%;"
1674     "$p2 = (\\\\ $s $string $s)? 25%;"
1675     "$rel2 = $p1 $string $s $p2;"
1676     "$relation = $relationList $s ($rel1 | $rel2) $crlf;"
1677
1678     "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;"
1679     "$mostRules = $command 1% | $reset 5% | $relation 25%;"
1680     "$root = $command{0,5} $reset $mostRules{1,20};"
1681     ;
1682
1683     pass = pass && test(collationBNF);
1684
1685
1686     return pass;
1687 }
1688
1689 static UBool TestMorph(){
1690     srand((unsigned)time( NULL ));
1691
1692     Alternation * alt = new Alternation();
1693
1694     (*alt)
1695     .append(new Literal("a")).append(new Literal("b")).append(new Literal("c"))
1696     .append(new Literal("d")).append(new Literal("e")).append(new Literal("f"))
1697     .append(new Literal("g")).append(new Literal("h")).append(new Literal("i"))
1698     .append(new Literal("j")).append(new Literal("k")).append(new Literal("l"))
1699     .append(new Literal("m")).append(new Literal("n")).append(new Literal("o"))
1700     ;
1701
1702     Repeat * rep = new Repeat( alt ,5,5 );
1703     Morph m( *rep);
1704
1705 //    DUMP_R(TestMorph,(*rep),20);
1706     DUMP_R(TestMorph,m,100);
1707
1708     return FALSE;
1709 }
1710
1711 #endif
1712
1713 static UBool TestLanguageGenerator(){
1714     //LanguageGenerator g;
1715     //const char *const s = "$s = p 0% | q 1%;";
1716     //g.parseBNF(s, "$s");
1717     UBool pass;
1718     //= strcmp("q", g.next()) == 0;
1719
1720     const char *const def =
1721         //"$a = $b;"
1722         //"$b = $c;"
1723         //"$c = $t;"
1724         //"$t = abc $z{1,2};"
1725         //"$k = a | b | c | d | e | f | g ;"
1726         //"$z = q 0% | p 1% | r 1%;"
1727         "$x = a ? 0%;"
1728         ; // end of string
1729 //    const char * s = "abczz";
1730 //
1731 //
1732     LanguageGenerator g;
1733     pass = g.parseBNF(def, "$x",TRUE);
1734 ////    LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode());
1735 //
1736     if (pass != LanguageGenerator::OK) return FALSE;
1737
1738     DUMP_R(TestLanguageGenerator, g, 20);
1739     return pass;
1740
1741     ////UBool pass = strcmp(s,r) == 0;
1742
1743     //if (pass){
1744     //    printf("TestRandomLanguageGenerator passed.\n");
1745     //} else {
1746     //    printf("TestRandomLanguageGenerator FAILED!!!\n");
1747     //}
1748     //return pass;
1749 }
1750
1751 void TestWbnf(void){
1752     srand((unsigned)time( NULL ));
1753
1754     //CALL(TestLiteral);
1755     //CALL(TestSequence);
1756     //CALL(TestSymbolTable);
1757     //CALL(TestVariable);
1758
1759     //TestRepeat();
1760     //TestAlternation();
1761     //TestMorph();
1762
1763     //TestQuote();
1764     //TestBuffer();
1765     //TestWeightedRand();
1766
1767     //CALL(TestScanner);
1768     //CALL(TestParser);
1769     CALL(TestLanguageGenerator);
1770 }
1771