]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/wbnf.cpp
ICU-511.34.tar.gz
[apple/icu.git] / icuSources / test / intltest / wbnf.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 2005-2007, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include <stdio.h>
9 #include <string.h>
10 #include <stdlib.h>
11 #include <time.h>
12
13 #include "wbnf.h"
14
15 // Most of this code is meant to test the test code. It's a self test.
16 // Normally this isn't run.
17 #define TEST_WBNF_TEST 0
18
19 ///////////////////////////////////////////////////////////
20 //
21 // Constants and the most basic helper classes
22 //
23
// Character tables used by the classification helpers in this file.
// Each one is a NUL-terminated list of the characters belonging to the class.
static const char DIGIT_CHAR[]  = "0123456789";
static const char WHITE_SPACE[] = "\t \r\n";
static const char ALPHABET[]    = "abcdefghijklmnopqrstuvwxyz"
                                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
// ASCII punctuation, listed in code-point order.
static const char SPECIAL[]     = "!\"#$%&'()*+,-./" ":;<=>?@" "[\\]^_`" "{|}~";
28
29 static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){
30 const char * p = list;
31 for (;*p != 0 && *p != c; p++);
32 return *p?TRUE:FALSE;
33 }
34 static inline UBool isDigit(char c) {return isInList(c, DIGIT_CHAR);}
35 static inline UBool isWhiteSpace(char c) {return isInList(c, WHITE_SPACE);}
36 static inline UBool isAlphabet(char c) {return isInList(c, ALPHABET);}
37 static inline UBool isSpecialAsciiChar(char c) {return isInList(c,SPECIAL);}
38
39
40
41 ///////////////////////////////////////////////////////////
42 //
43 // Helper classes
44 //
45
class Buffer_byte{
    // Utility class: a byte array that grows automatically as data is appended.
    // There is no boundary check on reads; callers index the raw buffer directly.
    // Bytes beyond the written content are kept zeroed by expand() and reset().

    typedef char byte;
    byte * start;       // beginning of the allocation
    byte * current;     // write position: one past the last appended byte
    int buffer_size;    // allocated capacity, in bytes
public:
    // Number of bytes appended since construction or the last reset().
    inline int content_size(){return (int)(current - start);} // size unit is byte

private:
    // Grows the allocation by add_size bytes and zeroes add_size bytes starting
    // at the write position. Aborts the process if memory cannot be obtained.
    inline void expand(int add_size = 100){ // size unit is byte
        int new_size = buffer_size + add_size;
        int cs_snap = content_size();   // keep the offset: realloc may move the block

        byte * grown = (byte *) realloc(start, new_size);
        if (grown == NULL) {
            // realloc leaves the old block untouched on failure. Overwriting
            // `start` with NULL would leak it and crash in the memset below,
            // so fail loudly instead.
            fprintf(stderr, "Buffer_byte: out of memory\n");
            abort();
        }
        start = grown;
        current = start + cs_snap;

        memset(current, 0, add_size);
        buffer_size = new_size;
    }

    // Makes sure the capacity is at least `size` bytes.
    inline void expand_to(int size){
        int r = size - buffer_size;
        if (r > 0) {
            expand(r); // simply expand, no block alignment
        }
    }

    // Not copyable: declared but intentionally left undefined.
    Buffer_byte(const Buffer_byte &);
    Buffer_byte & operator = (const Buffer_byte &);
public:
    Buffer_byte():start(NULL),current(start),buffer_size(0){
        expand();
    }
    ~Buffer_byte(){
        free(start);
    }

    // Discards the content and zeroes the whole allocation; capacity is kept.
    inline void reset(){
        if (start != NULL) {
            memset(start, 0, buffer_size);
        }
        current = start;
    }

    // Appends `size` bytes copied from `c`, growing the buffer as needed.
    // A NULL source or a non-positive size appends nothing.
    inline void append(const void * c, int size){ // size unit is byte
        if (c == NULL || size <= 0) {
            return;     // never hand memcpy a NULL pointer or a negative length
        }
        expand_to(content_size() + size);
        memcpy(current, c, size);
        current = current + size;
    }

    // Raw pointer to the first byte. Any later append() may invalidate it.
    byte * buffer(){
        return start;
    }
};
100
/*
  The class(es) try to behave like a built-in array, so they overload these two operators
      operator type *();
      type & operator[];
  The first provides automatic conversion to a pointer; the latter selects an element.

  A small trick is that the class does not overload the address-of operator. This
  behavior differs from a built-in array, but it gives us the opportunity
  to get the address of the class object itself.
*/
111 //template<typename type>
112 // class BUFFER{
113 // typedef BUFFER name;
// BUFFER(type, name) defines class `name`, an array-like wrapper that stores
// elements of `type` inside a Buffer_byte. Elements are copied bytewise through
// Buffer_byte::append. The invocation supplies the closing semicolon.
// (No comments inside the macro body: they would interfere with the line continuations.)
#define BUFFER(type, name)\
class name {\
public:\
    operator type *() {return reinterpret_cast<type *>(bytes.buffer());}\
    type & operator [] (int i) {return reinterpret_cast<type *>(bytes.buffer())[i];}\
    int content_size() {return bytes.content_size() / sizeof(type);}\
    name & append(type c) {bytes.append(&c, sizeof(type)); return *this;}\
    name & append_array(const type * p, int size) {bytes.append(p, sizeof(type)*size); return *this;}\
    name & reset() {bytes.reset(); return *this;}\
private:\
    Buffer_byte bytes;\
}
126
127
class Pick{
    /* Pick is the basic element of the language generator. */
public:
    virtual ~Pick(){}

    // Generates a string according to the element's syntax.
    // Returns a null-terminated C string. The buffer is owned by the callee.
    virtual const char* next() = 0;
};
136
137 //typedef BUFFER<char> Buffer_char;
138 //typedef BUFFER<int> Buffer_int;
139 //typedef BUFFER<Pick *> Buffer_pPick;
140 BUFFER(char, Buffer_char);
141 BUFFER(int, Buffer_int);
142 BUFFER(Pick *, Buffer_pPick);
143
144 class SymbolTable{
145 /* Helper class.
146 * It's a mapping table between 'variable name' and its 'active Pick object'
147 */
148 private:
149 Buffer_char name_buffer; // var names storage space
150
151 Buffer_int names; // points to name (offset in name_buffer)
152 Buffer_pPick refs; // points to Pick
153
154 int get_index(const char *const var_name){
155 int len = names.content_size();
156 for (int i=0; i< len; i++){
157 if (strcmp(var_name, name_buffer + names[i]) == 0){
158 return i;
159 }
160 }
161 return -1;
162 }
163
164 public:
165 enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF};
166
167 RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NULL /*[out] Pick* */){
168 if (!var_name) return EMPTY; // NULL name
169
170 int i = get_index(var_name);
171 if (i == -1){
172 return NO_VAR; // new name
173 }
174 if (!refs[i]){ // exist name, no ref
175 return NO_REF;
176 } else {
177 if (ref) {
178 *ref = refs[i];
179 }
180 return HAS_REF; // exist name, has ref
181 }
182 }
183
184 void put(const char *const var_name, Pick *const var_ref = NULL){
185 int i = get_index(var_name);
186 switch(find(var_name)){
187 case EMPTY: // NULL name
188 break;
189 case NO_VAR: // new name
190 int offset;
191 offset = name_buffer.content_size();
192 name_buffer.append_array(var_name, strlen(var_name) + 1);
193 names.append(offset);
194 refs.append(var_ref);
195 break;
196 case NO_REF: // exist name, no ref
197 refs[i] = var_ref; // link definition with variable
198 break;
199 case HAS_REF: // exist name, has ref
200 if (var_ref){
201 refs[i] = var_ref;
202 }
203 break;
204 default:
205 ; // ASSERT(FALSE);
206 }
207 return;
208 }
209
210 UBool is_complete(){
211 int n = names.content_size();
212 for (int i=0; i<n; ++i){
213 if (refs[i] == NULL){
214 return FALSE;
215 }
216 }
217 return TRUE;
218 }
219
220 void reset(){
221 names.reset();
222 name_buffer.reset();
223
224 // release memory here
225 int s = refs.content_size();
226 for (int i=0; i < s; i++){
227 delete refs[i]; // TOFIX: point alias/recursion problem
228 }
229 refs.reset();
230 }
231
232 ~SymbolTable(){
233 reset();
234 }
235 };
236
237
238 /*
239 // Document of class Escaper
240 //
241 // ATTENTION:
242 // From http://icu-project.org/userguide/Collate_Customization.html.
243 // We get the precedence of escape/quote operations
244 //
245 // (highest) 1. backslash \
246 // 2. two single quotes ''
247 // 3. quoting ' '
248 //
249 // ICU Collation should accept following as the same string.
250 //
251 // 1) 'ab'c _
252 // 2) a\bc \
253 // 3) a'b'\c |- They are equal.
254 // 4) abc _/
255 //
256 // From "two single quotes", we have following deductions
//  D1. empty quoting is illegal. (obviously)
258 // D2. no contact operation between two quotings
259 // '.''.' is not .. it is .'.
260 // D3. "two single quotes" cannot contact two quoting simultaneously
261 // '..''''.' is not ..'. it is ..''.
262 // NOTICE:
263 // "two single quotes" can contact before one quoting
264 // '''.' is '.
265 // "two single quotes" can literally contact after one quoting
266 // But, from syntax, it's one quoting including a "two single quotes"
267 // '.''' is .'
268 // D4. "two single quotes" cannot solely be included in quoting
269 // '''' is not ' it is ''
270 // NOTICE: These are legal
271 // '.''.' is .'.
272 // '.''' is .'
273 //
//                  decision
275 // /\
276 // /__\
277 // output buffer input buffer
278 //
// To make our decision (within an atomic operation) without caring about the input and output buffers,
// the following calling patterns (within an atomic operation) shall be avoided
281 //
282 // P1 open_quoting() then close_quoting() (direct violation) D1
283 // P2 close_quoting() then open_quoting() (direct violation) D2
284 // P3 empty open_quoting() (indirect violation) D1, D4
285 // P4 empty close_quoting() (indirect violation) D2, D3
286 // P5 open_quoting() then two single quotes (indirect violation) D4
287 // P6 close_quoting() then two single quotes (indirect violation) D3
288 //
289 // two single quotes escaping will not open_ or close_ quoting()
//     The choice will not lose some quoting forms.
291 //
292 // For open_quoting(),
293 // we may get this form quoting ''' P5
294 // It may raise a bug ''''x
295 // If we expect
296 // '''.' let the next char open the quoting
297 // '.''.' the quoting is already opened by preceding char
298 //
299 // For close_quoting()
300 // we will get this form quoting '.''' P6
301 // It may raise a bug '.''''.'
302 // If we expect
303 // '.'''\. let the next char close the quoting
304 // '.''''.' the expectation is wrong! using '.'\''.' instead
305 //
// It is hard work to re-adjust the generation opportunities for the various escaping forms,
// so we simply ignore it.
308 */
309 class Escaper{
310 public:
311 enum CHOICE {YES, NO, RAND};
312 enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC};
313 private:
314 class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class
315 private:
316 const CHOICE tag;
317 public:
318 Bool(CHOICE flag=RAND):tag(flag){}
319 operator UBool() { // conversion operator
320 return tag == RAND ? rand()%2 : tag == YES;
321 //if (tag == RAND){
322 // return rand()%2 == 1;
323 //} else {
324 // return tag == YES ? TRUE : FALSE;
325 //}
326 }
327 };
328 public:
329 Escaper(CHOICE escapeLiteral = RAND,
330 CHOICE twoQuotesEscape = RAND,
331 ESCAPE_FORM escapeForm = RAND_ESC):
332 escape_form(escapeForm),
333 escape_literal(escapeLiteral),
334 two_quotes_escape(twoQuotesEscape),
335 is_quoting(FALSE){}
336 private:
337 Buffer_char str;
338 ESCAPE_FORM escape_form;
339 Bool escape_literal;
340 Bool two_quotes_escape;
341 UBool quote_escape;
342 UBool bslash_escape;
343 UBool is_quoting;
344
345 void set_options(){
346 ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : escape_form;
347 switch (t){
348 case BSLASH_ONLY :
349 bslash_escape = TRUE; quote_escape = FALSE; break;
350 case QUOTE_ONLY:
351 bslash_escape = FALSE;quote_escape = TRUE; break;
352 case QUOTE_AND_BSLAH:
353 bslash_escape = TRUE; quote_escape = TRUE; break;
354 default:
355 ;// error
356 }
357 }
358
359 void reset(){
360 str.reset();
361 is_quoting = FALSE;
362 }
363
364 inline void open_quoting(){
365 if(is_quoting){
366 // do nothing
367 } else {
368 str.append('\'');
369 is_quoting = TRUE;
370 }
371 }
372 inline void close_quoting(){
373 if(is_quoting){
374 str.append('\'');
375 is_quoting = FALSE;
376 } else {
377 // do nothing
378 }
379 }
380
381 // str [in] null-terminated c-string
382 void append(const char * strToAppend){
383 for(;*strToAppend != 0; strToAppend++){
384 append(*strToAppend);
385 }
386 }
387
388 inline void append(const char c){
389 set_options();
390
391 if (c == '\\'){
392 quote_escape ? open_quoting() : close_quoting();
393 //bslash_escape always true here
394 str.append('\\');
395 str.append('\\');
396 } else if (c == '\''){
397 if (two_quotes_escape){ // quoted using two single quotes
398 // See documents in anonymous.design
399 str.append('\'');
400 str.append('\'');
401 } else{
402 quote_escape ? open_quoting() : close_quoting();
403 //bslash_escape always true here
404 str.append('\\');
405 str.append('\'');
406 }
407 } else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){
408 quote_escape ? open_quoting() : close_quoting();
409 if (bslash_escape) str.append('\\');
410 str.append(c);
411 } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal
412 if (escape_literal){
413 quote_escape ? open_quoting() : close_quoting();
414 if (bslash_escape) str.append('\\');
415 str.append(c);
416 } else {
417 close_quoting();
418 str.append(c);
419 }
420 }
421 }
422
423 public:
424 // Return a null-terminate c-string. The buffer is owned by callee.
425 char * operator()(const char * literal /*c-string*/){
426 str.reset();
427 for(;*literal != 0; literal++){
428 append(*literal);
429 }
430 close_quoting(); // P4 exception, to close whole quoting
431 return str;
432 }
433 };
434
435 class WeightedRand{
436 // Return a random number in [0, size)
437 // Every number has different chance (aka weight) to be selected.
438 private:
439 Buffer_int weights;
440 double total;
441 WeightedRand(const WeightedRand &);
442 WeightedRand & operator = (const WeightedRand &);
443 public:
444 WeightedRand(Buffer_int * weight_list = NULL, int size = 0){
445 if ( weight_list == NULL){
446 for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT);
447 } else {
448 int s = weight_list->content_size();
449 if (s < size){
450 weights.append_array( (*weight_list),s);
451 for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT);
452 } else { // s >= size
453 weights.append_array( (*weight_list),size);
454 }
455 }
456 total = 0;
457 int c = weights.content_size();
458 for (int i=0; i<c; ++i){
459 total += weights[i];
460 }
461 }
462
463 void append(int weight){
464 weights.append(weight);
465 total += weight;
466 }
467
468 // Give a random number with the consideration of weight.
469 // Every random number is associated with a weight.
470 // It identifies the chance to be selected,
471 // larger weight has more chance to be selected.
472 //
473 //
474 // ______________________ every slot has equal chance
475 //
476 // [____][_][___][______] each item has different slots, hence different chance
477 //
478 //
479 // The algorithms to generate the number is illustrated by preceding figure.
480 // First, a slot is selected by rand(). Then we translate the slot to corresponding item.
481 //
482 int next(){
483 // get a random in [0,1]
484 double reference_mark = (double)rand() / (double)RAND_MAX;
485
486 // get the slot's index, 0 <= mark <= total;
487 double mark = total * reference_mark;
488
489 // translate the slot to corresponding item
490 int i=0;
491 for (;;){
492 mark -= weights[i]; // 0 <= mark <= total
493 if (mark <= 0)
494 break;
495 i++;
496 }
497 return i;
498 }
499 };
500
501 ///////////////////////////////////////////////////////////
502 //
503 // The parser result nodes
504 //
505
506 class Literal : public Pick {
507 public:
508 virtual const char* next(){
509 return str;
510 }
511 Literal(const char * s /*c-string*/){
512 str.append_array(s, strlen(s) + 1);
513 }
514 private:
515 Buffer_char str; //null-terminated c-string
516 };
517
518 class Variable : public Pick {
519 public:
520 Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){
521 this->var_name.append_array(varName, strlen(varName) + 1);
522 if ((symbol_table = symbols)){
523 symbol_table->put(varName, varRef);
524 }
525 }
526
527 operator const char *(){
528 return var_name;
529 }
530
531 virtual const char* next(){
532 if (symbol_table){
533 Pick * var_ref = NULL;
534 symbol_table->find(var_name, &var_ref);
535 if (var_ref) {
536 return var_ref->next();
537 }
538 }
539 return ""; // dumb string
540 }
541 private:
542 Buffer_char var_name;
543 SymbolTable * symbol_table;
544 };
545
546 class Quote : public Pick{
547 public:
548 Quote(Pick & base):item(base),e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ONLY){
549 }
550 virtual const char* next(){
551 return e(item.next());
552 }
553 private:
554 Pick & item;
555 Buffer_char str;
556 Escaper e;
557 };
558
559
560 class Morph : public Pick{
561 /*
562 The difference between morph and an arbitrary random string is that
563 a morph changes slowly. When we build collation rules, for example,
564 it is a much better test if the strings we use are all in the same
565 'neighborhood'; they share many common characters.
566 */
567 public:
568 Morph(Pick & base):item(base){}
569
570 virtual const char* next(){
571 current.reset();
572 const char * s = item.next();
573 current.append_array(s, strlen(s) + 1);
574 if (last.content_size() == 0) {
575 str.reset();
576 last.reset();
577 str.append_array(current, current.content_size());
578 last.append_array(current, current.content_size());
579 } else {
580 morph();
581 }
582 return str;
583 }
584 private:
585 Pick & item;
586 Buffer_char str;
587 Buffer_char last;
588 Buffer_char current;
589
590 char * p_last;
591 char * p_curr;
592
593 void copy_curr(){
594 if (*p_curr) {
595 str.append(*p_curr);
596 p_curr++;
597 }
598 }
599
600 void copy_last(){
601 if (*p_last) {
602 str.append(*p_last);
603 p_last++;
604 }
605 }
606
607 // copy 0, 1, or 2 character(s) to str
608 void copy(){
609 static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5);
610
611 switch (wr.next()){
612 case 0: // copy last -- has 10 times chance than others
613 copy_last();
614 break;
615 case 1: // copy both
616 copy_curr();
617 copy_last();
618 break;
619 case 2: // copy both
620 copy_last();
621 copy_curr();
622 break;
623 case 3:
624 copy_curr();
625 break;
626 case 4: // copy nothing
627 break;
628 default:
629 // ASSERT(FALSE);
630 ;
631 }
632 }
633
634 void morph(void){
635 int min = strlen(last);
636 int max = strlen(current);
637 if (min > max){
638 int temp = min;
639 min = max;
640 max = temp;
641 }
642
643 int len = min + rand()%(max - min + 1); // min + [0, diff]
644 p_curr = current;
645 p_last = last;
646 str.reset();
647
648 for (; str.content_size()<len && *p_curr && *p_last;){
649 copy(); // copy 0, 1, or 2 character(s) to str
650 }
651
652 if (str.content_size() == len) {
653 str.append(0);
654 final();
655 return;
656 }
657
658 if (str.content_size() > len) { // if the last copy copied two characters
659 str[len]=0;
660 final();
661 return;
662 }
663
664 // str.content_size() < len
665 if (*p_last) {
666 for (; str.content_size() < len; copy_last());
667 } else if (*p_curr){
668 for (; str.content_size() < len; copy_curr());
669 }
670
671 int last_len = last.content_size();
672 for (;str.content_size() < len;){
673 str.append(last[rand()%last_len]);
674 }
675 str.append(0);
676 final();
677 }
678
679 void final(){
680 last.reset();
681 last.append_array(current, current.content_size());
682 }
683 };
684
685 class Sequence : public Pick {
686 public:
687 virtual const char* next(){
688 str.reset();
689 int s = items.content_size();
690 for(int i=0; i < s; i++){
691 const char * t = items[i]->next();
692 str.append_array(t, strlen(t));
693 }
694 str.append(0); // terminal null
695 return str;
696 }
697
698 void append (Pick * node){
699 items.append(node);
700 }
701
702 virtual ~Sequence(){
703 int s = items.content_size();
704 for(int i=0; i < s; i++){
705 //How can assure the item is got from heap?
706 //Let's assume it.
707 delete items[i]; // TOFIX: point alias/recursion problem
708 items[i] = NULL;
709 }
710 }
711 private:
712 Buffer_pPick items;
713 Buffer_char str; //null-terminated c-string
714 };
715
716 class Repeat : public Pick {
717 private:
718 Pick * item;
719 Buffer_char str;
720 WeightedRand wr;
721 int min;
722 int max;
723 int select_a_count(){
724 return min + wr.next();
725 }
726 public:
727 virtual const char* next(){
728 str.reset();
729 int c = select_a_count();
730 for(int i=0; i< c; i++){
731 const char * t = item->next();
732 str.append_array(t, strlen(t));
733 }
734 str.append(0);
735 return str;
736 }
737
738 Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights = NULL):
739 wr(weights, maxCount-minCount +1) {
740 this->item = base;
741 this->min = minCount;
742 this->max = maxCount;
743 }
744 virtual ~Repeat(){
745 delete item; // TOFIX: point alias/recursion problem
746 item = NULL;
747 }
748 };
749
750
751 class Alternation : public Pick {
752 public:
753 virtual const char* next(){
754 str.reset();
755 int i = wr.next();
756 const char * t = items[i]->next();
757 str.append_array(t, strlen(t) + 1);
758 return str;
759 }
760 virtual ~Alternation(){
761 int s = items.content_size();
762 for(int i=0; i < s; i++){
763 delete items[i]; // TOFIX: point alias/recursion problem
764 items[i] = NULL;
765 }
766 }
767
768 Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){
769 items.append(node);
770 wr.append(weight);
771 return *this;
772 }
773 private:
774 Buffer_pPick items;
775 Buffer_char str; // null-terminated c-string
776 WeightedRand wr;
777 };
778
779 ///////////////////////////////////////////////////////////
780 //
781 // The parser
782 //
783
// Kinds of token produced by the scanner.
enum TokenType {
    STRING,         // literal text
    VAR,            // '$' followed by a name
    NUMBER,         // run of digits
    STREAM_END,     // end of the input
    ERROR,          // unrecognized input
    QUESTION,       // ?
    STAR,           // *
    PLUS,           // +
    LBRACE,         // {
    RBRACE,         // }
    LPAR,           // (
    RPAR,           // )
    SEMI,           // ;
    EQ,             // =
    COMMA,          // ,
    BAR,            // |
    AT,             // @
    WAVE,           // ~
    PERCENT         // %
};
785
786 class Scanner{
787 friend int DumpScanner(Scanner & s, UBool dumb);
788 private:
789 const char * source;
790 const char * working;
791 const char * history; // for debug
792 enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLASH, IN_BSLASH, IN_STRING, DONE};
793 StateType state;
794 void terminated(TokenType t){
795 working--; // return the peeked character
796 tokenType = t;
797 token.append(0); // close buffer
798 state = DONE;
799 }
800 public:
801 // the buffer of "source" is owned by caller
802 Scanner(const char *src/*[in] c-string*/ = NULL):source(src){
803 working = src;
804 history = working;
805 state = DONE;
806 tokenType = ERROR;
807 }
808
809 //void setSource(const char *const src /*[in] c-string*/){
810 // *(&const_cast<const char *>(source)) = src;
811 //}
812
813 Buffer_char token;
814 TokenType tokenType;
815
816 TokenType getNextToken(){
817 token.reset();
818 state = START;
819 history = working; // for debug
820 while (state != DONE){
821 char c = *working++;
822 if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE
823 terminated(ERROR);
824 break; // while
825 }
826 switch(state){
827 case START:
828 tokenType = ERROR;
829 switch(c){
830 case '?' : tokenType = QUESTION; break;
831 case '*' : tokenType = STAR; break;
832 case '+' : tokenType = PLUS; break;
833 case '{' : tokenType = LBRACE; break;
834 case '}' : tokenType = RBRACE; break;
835 case '(' : tokenType = LPAR; break;
836 case ')' : tokenType = RPAR; break;
837 case ';' : tokenType = SEMI; break;
838 case '=' : tokenType = EQ; break;
839 case ',' : tokenType = COMMA; break;
840 case '|' : tokenType = BAR; break;
841 case '@' : tokenType = AT; break;
842 case '~' : tokenType = WAVE; break;
843 case '%' : tokenType = PERCENT; break;
844 case 0 : tokenType = STREAM_END; working-- /*avoid buffer overflow*/; break;
845 }
846 if (tokenType != ERROR){
847 token.append(c);
848 token.append(0);
849 state = DONE;
850 break; // START
851 }
852 switch(c){
853 case '$' : state = IN_VAR_FIRST; token.append(c); break;
854 case '\'' : state = IN_QUOTE; break;
855 case '\\' : state = IN_BSLASH; break;
856 default:
857 if (isWhiteSpace(c)){ // state = START; //do nothing
858 } else if (isDigit(c)){ state = IN_NUM; token.append(c);
859 } else if (isAlphabet(c)){ state = IN_STRING; token.append(c);
860 } else {terminated(ERROR);}
861 }
862 break;//START
863 case IN_NUM:
864 if (isDigit(c)){
865 token.append(c);
866 } else {
867 terminated(NUMBER);
868 }
869 break;//IN_NUM
870 case IN_VAR_FIRST:
871 if (isAlphabet(c)){
872 token.append(c);
873 state = IN_VAR;
874 } else {
875 terminated(ERROR);
876 }
877 break; // IN_VAR_FISRT
878 case IN_VAR:
879 if (isAlphabet(c) || isDigit(c)){
880 token.append(c);
881 } else {
882 terminated(VAR);
883 }
884 break;//IN_VAR
885 case IN_STRING:
886 // About the scanner's behavior for STRING, AT, and ESCAPE:
887 // All of them can be contacted with each other.
888 // This means the scanner will eat up as much as possible strings
889 // (STRING, AT, and ESCAPE) at one time, with no regard of their
890 // combining sequence.
891 //
892 if (c == '\''){
893 state = IN_QUOTE; // the first time we see single quote
894 } else if (c =='\\'){ // back slash character
895 state = IN_BSLASH;
896 } else if (isAlphabet(c) || isDigit(c)){
897 token.append(c);
898 } else{
899 terminated(STRING);
900 }
901 break;//IN_STRING
902 case IN_QUOTE:
903 if (c == '\''){ // the second time we see single quote
904 state = IN_STRING; // see document in IN_STRING
905 } else if ( c== '\\') { // backslah escape in quote
906 state = IN_QUOTE_BSLASH;
907 } else {
908 token.append(c); // eat up everything, includes back slash
909 }
910 break;//IN_QUOTE
911 case IN_QUOTE_BSLASH:
912 case IN_BSLASH:
913 switch (c){
914 case 'n' : token.append('\n'); break;
915 case 'r' : token.append('\r'); break;
916 case 't' : token.append('\t'); break;
917 case '\'' : token.append('\''); break;
918 case '\\' : token.append('\\'); break;
919 default: token.append(c); // unknown escaping, treat it as literal
920 }
921 if (state == IN_BSLASH){
922 state = IN_STRING; // see document in IN_STRING
923 } else { // state == IN_QUOTE_BSLASH
924 state = IN_QUOTE;
925 }
926 break;//IN_BSLASH
927 case DONE: /* should never happen */
928 default:
929 working--;
930 tokenType = ERROR;
931 state = DONE;
932 break;
933 }//switch(state)
934 }//while (state != DONE)
935
936 return tokenType;
937 }
938 };//class Scanner
939
940 class Parser{
941 friend UBool TestParser();
942 friend class TestParserT;
943 friend class LanguageGenerator_impl;
944 private:
945 Scanner s;
946 TokenType & token;
947 int min_max; // for the evil infinite
948
949 UBool match(TokenType expected){
950 if (token == expected) {
951 token = s.getNextToken();
952 return TRUE;
953 } else {
954 //s.dumpCurrentPoint();
955 return FALSE;
956 }
957 }
958
959 UBool weight(int & value){
960 if (token == NUMBER){
961 int temp = atoi(s.token);
962 match(NUMBER);
963 if (match(PERCENT)){
964 value = temp;
965 return TRUE;
966 }
967 }
968 return FALSE;
969 }
970
971 UBool repeat (Pick* &node /*in,out*/){
972 if (node == NULL) return FALSE;
973
974 int count = -2;
975 int min = -2;
976 int max = -2;
977 UBool question = FALSE;
978 switch (token){
979 case QUESTION:
980 match(QUESTION);
981 min = 0;
982 max = 1;
983 count = 2;
984 question = TRUE;
985 break;
986 case STAR:
987 match(STAR);
988 min = 0;
989 max = -1;
990 count = -1;
991 break;
992 case PLUS:
993 match(PLUS);
994 min = 1;
995 max = -1;
996 count = -1;
997 break;
998 case LBRACE:
999 match(LBRACE);
1000 if (token != NUMBER){
1001 return FALSE;
1002 }else {
1003 min = atoi(s.token);
1004 match(NUMBER);
1005 if (token == RBRACE){
1006 match(RBRACE);
1007 max = min;
1008 count = 1;
1009 } else if (token == COMMA) {
1010 match(COMMA);
1011 if (token == RBRACE){
1012 match(RBRACE);
1013 max = -1;
1014 count = -1;
1015 } else if (token == NUMBER) {
1016 max = atoi(s.token);
1017 match(NUMBER);
1018 count = max - min + 1;
1019 if (!match(RBRACE)) {
1020 return FALSE;
1021 }
1022 } else {
1023 return FALSE;
1024 }
1025 } else {
1026 return FALSE;
1027 }
1028 }
1029 break;
1030 default:
1031 return FALSE;
1032 }
1033
1034 if (count == -2 || min == -2 || max == -2){
1035 //ASSERT(FALSE);
1036 return FALSE;
1037 }
1038
1039 // eat up following weights
1040 Buffer_int weights;
1041 int w;
1042 while (weight(w)){
1043 weights.append(w);
1044 }
1045
1046 // for the evil infinite
1047 min_max = min_max > min ? min_max : min;
1048 min_max = min_max > max ? min_max : max;
1049 if (min_max > PSEUDO_INFINIT){
1050 return FALSE; // PSEUDO_INFINIT is less than the real maximum
1051 }
1052 if (max == -1){ // the evil infinite
1053 max = PSEUDO_INFINIT;
1054 }
1055 // for the strange question mark
1056 if (question && weights.content_size() > 0){
1057 Buffer_int w2;
1058 w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]);
1059 node = new Repeat(node,min,max,&w2);
1060 return TRUE;
1061 }
1062 node = new Repeat(node,min,max,&weights);
1063 return TRUE;
1064 }
1065
1066 UBool core(Pick* &node /*out*/){
1067 if (node != NULL) return FALSE; //assert node == NULL
1068
1069 switch(token){
1070 case LPAR:
1071 match(LPAR);
1072 if(defination(node) && match(RPAR)){
1073 return TRUE;
1074 }
1075 return FALSE;
1076 case VAR:
1077 node = new Variable(&symbols, s.token);
1078 match(VAR);
1079 return TRUE;
1080 case STRING:
1081 node = new Literal(s.token);
1082 match(STRING);
1083 return TRUE;
1084 default:
1085 return FALSE;
1086 }
1087 }
1088 UBool modified(Pick* &node /*out*/){
1089 if (node != NULL) return FALSE; //assert node == NULL
1090
1091 if (!core(node)) {
1092 return FALSE;
1093 }
1094
1095 for (;;){
1096 switch(token){
1097 case WAVE:
1098 match(WAVE);
1099 node = new Morph(*node);
1100 break;
1101 case AT:
1102 match(AT);
1103 node = new Quote(*node);
1104 break;
1105 case QUESTION:
1106 case STAR:
1107 case PLUS:
1108 case LBRACE:
1109 if (!repeat(node)) return FALSE;
1110 break;
1111 case SEMI: // rule definiation closed
1112 case RPAR: // within parenthesis (core closed)
1113 case BAR: // in alternation
1114 case NUMBER: // in alternation, with weight
1115 case LPAR: // in sequence
1116 case VAR: // in sequence
1117 case STRING: // in sequence
1118 return TRUE;
1119 default:
1120 return FALSE;
1121 }
1122 }
1123 }
1124
1125
1126 UBool sequence_list(Pick* &node /*in,out*/){
1127 if (node == NULL) return FALSE; // assert node != NULL
1128
1129 Sequence* seq = new Sequence();
1130 Pick * n = node;
1131
1132 while (token == VAR || token == STRING || token == LPAR){
1133 seq->append(n);
1134 n = NULL;
1135 if (modified(n)){
1136 // go on
1137 } else {
1138 goto FAIL;
1139 }
1140 }
1141
1142 if (token == SEMI || token == RPAR || token == BAR){
1143 seq->append(n);
1144 node = seq;
1145 return TRUE;
1146 }
1147 FAIL:
1148 delete seq;
1149 return FALSE;
1150
1151 }
1152
1153 UBool sequence(Pick* &node /*out*/){
1154 if (node != NULL) return FALSE; //assert node == NULL
1155
1156 if (!modified(node)) {
1157 return FALSE;
1158 }
1159
1160 if (token == VAR || token == STRING || token == LPAR){
1161 return sequence_list(node);
1162 } else {
1163 return TRUE; // just a modified
1164 }
1165 }
1166
1167 UBool alternation_list(Pick* &node /*in,out*/){
1168 if (node == NULL) return FALSE; // assert node != NULL
1169
1170 Alternation * alt = new Alternation();
1171 Pick * n = node;
1172 int w = DEFAULT_WEIGHT;
1173
1174 while (token == NUMBER || token == BAR){
1175 if(token == NUMBER) {
1176 if (weight(w)){
1177 if (token == BAR){
1178 // the middle item, go on
1179 } else {
1180 // the last item or encounter error
1181 break; //while
1182 }
1183 } else {
1184 goto FAIL;
1185 }
1186 } // else token == BAR
1187 match(BAR);
1188 alt->append(n,w);
1189
1190 n = NULL;
1191 w = DEFAULT_WEIGHT;
1192 if (sequence(n)){
1193 // go on
1194 } else {
1195 goto FAIL;
1196 }
1197 }
1198
1199 if (token == SEMI || token == RPAR) {
1200 alt->append(n,w);
1201 node = alt;
1202 return TRUE;
1203 }
1204 FAIL:
1205 delete alt;
1206 return FALSE;
1207 }
1208
1209 UBool alternation(Pick* &node /*out*/){
1210 if (node != NULL) return FALSE; //assert node == NULL
1211
1212 // 'sequence' has higher precedence than 'alternation'
1213 if (!sequence(node)){
1214 return FALSE;
1215 }
1216
1217 if (token == BAR || token == NUMBER){ // find a real alternation1, create it.
1218 return alternation_list(node);
1219 } else {
1220 return TRUE; // just a sequence_old
1221 }
1222 }
1223
1224
1225 UBool defination(Pick* &node /*out*/){
1226 if (node != NULL) return FALSE; //assert node == NULL
1227 return alternation(node);
1228 }
1229
1230 UBool rule(){
1231 if (token == VAR){
1232 Buffer_char name;
1233 name.append_array(s.token, strlen(s.token) + 1);
1234 match(VAR);
1235
1236 if (match(EQ)){
1237 Pick * t = NULL;
1238 if(defination(t)){
1239 symbols.put(name, t);
1240 return match(SEMI);
1241 }
1242 }
1243 }
1244 return FALSE;
1245 }
1246 public:
1247 UBool rules(){
1248 symbols.reset();
1249 token = s.getNextToken();
1250 while (rule()){
1251 }
1252 if (token == STREAM_END){
1253 return TRUE;
1254 } else {
1255 //s.dumpCurrentPoint();
1256 return FALSE;
1257 }
1258 }
1259
1260 public:
1261 SymbolTable symbols;
1262
1263 Parser(const char *const source):s(source), token(s.tokenType){
1264 min_max = -2;
1265 }
1266 UBool parse(){
1267 return rules();
1268 }
1269
1270 }; // class Parser
1271
1272
1273 ///////////////////////////////////////////////////////////
1274 //
1275 //
1276 //
1277
1278 int DumpScanner(Scanner & s, UBool dump = TRUE){
1279 int len = strlen(s.source);
1280 int error_start_offset = s.history - s.source;
1281 if (dump){
1282 printf("\n=================== DumpScanner ================\n");
1283 fwrite(s.source, len, 1, stdout);
1284 printf("\n-----parsed-------------------------------------\n");
1285 fwrite(s.source, s.history - s.source, 1, stdout);
1286 printf("\n-----current------------------------------------\n");
1287 fwrite(s.history, s.working - s.history, 1, stdout);
1288 printf("\n-----unparsed-----------------------------------\n");
1289 fwrite(s.working, (s.source + len - s.working), 1, stdout);
1290 printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");
1291 }
1292 return error_start_offset;
1293 }
1294
1295 class LanguageGenerator_impl{
1296 public:
1297 LanguageGenerator_impl(const char *const bnf_definition, const char *const top_node)
1298 :par(bnf_definition), top_node_name(top_node){
1299 srand((unsigned)time( NULL ));
1300 }
1301
1302 LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){
1303 if (par.parse()){
1304 if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::HAS_REF) {
1305 if (par.symbols.is_complete()) {
1306 return LanguageGenerator::OK;
1307 } else {
1308 if (debug) printf("The bnf definition is incomplete.\n");
1309 return LanguageGenerator::INCOMPLETE;
1310 }
1311 } else {
1312 if (debug) printf("No top node is found.\n");
1313 return LanguageGenerator::NO_TOP_NODE;
1314 }
1315 } else {
1316 if(debug) {
1317 printf("The bnf definition is wrong\n");
1318 DumpScanner(par.s, TRUE);
1319 }
1320 return LanguageGenerator::BNF_DEF_WRONG;
1321 }
1322 }
1323 const char * next(){
1324 return top_node_ref->next();
1325 }
1326
1327 private:
1328 Parser par;
1329 const char *const top_node_name;
1330 Pick * top_node_ref;
1331 };
1332
1333 LanguageGenerator::LanguageGenerator():lang_gen(NULL){
1334 }
1335
1336 LanguageGenerator::~LanguageGenerator(){
1337 delete lang_gen;
1338 }
1339
1340 LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug){
1341 if (lang_gen){
1342 delete lang_gen;
1343 }
1344 lang_gen = new LanguageGenerator_impl(bnf_definition, top_node);
1345 PARSE_RESULT r = lang_gen->parseBNF(debug);
1346 if (r != OK){
1347 delete lang_gen;
1348 lang_gen = NULL;
1349 return r;
1350 } else {
1351 return r;
1352 }
1353 }
1354 const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The buffer is owned by callee.
1355 if (lang_gen){
1356 return lang_gen->next();
1357 }else {
1358 return "";
1359 }
1360 }
1361
1362 ///////////////////////////////////////////////////////////
1363 //
1364 // The test code for WBNF
1365 //
1366
// Runs the zero-argument test 'fun' and prints "Pass: <fun>" or
// "FAILED: !!! <fun> !!!" depending on its truth value.
// Wrapped in do { } while (0) so that 'CALL(f);' is a single statement and
// stays correct inside an unbraced if/else.
#define CALL(fun) \
    do { \
        if (fun()){ \
            printf("Pass: " #fun "\n");\
        } else { \
            printf("FAILED: !!! " #fun " !!!\n"); \
        } \
    } while (0)
1373
// Prints a banner naming 'fun', then 'times' successive results of
// var.next(), one per line, followed by a closing rule.
// 'var' and 'times' are parenthesised so that arbitrary expressions can be
// passed, and the body is wrapped in do { } while (0) so that
// 'DUMP_R(...);' is a single statement, safe inside an unbraced if/else.
#define DUMP_R(fun, var, times) \
    do { \
        printf("\n========= " #fun " =============\n"); \
        for (int i=0; i<(times); i++) { \
            const char * t = (var).next();\
            fwrite(t,strlen(t),1,stdout); \
            printf("\n"); \
        } \
        printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"); \
    } while (0)
1382
1383
1384
1385 #if TEST_WBNF_TEST
1386 static UBool TestQuote(){
1387 const char *const str = "This ' A !,z| qq [] .new\tline";
1388 //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline";
1389 ////
1390 //// :( we must quote our string to following C syntax
1391 //// cannot type the literal here, it makes our code rather human unreadable
1392 //// very very unconformable!
1393 ////
1394 ///*
1395 //*/
1396
1397 //const char *const s1 = "ab'c";
1398 //const char (* s1_r1) [] = { "ab''c", // ab''c
1399 // "ab\\'c", // ab\'c
1400 // };//
1401 ///*
1402 // . '.' \.
1403 // .. \.\. '.'\. '.'\. '..' // '.''.' wrong
1404 //*/
1405
1406 //const char *const s2 = "a..'.b"; // a..'.b
1407 //const char (*s2_r) [] = { "a'..''.'b" // a'..''.'b
1408 // ,"a'..\\'.'b" // a'..\'.'b
1409 // ,"a'..'\\''.'b" // a'..'\''.'b
1410 // };//
1411
1412 //const char *const s3 = "a..\\.b"; // a..\.b
1413 //const char (*s3_r) [] = { "a'..\\\\.'b" // a'..\\.'b
1414 // ,"a'..'\\\\'.'b" // a'..'\\'.'b
1415 // };//
1416
1417 // // no catact operation, no choice, must be compact
1418
1419 srand((unsigned)time( NULL ));
1420
1421 //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC);
1422 Pick *p = new Literal(str);
1423 Quote q(*p);
1424
1425 DUMP_R(TestQuote, (*p), 1);
1426 DUMP_R(TestQuote, q, 20);
1427 return FALSE;
1428 }
1429 static UBool TestLiteral(){
1430 const char * s = "test string99.";
1431 Literal n(s);
1432 const char * r = n.next();
1433 return strcmp(s,r) == 0;
1434 }
1435
1436 static UBool TestSequence(){
1437 Sequence seq;
1438 seq.append(new Literal("abc "));
1439 seq.append(new Literal(", s"));
1440
1441 return strcmp(seq.next(), "abc , s") == 0;
1442 }
1443 static UBool TestAlternation(){
1444 srand((unsigned)time( NULL ));
1445 Alternation alt;
1446 alt.append(new Literal("aaa_10%"),10);
1447 alt.append(new Literal("bbb_0%"),0);
1448 alt.append(new Literal("ccc_10%"),10);
1449 alt.append(new Literal("ddddddd_50%"),50);
1450
1451 DUMP_R(TestAlternation, alt, 50);
1452
1453 return FALSE;
1454 }
1455
1456 static UBool TestBuffer(){
1457 Buffer_int t;
1458 t.append(1).append(0).append(5);
1459 int s = t.content_size();
1460 for (int i=0; i<s; ++i){
1461 printf("%d\n", t[i]);
1462 }
1463 return FALSE;
1464 }
1465
1466 static UBool TestWeightedRand(){
1467 srand((unsigned)time( NULL ));
1468 Buffer_int t;
1469 t.append(1).append(0).append(5);
1470 WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4);
1471 // WeightedRand wr(&t,3);
1472 for (int i=0; i< 50; ++i){
1473 printf("%d\n", wr.next());
1474 }
1475 return FALSE;
1476 }
1477
1478 static UBool TestRepeat(){
1479 srand((unsigned)time( NULL ));
1480 Repeat rep(new Literal("aaa1-5 "), 1, 5);
1481 DUMP_R(TestRepeat, rep, 50);
1482
1483 Repeat r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append(0).append(5));
1484 DUMP_R(TestRepeat, r2, 50);
1485
1486 Repeat r3(new Literal("aaa5-5 "), 5, 5);
1487 DUMP_R(TestRepeat, r3, 50);
1488
1489 return FALSE;
1490 }
1491
1492 static UBool TestVariable(){
1493 SymbolTable tab;
1494 Pick * value = new Literal("string1");
1495 Variable var1(&tab, "x", value);
1496
1497 Variable var2(&tab, "y");
1498 // tab.put(var2, value); // TOFIX: point alias/recursion problem
1499 Pick * value2 = new Literal("string2");
1500 tab.put(var2, value2);
1501
1502 Pick * value3 = new Literal("string3");
1503 Variable var3(&tab, "z");
1504 tab.put("z", value3);
1505
1506 UBool pass;
1507 pass = strcmp(var1.next(), value->next()) == 0;
1508 pass = pass && strcmp(var2.next(), value2->next()) == 0;
1509 pass = pass && strcmp(var3.next(), value3->next()) == 0;
1510 return pass;
1511 }
1512
1513 static UBool TestSymbolTable(){
1514 Literal * n1 = new Literal("string1");
1515 Literal * n2 = new Literal("string2");
1516 SymbolTable t;
1517 t.put("abc", n1);
1518 t.put("$aaa", n2);
1519 // t.put("alias", n1); // TOFIX: point alias/recursion problem
1520 t.put("bbb");
1521
1522 UBool pass;
1523 pass = t.find(NULL) == SymbolTable::EMPTY;
1524 pass = pass && t.find("ccc") == SymbolTable::NO_VAR;
1525 pass = pass && t.find("bbb") == SymbolTable::NO_REF;
1526 pass = pass && t.find("abc") == SymbolTable::HAS_REF;
1527 pass = pass && t.find("$aaa") == SymbolTable::HAS_REF;
1528
1529 t.reset();
1530 pass = pass && t.find("abc") == SymbolTable::NO_VAR;
1531 return pass;
1532 }
1533
1534
1535 static UBool TestScanner(void){
1536 //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};";
1537 //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}",
1538 // "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"};
1539
1540 const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;";
1541 const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25", "%", ";"};
1542
1543 const char *str = str2;
1544 const char (*str_r)[20] = str2_r;
1545 int tokenNum = sizeof(str2_r)/sizeof(char[20]);
1546
1547 Scanner t(str);
1548 UBool pass = TRUE;
1549 t.getNextToken();
1550 int i = 0;
1551 while (pass){
1552 if (t.tokenType == STREAM_END){
1553 pass = pass? i == tokenNum : FALSE;
1554 break;//while
1555 } else if (t.tokenType == ERROR){
1556 pass = FALSE;
1557 break;//while
1558 } else {
1559 pass = strcmp( &(t.token[0]), str_r[i++]) == 0;
1560 t.getNextToken();
1561 }
1562 }
1563
1564 //const char ts[] = "$commandList = '['"
1565 //" ( alternate ' ' $alternateOptions"
1566 //" | backwards ' 2'"
1567 //" | normalization ' ' $onoff "
1568 //" | caseLevel ' ' $onoff "
1569 //" | hiraganaQ ' ' $onoff"
1570 //" | caseFirst ' ' $caseFirstOptions"
1571 //" | strength ' ' $strengthOptions"
1572 //" ) ']';" ;
1573
1574 //Scanner t2(ts);
1575 //pass = TRUE;
1576 //do {
1577 // t2.getNextToken();
1578 // if (t2.tokenType == ERROR){
1579 // DumpScanner(t2);
1580 // return FALSE;
1581 // }
1582 //}while (t.tokenType != STREAM_END);
1583
1584 return pass;
1585 }
1586
1587 class TestParserT {
1588 public:
1589 UBool operator () (const char *const str, const int exp_error_offset = -1, const UBool dump = TRUE){
1590 Parser par(str);
1591 if (par.rules()){
1592 if ( exp_error_offset == -1){
1593 return TRUE;
1594 }else {
1595 DumpScanner(par.s,dump);
1596 return FALSE;
1597 }
1598 }else {
1599 return DumpScanner(par.s, dump) == exp_error_offset;
1600 }
1601 }
1602 };
1603
1604 UBool TestParser(){
1605 TestParserT test;
1606
1607 UBool pass = TRUE;
1608 pass = pass && test ("$s = ' ' ? 50%;");
1609 pass = pass && test("$x = ($var {1,2}) 3%;"); // legal
1610 pass = pass && test("$x = $var {1,2} 3% | b 4%;"); // legal
1611 pass = pass && test("$x = $var {1,2} 3%;"); // legal
1612 pass = pass && test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal
1613 pass = pass && test("$a = b ? 2% | c 5%;"); // legal
1614 pass = pass && test("$x = A B 5% C 10% | D;", 8, FALSE); // illegal 5%
1615 pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc
1616 pass = pass && test("$x = (b 5%) (c 6%);"); // legal
1617 pass = pass && test("$x = (b 5%) c 6%;", 13, FALSE); // illegal 6%
1618 pass = pass && test("$x = b 5% (c 6%);", 9, FALSE); // illegal (c 6%)
1619 pass = pass && test("$x = b 5% c 6%;", 9, FALSE); // illegal c 6%
1620 pass = pass && test("$x = b 5%;"); // legal
1621 pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc
1622 pass = pass && test("$x = a | b | c 4% | d 5%;"); // legal
1623 pass = pass && test("$s = ' ' ? 50% abc;"); // legal
1624 pass = pass && test("$s = a | c d | e f;"); // legal
1625 pass = pass && test( "$z = q 0% | p 1% | r 100%;"); // legal How to check parsed tree??
1626
1627 pass = pass && test("$s = ' ' ? 50%;");
1628 pass = pass && test("$relationList = '<' | '<<' | ';' | '<<<' | ',' | '=';");
1629 pass = pass && test("$p1 = ($string $s '|' $s)? 25%;");
1630 pass = pass && test("$p2 = (\\\\ $s $string $s)? 25%;");
1631 pass = pass && test("$rel2 = $p1 $string $s $p2;");
1632 pass = pass && test("$relation = $relationList $s ($rel1 | $rel2) $crlf;");
1633 pass = pass && test("$command = $commandList $crlf;");
1634 pass = pass && test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;");
1635 pass = pass && test("$mostRules = $command 1% | $reset 5% | $relation 25%;");
1636 pass = pass && test("$root = $command{0,5} $reset $mostRules{1,20};");
1637
1638 const char collationBNF[] =
1639 "$s = ' '? 50%;"
1640 "$crlf = '\r\n';"
1641
1642 "$alternateOptions = non'-'ignorable | shifted;"
1643 "$onoff = on | off;"
1644 "$caseFirstOptions = off | upper | lower;"
1645 "$strengthOptions = '1' | '2' | '3' | '4' | 'I';"
1646 "$commandList = '['"
1647 " ( alternate ' ' $alternateOptions"
1648 " | backwards ' 2'"
1649 " | normalization ' ' $onoff "
1650 " | caseLevel ' ' $onoff "
1651 " | hiraganaQ ' ' $onoff"
1652 " | caseFirst ' ' $caseFirstOptions"
1653 " | strength ' ' $strengthOptions"
1654 " ) ']';"
1655 "$command = $commandList $crlf;"
1656
1657 "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;"
1658 "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;"
1659 "$positionList = '[' (first | last) ' ' $allTypes ']';"
1660
1661 "$beforeList = '[before ' ('1' | '2' | '3') ']';"
1662
1663 "$relationList = ("
1664 " '<'"
1665 " | '<<'"
1666 " | ';'"
1667 " | '<<<'"
1668 " | ','"
1669 " | '='"
1670 ");"
1671 "$string = $magic;"
1672 "$rel1 = '[variable top]' $s;"
1673 "$p1 = ($string $s '|' $s)? 25%;"
1674 "$p2 = (\\\\ $s $string $s)? 25%;"
1675 "$rel2 = $p1 $string $s $p2;"
1676 "$relation = $relationList $s ($rel1 | $rel2) $crlf;"
1677
1678 "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;"
1679 "$mostRules = $command 1% | $reset 5% | $relation 25%;"
1680 "$root = $command{0,5} $reset $mostRules{1,20};"
1681 ;
1682
1683 pass = pass && test(collationBNF);
1684
1685
1686 return pass;
1687 }
1688
1689 static UBool TestMorph(){
1690 srand((unsigned)time( NULL ));
1691
1692 Alternation * alt = new Alternation();
1693
1694 (*alt)
1695 .append(new Literal("a")).append(new Literal("b")).append(new Literal("c"))
1696 .append(new Literal("d")).append(new Literal("e")).append(new Literal("f"))
1697 .append(new Literal("g")).append(new Literal("h")).append(new Literal("i"))
1698 .append(new Literal("j")).append(new Literal("k")).append(new Literal("l"))
1699 .append(new Literal("m")).append(new Literal("n")).append(new Literal("o"))
1700 ;
1701
1702 Repeat * rep = new Repeat( alt ,5,5 );
1703 Morph m( *rep);
1704
1705 // DUMP_R(TestMorph,(*rep),20);
1706 DUMP_R(TestMorph,m,100);
1707
1708 return FALSE;
1709 }
1710
1711 #endif
1712
1713 static UBool TestLanguageGenerator(){
1714 //LanguageGenerator g;
1715 //const char *const s = "$s = p 0% | q 1%;";
1716 //g.parseBNF(s, "$s");
1717 UBool pass;
1718 //= strcmp("q", g.next()) == 0;
1719
1720 const char *const def =
1721 //"$a = $b;"
1722 //"$b = $c;"
1723 //"$c = $t;"
1724 //"$t = abc $z{1,2};"
1725 //"$k = a | b | c | d | e | f | g ;"
1726 //"$z = q 0% | p 1% | r 1%;"
1727 "$x = a ? 0%;"
1728 ; // end of string
1729 // const char * s = "abczz";
1730 //
1731 //
1732 LanguageGenerator g;
1733 pass = g.parseBNF(def, "$x",TRUE);
1734 //// LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode());
1735 //
1736 if (pass != LanguageGenerator::OK) return FALSE;
1737
1738 DUMP_R(TestLanguageGenerator, g, 20);
1739 return pass;
1740
1741 ////UBool pass = strcmp(s,r) == 0;
1742
1743 //if (pass){
1744 // printf("TestRandomLanguageGenerator passed.\n");
1745 //} else {
1746 // printf("TestRandomLanguageGenerator FAILED!!!\n");
1747 //}
1748 //return pass;
1749 }
1750
1751 void TestWbnf(void){
1752 srand((unsigned)time( NULL ));
1753
1754 //CALL(TestLiteral);
1755 //CALL(TestSequence);
1756 //CALL(TestSymbolTable);
1757 //CALL(TestVariable);
1758
1759 //TestRepeat();
1760 //TestAlternation();
1761 //TestMorph();
1762
1763 //TestQuote();
1764 //TestBuffer();
1765 //TestWeightedRand();
1766
1767 //CALL(TestScanner);
1768 //CALL(TestParser);
1769 CALL(TestLanguageGenerator);
1770 }
1771