]>
git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/wbnf.cpp
2 ******************************************************************************
3 * Copyright (C) 2005-2007, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
15 // Most of this code is meant to test the test code. It's a self test.
16 // Normally this isn't run.
17 #define TEST_WBNF_TEST 0
19 ///////////////////////////////////////////////////////////
21 // Constants and the most basic helper classes
// Character-class tables. Each one is a zero-terminated list of chars that
// isInList() scans linearly until it hits the wanted char or the terminator.

// Decimal digits.
static const char DIGIT_CHAR[] = "0123456789";

// Tab, space, carriage return, line feed (spelled as an explicit array so the
// terminating 0 is visible).
static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0};

// ASCII letters, lower case first.
static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

// ASCII punctuation / symbol characters.
static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
29 static inline UBool
isInList(const char c
/*in*/, const char list
[] /*in*/){
30 const char * p
= list
;
31 for (;*p
!= 0 && *p
!= c
; p
++);
34 static inline UBool
isDigit(char c
) {return isInList(c
, DIGIT_CHAR
);}
35 static inline UBool
isWhiteSpace(char c
) {return isInList(c
, WHITE_SPACE
);}
36 static inline UBool
isAlphabet(char c
) {return isInList(c
, ALPHABET
);}
37 static inline UBool
isSpecialAsciiChar(char c
) {return isInList(c
,SPECIAL
);}
41 ///////////////////////////////////////////////////////////
47 // Utility class, can be treated as an auto expanded array. no boundary check.
52 int buffer_size
; // size unit is byte
54 inline int content_size(){return current
- start
;} // size unit is byte
57 inline void expand(int add_size
= 100){ // size unit is byte
58 int new_size
= buffer_size
+ add_size
;
60 int cs_snap
= content_size();
61 start
= (byte
*) realloc(start
, new_size
); // may change the value of start
62 current
= start
+ cs_snap
;
64 memset(current
, 0, add_size
);
65 buffer_size
= new_size
;
68 inline void expand_to(int size
){
69 int r
= size
- buffer_size
;
71 expand(r
); // simply expand, no block alignment
74 Buffer_byte(const Buffer_byte
&);
75 Buffer_byte
& operator = (const Buffer_byte
&);
77 Buffer_byte():start(NULL
),current(start
),buffer_size(0){
85 start
!= NULL
? memset(start
, 0, buffer_size
) : 0;
89 // Using memory copy method to append a C array to buffer,
90 inline void append(const void * c
, int size
){ // size unit is byte
91 expand_to(content_size() + size
) ;
92 memcpy(current
, c
, size
);
93 current
= current
+ size
;
102 The class(es) try to work like a built-in array, so they overload these two operators
105 The first is used for automatic type conversion, the latter is used to select a member.
107 A small trick is that the class does not overload the address-of operator. This
108 behavior is different from a built-in array, but it gives us the opportunity
109 to get the address of the class itself.
111 //template<typename type>
113 // typedef BUFFER name;
114 #define BUFFER(type, name)\
119 name & reset() {buf.reset(); return *this;}\
120 name & append(type c) {buf.append(&c, sizeof(type)); return *this;}\
121 name & append_array(const type * p, int size) {buf.append(p, sizeof(type)*size); return *this;}\
122 type & operator [] (int i) { return ((type *) buf.buffer())[i];}\
123 operator type *(){return (type *) buf.buffer();} \
124 int content_size(){return buf.content_size() / sizeof(type);}\
129 /* The Pick is the basic language generator element*/
131 // generate a string according to the syntax
132 // Return a null-terminated c-string. The buffer is owned by callee.
133 virtual const char* next() = 0;
137 //typedef BUFFER<char> Buffer_char;
138 //typedef BUFFER<int> Buffer_int;
139 //typedef BUFFER<Pick *> Buffer_pPick;
140 BUFFER(char, Buffer_char
);
141 BUFFER(int, Buffer_int
);
142 BUFFER(Pick
*, Buffer_pPick
);
146 * It's a mapping table between 'variable name' and its 'active Pick object'
149 Buffer_char name_buffer
; // var names storage space
151 Buffer_int names
; // points to name (offset in name_buffer)
152 Buffer_pPick refs
; // points to Pick
154 int get_index(const char *const var_name
){
155 int len
= names
.content_size();
156 for (int i
=0; i
< len
; i
++){
157 if (strcmp(var_name
, name_buffer
+ names
[i
]) == 0){
// Outcome of looking a variable name up in the symbol table.
enum RESULT {
    EMPTY,    // the name passed in was NULL
    NO_VAR,   // new name: not present in the table
    NO_REF,   // existing name, but no Pick is linked to it yet
    HAS_REF   // existing name with a Pick linked to it
};
167 RESULT
find(const char *const var_name
/*[in] c-string*/, Pick
* * ref
= NULL
/*[out] Pick* */){
168 if (!var_name
) return EMPTY
; // NULL name
170 int i
= get_index(var_name
);
172 return NO_VAR
; // new name
174 if (!refs
[i
]){ // exist name, no ref
180 return HAS_REF
; // exist name, has ref
184 void put(const char *const var_name
, Pick
*const var_ref
= NULL
){
185 int i
= get_index(var_name
);
186 switch(find(var_name
)){
187 case EMPTY
: // NULL name
189 case NO_VAR
: // new name
191 offset
= name_buffer
.content_size();
192 name_buffer
.append_array(var_name
, strlen(var_name
) + 1);
193 names
.append(offset
);
194 refs
.append(var_ref
);
196 case NO_REF
: // exist name, no ref
197 refs
[i
] = var_ref
; // link definition with variable
199 case HAS_REF
: // exist name, has ref
211 int n
= names
.content_size();
212 for (int i
=0; i
<n
; ++i
){
213 if (refs
[i
] == NULL
){
224 // release memory here
225 int s
= refs
.content_size();
226 for (int i
=0; i
< s
; i
++){
227 delete refs
[i
]; // TOFIX: point alias/recursion problem
239 // Document of class Escaper
242 // From http://icu-project.org/userguide/Collate_Customization.html.
243 // We get the precedence of escape/quote operations
245 // (highest) 1. backslash \
246 // 2. two single quotes ''
249 // ICU Collation should accept following as the same string.
253 // 3) a'b'\c |- They are equal.
256 // From "two single quotes", we have following deductions
257 // D1. empty quoting is illegal. (obviously)
258 // D2. no contact operation between two quotings
259 // '.''.' is not .. it is .'.
260 // D3. "two single quotes" cannot contact two quoting simultaneously
261 // '..''''.' is not ..'. it is ..''.
263 // "two single quotes" can contact before one quoting
265 // "two single quotes" can literally contact after one quoting
266 // But, from syntax, it's one quoting including a "two single quotes"
268 // D4. "two single quotes" cannot solely be included in quoting
269 // '''' is not ' it is ''
270 // NOTICE: These are legal
277 // output buffer input buffer
279 // To make our decision (within an atom operation) without caring about the input and output buffer,
280 // following calling pattern (within an atom operation) shall be avoided
282 // P1 open_quoting() then close_quoting() (direct violation) D1
283 // P2 close_quoting() then open_quoting() (direct violation) D2
284 // P3 empty open_quoting() (indirect violation) D1, D4
285 // P4 empty close_quoting() (indirect violation) D2, D3
286 // P5 open_quoting() then two single quotes (indirect violation) D4
287 // P6 close_quoting() then two single quotes (indirect violation) D3
289 // two single quotes escaping will not open_ or close_ quoting()
290 // The choice will not lose some quoting forms.
292 // For open_quoting(),
293 // we may get this form quoting ''' P5
294 // It may raise a bug ''''x
296 // '''.' let the next char open the quoting
297 // '.''.' the quoting is already opened by preceding char
299 // For close_quoting()
300 // we will get this form quoting '.''' P6
301 // It may raise a bug '.''''.'
303 // '.'''\. let the next char close the quoting
304 // '.''''.' the expectation is wrong! using '.'\''.' instead
306 // It's a hard work to re-adjust generation opportunity for various escaping form.
307 // We just simply ignore it.
// Three-state switch consumed by the Bool wrapper: YES and NO are fixed
// answers, RAND makes Bool draw a fresh rand()%2 each time it is converted.
enum CHOICE {
    YES,
    NO,
    RAND
};
// Which escaping mechanism(s) the Escaper may use.
// The first three values must stay numbered 0..2: RAND_ESC is resolved with
// (ESCAPE_FORM)(rand()%3), which relies on that numbering.
// NOTE: "QUOTE_AND_BSLAH" is misspelled, but the name is referenced elsewhere
// in this file, so it is kept as-is.
enum ESCAPE_FORM {
    BSLASH_ONLY,      // presumably backslash escaping only -- verify against the switch that sets bslash_escape/quote_escape
    QUOTE_ONLY,       // presumably single-quote escaping only -- verify as above
    QUOTE_AND_BSLAH,  // both backslash and quote escaping enabled
    RAND_ESC          // pick one of the three forms above at random
};
314 class Bool
{ // A wrapper class for CHOICE, to auto adapter UBool class
318 Bool(CHOICE flag
=RAND
):tag(flag
){}
319 operator UBool() { // conversion operator
320 return tag
== RAND
? rand()%2
: tag
== YES
;
322 // return rand()%2 == 1;
324 // return tag == YES ? TRUE : FALSE;
329 Escaper(CHOICE escapeLiteral
= RAND
,
330 CHOICE twoQuotesEscape
= RAND
,
331 ESCAPE_FORM escapeForm
= RAND_ESC
):
332 escape_form(escapeForm
),
333 escape_literal(escapeLiteral
),
334 two_quotes_escape(twoQuotesEscape
),
338 ESCAPE_FORM escape_form
;
340 Bool two_quotes_escape
;
346 ESCAPE_FORM t
= escape_form
== RAND_ESC
? (ESCAPE_FORM
) (rand()%3
) : escape_form
;
349 bslash_escape
= TRUE
; quote_escape
= FALSE
; break;
351 bslash_escape
= FALSE
;quote_escape
= TRUE
; break;
352 case QUOTE_AND_BSLAH
:
353 bslash_escape
= TRUE
; quote_escape
= TRUE
; break;
364 inline void open_quoting(){
372 inline void close_quoting(){
381 // str [in] null-terminated c-string
382 void append(const char * strToAppend
){
383 for(;*strToAppend
!= 0; strToAppend
++){
384 append(*strToAppend
);
388 inline void append(const char c
){
392 quote_escape
? open_quoting() : close_quoting();
393 //bslash_escape always true here
396 } else if (c
== '\''){
397 if (two_quotes_escape
){ // quoted using two single quotes
398 // See documents in anonymous.design
402 quote_escape
? open_quoting() : close_quoting();
403 //bslash_escape always true here
407 } else if (isSpecialAsciiChar(c
) || isWhiteSpace(c
)){
408 quote_escape
? open_quoting() : close_quoting();
409 if (bslash_escape
) str
.append('\\');
411 } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal
413 quote_escape
? open_quoting() : close_quoting();
414 if (bslash_escape
) str
.append('\\');
424 // Return a null-terminate c-string. The buffer is owned by callee.
425 char * operator()(const char * literal
/*c-string*/){
427 for(;*literal
!= 0; literal
++){
430 close_quoting(); // P4 exception, to close whole quoting
436 // Return a random number in [0, size)
437 // Every number has different chance (aka weight) to be selected.
441 WeightedRand(const WeightedRand
&);
442 WeightedRand
& operator = (const WeightedRand
&);
444 WeightedRand(Buffer_int
* weight_list
= NULL
, int size
= 0){
445 if ( weight_list
== NULL
){
446 for (int i
=0; i
<size
; ++i
) weights
.append(DEFAULT_WEIGHT
);
448 int s
= weight_list
->content_size();
450 weights
.append_array( (*weight_list
),s
);
451 for (int i
=s
; i
<size
; ++i
) weights
.append(DEFAULT_WEIGHT
);
452 } else { // s >= size
453 weights
.append_array( (*weight_list
),size
);
457 int c
= weights
.content_size();
458 for (int i
=0; i
<c
; ++i
){
463 void append(int weight
){
464 weights
.append(weight
);
468 // Give a random number with the consideration of weight.
469 // Every random number is associated with a weight.
470 // It identifies the chance to be selected,
471 // larger weight has more chance to be selected.
474 // ______________________ every slot has equal chance
476 // [____][_][___][______] each item has different slots, hence different chance
479 // The algorithms to generate the number is illustrated by preceding figure.
480 // First, a slot is selected by rand(). Then we translate the slot to corresponding item.
483 // get a random in [0,1]
484 double reference_mark
= (double)rand() / (double)RAND_MAX
;
486 // get the slot's index, 0 <= mark <= total;
487 double mark
= total
* reference_mark
;
489 // translate the slot to corresponding item
492 mark
-= weights
[i
]; // 0 <= mark <= total
501 ///////////////////////////////////////////////////////////
503 // The parser result nodes
506 class Literal
: public Pick
{
508 virtual const char* next(){
511 Literal(const char * s
/*c-string*/){
512 str
.append_array(s
, strlen(s
) + 1);
515 Buffer_char str
; //null-terminated c-string
518 class Variable
: public Pick
{
520 Variable(SymbolTable
* symbols
, const char * varName
, Pick
* varRef
= NULL
){
521 this->var_name
.append_array(varName
, strlen(varName
) + 1);
522 if ((symbol_table
= symbols
)){
523 symbol_table
->put(varName
, varRef
);
527 operator const char *(){
531 virtual const char* next(){
533 Pick
* var_ref
= NULL
;
534 symbol_table
->find(var_name
, &var_ref
);
536 return var_ref
->next();
539 return ""; // dumb string
542 Buffer_char var_name
;
543 SymbolTable
* symbol_table
;
546 class Quote
: public Pick
{
548 Quote(Pick
& base
):item(base
),e(Escaper::NO
, Escaper::NO
, Escaper::BSLASH_ONLY
){
550 virtual const char* next(){
551 return e(item
.next());
560 class Morph
: public Pick
{
562 The difference between morph and an arbitrary random string is that
563 a morph changes slowly. When we build collation rules, for example,
564 it is a much better test if the strings we use are all in the same
565 'neighborhood'; they share many common characters.
568 Morph(Pick
& base
):item(base
){}
570 virtual const char* next(){
572 const char * s
= item
.next();
573 current
.append_array(s
, strlen(s
) + 1);
574 if (last
.content_size() == 0) {
577 str
.append_array(current
, current
.content_size());
578 last
.append_array(current
, current
.content_size());
607 // copy 0, 1, or 2 character(s) to str
609 static WeightedRand
wr(& Buffer_int().append(DEFAULT_WEIGHT
* 10), 5);
612 case 0: // copy last -- has 10 times chance than others
626 case 4: // copy nothing
635 int min
= strlen(last
);
636 int max
= strlen(current
);
643 int len
= min
+ rand()%(max
- min
+ 1); // min + [0, diff]
648 for (; str
.content_size()<len
&& *p_curr
&& *p_last
;){
649 copy(); // copy 0, 1, or 2 character(s) to str
652 if (str
.content_size() == len
) {
658 if (str
.content_size() > len
) { // if the last copy copied two characters
664 // str.content_size() < len
666 for (; str
.content_size() < len
; copy_last());
668 for (; str
.content_size() < len
; copy_curr());
671 int last_len
= last
.content_size();
672 for (;str
.content_size() < len
;){
673 str
.append(last
[rand()%last_len
]);
681 last
.append_array(current
, current
.content_size());
685 class Sequence
: public Pick
{
687 virtual const char* next(){
689 int s
= items
.content_size();
690 for(int i
=0; i
< s
; i
++){
691 const char * t
= items
[i
]->next();
692 str
.append_array(t
, strlen(t
));
694 str
.append(0); // terminal null
698 void append (Pick
* node
){
703 int s
= items
.content_size();
704 for(int i
=0; i
< s
; i
++){
705 //How can assure the item is got from heap?
707 delete items
[i
]; // TOFIX: point alias/recursion problem
713 Buffer_char str
; //null-terminated c-string
716 class Repeat
: public Pick
{
723 int select_a_count(){
724 return min
+ wr
.next();
727 virtual const char* next(){
729 int c
= select_a_count();
730 for(int i
=0; i
< c
; i
++){
731 const char * t
= item
->next();
732 str
.append_array(t
, strlen(t
));
738 Repeat(Pick
* base
, int minCount
=0, int maxCount
= 1, Buffer_int
* weights
= NULL
):
739 wr(weights
, maxCount
-minCount
+1) {
741 this->min
= minCount
;
742 this->max
= maxCount
;
745 delete item
; // TOFIX: point alias/recursion problem
751 class Alternation
: public Pick
{
753 virtual const char* next(){
756 const char * t
= items
[i
]->next();
757 str
.append_array(t
, strlen(t
) + 1);
760 virtual ~Alternation(){
761 int s
= items
.content_size();
762 for(int i
=0; i
< s
; i
++){
763 delete items
[i
]; // TOFIX: point alias/recursion problem
768 Alternation
& append (Pick
* node
, int weight
= DEFAULT_WEIGHT
){
775 Buffer_char str
; // null-terminated c-string
779 ///////////////////////////////////////////////////////////
// Token kinds produced by the scanner. The punctuation kinds correspond
// one-to-one to the single characters handled in getNextToken()'s START state.
enum TokenType {
    STRING,      // literal text (bare, quoted or backslash-escaped)
    VAR,         // variable name introduced by '$'
    NUMBER,      // run of decimal digits
    STREAM_END,  // terminating 0 of the source string
    ERROR,       // unrecognised input
    QUESTION,    // '?'
    STAR,        // '*'
    PLUS,        // '+'
    LBRACE,      // '{'
    RBRACE,      // '}'
    LPAR,        // '('
    RPAR,        // ')'
    SEMI,        // ';'
    EQ,          // '='
    COMMA,       // ','
    BAR,         // '|'
    AT,          // '@'
    WAVE,        // '~'
    PERCENT      // '%'
};
787 friend int DumpScanner(Scanner
& s
, UBool dumb
);
790 const char * working
;
791 const char * history
; // for debug
// States of the scanner's state machine; getNextToken() loops until DONE.
enum StateType {
    START,            // between tokens
    IN_NUM,           // entered on a digit
    IN_VAR_FIRST,     // entered on '$'
    IN_VAR,           // NOTE(review): presumably the rest of a variable name -- transition not visible here
    IN_QUOTE,         // entered on an opening single quote
    IN_QUOTE_BSLASH,  // entered on a backslash while in IN_QUOTE
    IN_BSLASH,        // entered on a backslash outside quotes
    IN_STRING,        // entered on a letter, or after a quote/escape ends
    DONE              // token finished; ends the scanning loop
};
794 void terminated(TokenType t
){
795 working
--; // return the peeked character
797 token
.append(0); // close buffer
801 // the buffer of "source" is owned by caller
802 Scanner(const char *src
/*[in] c-string*/ = NULL
):source(src
){
809 //void setSource(const char *const src /*[in] c-string*/){
810 // *(&const_cast<const char *>(source)) = src;
816 TokenType
getNextToken(){
819 history
= working
; // for debug
820 while (state
!= DONE
){
822 if (c
== 0 && state
!= START
){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE
830 case '?' : tokenType
= QUESTION
; break;
831 case '*' : tokenType
= STAR
; break;
832 case '+' : tokenType
= PLUS
; break;
833 case '{' : tokenType
= LBRACE
; break;
834 case '}' : tokenType
= RBRACE
; break;
835 case '(' : tokenType
= LPAR
; break;
836 case ')' : tokenType
= RPAR
; break;
837 case ';' : tokenType
= SEMI
; break;
838 case '=' : tokenType
= EQ
; break;
839 case ',' : tokenType
= COMMA
; break;
840 case '|' : tokenType
= BAR
; break;
841 case '@' : tokenType
= AT
; break;
842 case '~' : tokenType
= WAVE
; break;
843 case '%' : tokenType
= PERCENT
; break;
844 case 0 : tokenType
= STREAM_END
; working
-- /*avoid buffer overflow*/; break;
846 if (tokenType
!= ERROR
){
853 case '$' : state
= IN_VAR_FIRST
; token
.append(c
); break;
854 case '\'' : state
= IN_QUOTE
; break;
855 case '\\' : state
= IN_BSLASH
; break;
857 if (isWhiteSpace(c
)){ // state = START; //do nothing
858 } else if (isDigit(c
)){ state
= IN_NUM
; token
.append(c
);
859 } else if (isAlphabet(c
)){ state
= IN_STRING
; token
.append(c
);
860 } else {terminated(ERROR
);}
877 break; // IN_VAR_FISRT
879 if (isAlphabet(c
) || isDigit(c
)){
886 // About the scanner's behavior for STRING, AT, and ESCAPE:
887 // All of them can be contacted with each other.
888 // This means the scanner will eat up as much as possible strings
889 // (STRING, AT, and ESCAPE) at one time, with no regard of their
890 // combining sequence.
893 state
= IN_QUOTE
; // the first time we see single quote
894 } else if (c
=='\\'){ // back slash character
896 } else if (isAlphabet(c
) || isDigit(c
)){
903 if (c
== '\''){ // the second time we see single quote
904 state
= IN_STRING
; // see document in IN_STRING
905 } else if ( c
== '\\') { // backslah escape in quote
906 state
= IN_QUOTE_BSLASH
;
908 token
.append(c
); // eat up everything, includes back slash
911 case IN_QUOTE_BSLASH
:
914 case 'n' : token
.append('\n'); break;
915 case 'r' : token
.append('\r'); break;
916 case 't' : token
.append('\t'); break;
917 case '\'' : token
.append('\''); break;
918 case '\\' : token
.append('\\'); break;
919 default: token
.append(c
); // unknown escaping, treat it as literal
921 if (state
== IN_BSLASH
){
922 state
= IN_STRING
; // see document in IN_STRING
923 } else { // state == IN_QUOTE_BSLASH
927 case DONE
: /* should never happen */
934 }//while (state != DONE)
941 friend UBool
TestParser();
942 friend class TestParserT
;
943 friend class LanguageGenerator_impl
;
947 int min_max
; // for the evil infinite
949 UBool
match(TokenType expected
){
950 if (token
== expected
) {
951 token
= s
.getNextToken();
954 //s.dumpCurrentPoint();
959 UBool
weight(int & value
){
960 if (token
== NUMBER
){
961 int temp
= atoi(s
.token
);
971 UBool
repeat (Pick
* &node
/*in,out*/){
972 if (node
== NULL
) return FALSE
;
977 UBool question
= FALSE
;
1000 if (token
!= NUMBER
){
1003 min
= atoi(s
.token
);
1005 if (token
== RBRACE
){
1009 } else if (token
== COMMA
) {
1011 if (token
== RBRACE
){
1015 } else if (token
== NUMBER
) {
1016 max
= atoi(s
.token
);
1018 count
= max
- min
+ 1;
1019 if (!match(RBRACE
)) {
1034 if (count
== -2 || min
== -2 || max
== -2){
1039 // eat up following weights
1046 // for the evil infinite
1047 min_max
= min_max
> min
? min_max
: min
;
1048 min_max
= min_max
> max
? min_max
: max
;
1049 if (min_max
> PSEUDO_INFINIT
){
1050 return FALSE
; // PSEUDO_INFINIT is less than the real maximum
1052 if (max
== -1){ // the evil infinite
1053 max
= PSEUDO_INFINIT
;
1055 // for the strange question mark
1056 if (question
&& weights
.content_size() > 0){
1058 w2
.append(DEFAULT_WEIGHT
- weights
[0]).append(weights
[0]);
1059 node
= new Repeat(node
,min
,max
,&w2
);
1062 node
= new Repeat(node
,min
,max
,&weights
);
1066 UBool
core(Pick
* &node
/*out*/){
1067 if (node
!= NULL
) return FALSE
; //assert node == NULL
1072 if(defination(node
) && match(RPAR
)){
1077 node
= new Variable(&symbols
, s
.token
);
1081 node
= new Literal(s
.token
);
1088 UBool
modified(Pick
* &node
/*out*/){
1089 if (node
!= NULL
) return FALSE
; //assert node == NULL
1099 node
= new Morph(*node
);
1103 node
= new Quote(*node
);
1109 if (!repeat(node
)) return FALSE
;
1111 case SEMI
: // rule definiation closed
1112 case RPAR
: // within parenthesis (core closed)
1113 case BAR
: // in alternation
1114 case NUMBER
: // in alternation, with weight
1115 case LPAR
: // in sequence
1116 case VAR
: // in sequence
1117 case STRING
: // in sequence
1126 UBool
sequence_list(Pick
* &node
/*in,out*/){
1127 if (node
== NULL
) return FALSE
; // assert node != NULL
1129 Sequence
* seq
= new Sequence();
1132 while (token
== VAR
|| token
== STRING
|| token
== LPAR
){
1142 if (token
== SEMI
|| token
== RPAR
|| token
== BAR
){
1153 UBool
sequence(Pick
* &node
/*out*/){
1154 if (node
!= NULL
) return FALSE
; //assert node == NULL
1156 if (!modified(node
)) {
1160 if (token
== VAR
|| token
== STRING
|| token
== LPAR
){
1161 return sequence_list(node
);
1163 return TRUE
; // just a modified
1167 UBool
alternation_list(Pick
* &node
/*in,out*/){
1168 if (node
== NULL
) return FALSE
; // assert node != NULL
1170 Alternation
* alt
= new Alternation();
1172 int w
= DEFAULT_WEIGHT
;
1174 while (token
== NUMBER
|| token
== BAR
){
1175 if(token
== NUMBER
) {
1178 // the middle item, go on
1180 // the last item or encounter error
1186 } // else token == BAR
1199 if (token
== SEMI
|| token
== RPAR
) {
1209 UBool
alternation(Pick
* &node
/*out*/){
1210 if (node
!= NULL
) return FALSE
; //assert node == NULL
1212 // 'sequence' has higher precedence than 'alternation'
1213 if (!sequence(node
)){
1217 if (token
== BAR
|| token
== NUMBER
){ // find a real alternation1, create it.
1218 return alternation_list(node
);
1220 return TRUE
; // just a sequence_old
1225 UBool
defination(Pick
* &node
/*out*/){
1226 if (node
!= NULL
) return FALSE
; //assert node == NULL
1227 return alternation(node
);
1233 name
.append_array(s
.token
, strlen(s
.token
) + 1);
1239 symbols
.put(name
, t
);
1249 token
= s
.getNextToken();
1252 if (token
== STREAM_END
){
1255 //s.dumpCurrentPoint();
1261 SymbolTable symbols
;
1263 Parser(const char *const source
):s(source
), token(s
.tokenType
){
1273 ///////////////////////////////////////////////////////////
1278 int DumpScanner(Scanner
& s
, UBool dump
= TRUE
){
1279 int len
= strlen(s
.source
);
1280 int error_start_offset
= s
.history
- s
.source
;
1282 printf("\n=================== DumpScanner ================\n");
1283 fwrite(s
.source
, len
, 1, stdout
);
1284 printf("\n-----parsed-------------------------------------\n");
1285 fwrite(s
.source
, s
.history
- s
.source
, 1, stdout
);
1286 printf("\n-----current------------------------------------\n");
1287 fwrite(s
.history
, s
.working
- s
.history
, 1, stdout
);
1288 printf("\n-----unparsed-----------------------------------\n");
1289 fwrite(s
.working
, (s
.source
+ len
- s
.working
), 1, stdout
);
1290 printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");
1292 return error_start_offset
;
1295 class LanguageGenerator_impl
{
1297 LanguageGenerator_impl(const char *const bnf_definition
, const char *const top_node
)
1298 :par(bnf_definition
), top_node_name(top_node
){
1299 srand((unsigned)time( NULL
));
1302 LanguageGenerator::PARSE_RESULT
parseBNF(UBool debug
= TRUE
){
1304 if (par
.symbols
.find(top_node_name
, &top_node_ref
) == SymbolTable::HAS_REF
) {
1305 if (par
.symbols
.is_complete()) {
1306 return LanguageGenerator::OK
;
1308 if (debug
) printf("The bnf definition is incomplete.\n");
1309 return LanguageGenerator::INCOMPLETE
;
1312 if (debug
) printf("No top node is found.\n");
1313 return LanguageGenerator::NO_TOP_NODE
;
1317 printf("The bnf definition is wrong\n");
1318 DumpScanner(par
.s
, TRUE
);
1320 return LanguageGenerator::BNF_DEF_WRONG
;
1323 const char * next(){
1324 return top_node_ref
->next();
1329 const char *const top_node_name
;
1330 Pick
* top_node_ref
;
1333 LanguageGenerator::LanguageGenerator():lang_gen(NULL
){
1336 LanguageGenerator::~LanguageGenerator(){
1340 LanguageGenerator::PARSE_RESULT
LanguageGenerator::parseBNF(const char *const bnf_definition
/*in*/, const char *const top_node
/*in*/, UBool debug
){
1344 lang_gen
= new LanguageGenerator_impl(bnf_definition
, top_node
);
1345 PARSE_RESULT r
= lang_gen
->parseBNF(debug
);
1354 const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The buffer is owned by callee.
1356 return lang_gen
->next();
1362 ///////////////////////////////////////////////////////////
1364 // The test code for WBNF
1369 printf("Pass: " #fun "\n");\
1371 printf("FAILED: !!! " #fun " !!!\n"); \
1374 #define DUMP_R(fun, var, times) \
1375 {printf("\n========= " #fun " =============\n"); \
1376 for (int i=0; i<times; i++) { \
1377 const char * t = var.next();\
1378 fwrite(t,strlen(t),1,stdout); \
1381 printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");}
1386 static UBool
TestQuote(){
1387 const char *const str
= "This ' A !,z| qq [] .new\tline";
1388 //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline";
1390 //// :( we must quote our string to following C syntax
1391 //// cannot type the literal here, it makes our code rather human unreadable
1392 //// very very unconformable!
1397 //const char *const s1 = "ab'c";
1398 //const char (* s1_r1) [] = { "ab''c", // ab''c
1399 // "ab\\'c", // ab\'c
1403 // .. \.\. '.'\. '.'\. '..' // '.''.' wrong
1406 //const char *const s2 = "a..'.b"; // a..'.b
1407 //const char (*s2_r) [] = { "a'..''.'b" // a'..''.'b
1408 // ,"a'..\\'.'b" // a'..\'.'b
1409 // ,"a'..'\\''.'b" // a'..'\''.'b
1412 //const char *const s3 = "a..\\.b"; // a..\.b
1413 //const char (*s3_r) [] = { "a'..\\\\.'b" // a'..\\.'b
1414 // ,"a'..'\\\\'.'b" // a'..'\\'.'b
1417 // // no catact operation, no choice, must be compact
1419 srand((unsigned)time( NULL
));
1421 //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC);
1422 Pick
*p
= new Literal(str
);
1425 DUMP_R(TestQuote
, (*p
), 1);
1426 DUMP_R(TestQuote
, q
, 20);
1429 static UBool
TestLiteral(){
1430 const char * s
= "test string99.";
1432 const char * r
= n
.next();
1433 return strcmp(s
,r
) == 0;
1436 static UBool
TestSequence(){
1438 seq
.append(new Literal("abc "));
1439 seq
.append(new Literal(", s"));
1441 return strcmp(seq
.next(), "abc , s") == 0;
1443 static UBool
TestAlternation(){
1444 srand((unsigned)time( NULL
));
1446 alt
.append(new Literal("aaa_10%"),10);
1447 alt
.append(new Literal("bbb_0%"),0);
1448 alt
.append(new Literal("ccc_10%"),10);
1449 alt
.append(new Literal("ddddddd_50%"),50);
1451 DUMP_R(TestAlternation
, alt
, 50);
1456 static UBool
TestBuffer(){
1458 t
.append(1).append(0).append(5);
1459 int s
= t
.content_size();
1460 for (int i
=0; i
<s
; ++i
){
1461 printf("%d\n", t
[i
]);
1466 static UBool
TestWeightedRand(){
1467 srand((unsigned)time( NULL
));
1469 t
.append(1).append(0).append(5);
1470 WeightedRand
wr(&Buffer_int().append(10).append(0).append(50),4);
1471 // WeightedRand wr(&t,3);
1472 for (int i
=0; i
< 50; ++i
){
1473 printf("%d\n", wr
.next());
1478 static UBool
TestRepeat(){
1479 srand((unsigned)time( NULL
));
1480 Repeat
rep(new Literal("aaa1-5 "), 1, 5);
1481 DUMP_R(TestRepeat
, rep
, 50);
1483 Repeat
r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append(0).append(5));
1484 DUMP_R(TestRepeat
, r2
, 50);
1486 Repeat
r3(new Literal("aaa5-5 "), 5, 5);
1487 DUMP_R(TestRepeat
, r3
, 50);
1492 static UBool
TestVariable(){
1494 Pick
* value
= new Literal("string1");
1495 Variable
var1(&tab
, "x", value
);
1497 Variable
var2(&tab
, "y");
1498 // tab.put(var2, value); // TOFIX: point alias/recursion problem
1499 Pick
* value2
= new Literal("string2");
1500 tab
.put(var2
, value2
);
1502 Pick
* value3
= new Literal("string3");
1503 Variable
var3(&tab
, "z");
1504 tab
.put("z", value3
);
1507 pass
= strcmp(var1
.next(), value
->next()) == 0;
1508 pass
= pass
&& strcmp(var2
.next(), value2
->next()) == 0;
1509 pass
= pass
&& strcmp(var3
.next(), value3
->next()) == 0;
1513 static UBool
TestSymbolTable(){
1514 Literal
* n1
= new Literal("string1");
1515 Literal
* n2
= new Literal("string2");
1519 // t.put("alias", n1); // TOFIX: point alias/recursion problem
1523 pass
= t
.find(NULL
) == SymbolTable::EMPTY
;
1524 pass
= pass
&& t
.find("ccc") == SymbolTable::NO_VAR
;
1525 pass
= pass
&& t
.find("bbb") == SymbolTable::NO_REF
;
1526 pass
= pass
&& t
.find("abc") == SymbolTable::HAS_REF
;
1527 pass
= pass
&& t
.find("$aaa") == SymbolTable::HAS_REF
;
1530 pass
= pass
&& t
.find("abc") == SymbolTable::NO_VAR
;
1535 static UBool
TestScanner(void){
1536 //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};";
1537 //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}",
1538 // "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"};
1540 const char str2
[] = "$p2 =(\\\\ $s $string $s)? 25%;";
1541 const char str2_r
[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25", "%", ";"};
1543 const char *str
= str2
;
1544 const char (*str_r
)[20] = str2_r
;
1545 int tokenNum
= sizeof(str2_r
)/sizeof(char[20]);
1552 if (t
.tokenType
== STREAM_END
){
1553 pass
= pass
? i
== tokenNum
: FALSE
;
1555 } else if (t
.tokenType
== ERROR
){
1559 pass
= strcmp( &(t
.token
[0]), str_r
[i
++]) == 0;
1564 //const char ts[] = "$commandList = '['"
1565 //" ( alternate ' ' $alternateOptions"
1566 //" | backwards ' 2'"
1567 //" | normalization ' ' $onoff "
1568 //" | caseLevel ' ' $onoff "
1569 //" | hiraganaQ ' ' $onoff"
1570 //" | caseFirst ' ' $caseFirstOptions"
1571 //" | strength ' ' $strengthOptions"
1577 // t2.getNextToken();
1578 // if (t2.tokenType == ERROR){
1582 //}while (t.tokenType != STREAM_END);
1589 UBool
operator () (const char *const str
, const int exp_error_offset
= -1, const UBool dump
= TRUE
){
1592 if ( exp_error_offset
== -1){
1595 DumpScanner(par
.s
,dump
);
1599 return DumpScanner(par
.s
, dump
) == exp_error_offset
;
1608 pass
= pass
&& test ("$s = ' ' ? 50%;");
1609 pass
= pass
&& test("$x = ($var {1,2}) 3%;"); // legal
1610 pass
= pass
&& test("$x = $var {1,2} 3% | b 4%;"); // legal
1611 pass
= pass
&& test("$x = $var {1,2} 3%;"); // legal
1612 pass
= pass
&& test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal
1613 pass
= pass
&& test("$a = b ? 2% | c 5%;"); // legal
1614 pass
= pass
&& test("$x = A B 5% C 10% | D;", 8, FALSE
); // illegal 5%
1615 pass
= pass
&& test("$x = aa 45% | bb 5% cc;", 19, FALSE
);// illegal cc
1616 pass
= pass
&& test("$x = (b 5%) (c 6%);"); // legal
1617 pass
= pass
&& test("$x = (b 5%) c 6%;", 13, FALSE
); // illegal 6%
1618 pass
= pass
&& test("$x = b 5% (c 6%);", 9, FALSE
); // illegal (c 6%)
1619 pass
= pass
&& test("$x = b 5% c 6%;", 9, FALSE
); // illegal c 6%
1620 pass
= pass
&& test("$x = b 5%;"); // legal
1621 pass
= pass
&& test("$x = aa 45% | bb 5% cc;", 19, FALSE
);// illegal cc
1622 pass
= pass
&& test("$x = a | b | c 4% | d 5%;"); // legal
1623 pass
= pass
&& test("$s = ' ' ? 50% abc;"); // legal
1624 pass
= pass
&& test("$s = a | c d | e f;"); // legal
1625 pass
= pass
&& test( "$z = q 0% | p 1% | r 100%;"); // legal How to check parsed tree??
1627 pass
= pass
&& test("$s = ' ' ? 50%;");
1628 pass
= pass
&& test("$relationList = '<' | '<<' | ';' | '<<<' | ',' | '=';");
1629 pass
= pass
&& test("$p1 = ($string $s '|' $s)? 25%;");
1630 pass
= pass
&& test("$p2 = (\\\\ $s $string $s)? 25%;");
1631 pass
= pass
&& test("$rel2 = $p1 $string $s $p2;");
1632 pass
= pass
&& test("$relation = $relationList $s ($rel1 | $rel2) $crlf;");
1633 pass
= pass
&& test("$command = $commandList $crlf;");
1634 pass
= pass
&& test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;");
1635 pass
= pass
&& test("$mostRules = $command 1% | $reset 5% | $relation 25%;");
1636 pass
= pass
&& test("$root = $command{0,5} $reset $mostRules{1,20};");
1638 const char collationBNF
[] =
1642 "$alternateOptions = non'-'ignorable | shifted;"
1643 "$onoff = on | off;"
1644 "$caseFirstOptions = off | upper | lower;"
1645 "$strengthOptions = '1' | '2' | '3' | '4' | 'I';"
1646 "$commandList = '['"
1647 " ( alternate ' ' $alternateOptions"
1649 " | normalization ' ' $onoff "
1650 " | caseLevel ' ' $onoff "
1651 " | hiraganaQ ' ' $onoff"
1652 " | caseFirst ' ' $caseFirstOptions"
1653 " | strength ' ' $strengthOptions"
1655 "$command = $commandList $crlf;"
1657 "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;"
1658 "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;"
1659 "$positionList = '[' (first | last) ' ' $allTypes ']';"
1661 "$beforeList = '[before ' ('1' | '2' | '3') ']';"
1672 "$rel1 = '[variable top]' $s;"
1673 "$p1 = ($string $s '|' $s)? 25%;"
1674 "$p2 = (\\\\ $s $string $s)? 25%;"
1675 "$rel2 = $p1 $string $s $p2;"
1676 "$relation = $relationList $s ($rel1 | $rel2) $crlf;"
1678 "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;"
1679 "$mostRules = $command 1% | $reset 5% | $relation 25%;"
1680 "$root = $command{0,5} $reset $mostRules{1,20};"
1683 pass
= pass
&& test(collationBNF
);
1689 static UBool
TestMorph(){
1690 srand((unsigned)time( NULL
));
1692 Alternation
* alt
= new Alternation();
1695 .append(new Literal("a")).append(new Literal("b")).append(new Literal("c"))
1696 .append(new Literal("d")).append(new Literal("e")).append(new Literal("f"))
1697 .append(new Literal("g")).append(new Literal("h")).append(new Literal("i"))
1698 .append(new Literal("j")).append(new Literal("k")).append(new Literal("l"))
1699 .append(new Literal("m")).append(new Literal("n")).append(new Literal("o"))
1702 Repeat
* rep
= new Repeat( alt
,5,5 );
1705 // DUMP_R(TestMorph,(*rep),20);
1706 DUMP_R(TestMorph
,m
,100);
1713 static UBool
TestLanguageGenerator(){
1714 //LanguageGenerator g;
1715 //const char *const s = "$s = p 0% | q 1%;";
1716 //g.parseBNF(s, "$s");
1718 //= strcmp("q", g.next()) == 0;
1720 const char *const def
=
1724 //"$t = abc $z{1,2};"
1725 //"$k = a | b | c | d | e | f | g ;"
1726 //"$z = q 0% | p 1% | r 1%;"
1729 // const char * s = "abczz";
1732 LanguageGenerator g
;
1733 pass
= g
.parseBNF(def
, "$x",TRUE
);
1734 //// LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode());
1736 if (pass
!= LanguageGenerator::OK
) return FALSE
;
1738 DUMP_R(TestLanguageGenerator
, g
, 20);
1741 ////UBool pass = strcmp(s,r) == 0;
1744 // printf("TestRandomLanguageGenerator passed.\n");
1746 // printf("TestRandomLanguageGenerator FAILED!!!\n");
1751 void TestWbnf(void){
1752 srand((unsigned)time( NULL
));
1754 //CALL(TestLiteral);
1755 //CALL(TestSequence);
1756 //CALL(TestSymbolTable);
1757 //CALL(TestVariable);
1760 //TestAlternation();
1765 //TestWeightedRand();
1767 //CALL(TestScanner);
1769 CALL(TestLanguageGenerator
);