]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ****************************************************************************** | |
46f4442e | 3 | * Copyright (C) 2005-2007, International Business Machines Corporation and * |
73c04bcf A |
4 | * others. All Rights Reserved. * |
5 | ****************************************************************************** | |
6 | */ | |
7 | ||
8 | #include <stdio.h> | |
9 | #include <string.h> | |
10 | #include <stdlib.h> | |
11 | #include <time.h> | |
12 | ||
13 | #include "wbnf.h" | |
14 | ||
15 | // Most of this code is meant to test the test code. It's a self test. | |
16 | // Normally this isn't run. | |
17 | #define TEST_WBNF_TEST 0 | |
18 | ||
19 | /////////////////////////////////////////////////////////// | |
20 | // | |
21 | // Constants and the most basic helper classes | |
22 | // | |
23 | ||
24 | static const char DIGIT_CHAR[] = "0123456789"; | |
25 | static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0}; | |
26 | static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; | |
27 | static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; | |
28 | ||
29 | static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){ | |
30 | const char * p = list; | |
31 | for (;*p != 0 && *p != c; p++); | |
32 | return *p?TRUE:FALSE; | |
33 | } | |
34 | static inline UBool isDigit(char c) {return isInList(c, DIGIT_CHAR);} | |
35 | static inline UBool isWhiteSpace(char c) {return isInList(c, WHITE_SPACE);} | |
36 | static inline UBool isAlphabet(char c) {return isInList(c, ALPHABET);} | |
37 | static inline UBool isSpecialAsciiChar(char c) {return isInList(c,SPECIAL);} | |
38 | ||
39 | ||
40 | ||
41 | /////////////////////////////////////////////////////////// | |
42 | // | |
43 | // Helper classes | |
44 | // | |
45 | ||
46 | class Buffer_byte{ | |
47 | // Utility class, can be treated as an auto expanded array. no boundary check. | |
48 | ||
49 | typedef char byte; | |
50 | byte * start; | |
51 | byte * current; | |
52 | int buffer_size; // size unit is byte | |
53 | public: | |
54 | inline int content_size(){return current - start;} // size unit is byte | |
55 | ||
56 | private: | |
57 | inline void expand(int add_size = 100){ // size unit is byte | |
58 | int new_size = buffer_size + add_size; | |
59 | ||
60 | int cs_snap = content_size(); | |
61 | start = (byte *) realloc(start, new_size); // may change the value of start | |
62 | current = start + cs_snap; | |
63 | ||
64 | memset(current, 0, add_size); | |
65 | buffer_size = new_size; | |
66 | } | |
67 | ||
68 | inline void expand_to(int size){ | |
69 | int r = size - buffer_size; | |
70 | if (r > 0) { | |
71 | expand(r); // simply expand, no block alignment | |
72 | } | |
73 | } | |
74 | Buffer_byte(const Buffer_byte &); | |
75 | Buffer_byte & operator = (const Buffer_byte &); | |
76 | public: | |
77 | Buffer_byte():start(NULL),current(start),buffer_size(0){ | |
78 | expand(); | |
79 | } | |
80 | ~Buffer_byte(){ | |
81 | free(start); | |
82 | } | |
83 | ||
84 | inline void reset(){ | |
85 | start != NULL ? memset(start, 0, buffer_size) : 0; | |
86 | current = start; | |
87 | } | |
88 | ||
89 | // Using memory copy method to append a C array to buffer, | |
90 | inline void append(const void * c, int size){ // size unit is byte | |
91 | expand_to(content_size() + size) ; | |
92 | memcpy(current, c, size); | |
93 | current = current + size; | |
94 | } | |
95 | ||
96 | byte * buffer(){ | |
97 | return start; | |
98 | } | |
99 | }; | |
100 | ||
101 | /* | |
102 | The class(es) try to work as bulid-in array, so it overloads these two operators | |
103 | operator type *(); | |
104 | type & operator[]; | |
105 | The first is used to auto type convert, the latter is used to select member. | |
106 | ||
107 | A small trick is the class does not overload the address-of operator. This | |
108 | behavior is different from bulid-in array, but it give us the opportunity | |
109 | to get the address of the class itself. | |
110 | */ | |
111 | //template<typename type> | |
112 | // class BUFFER{ | |
113 | // typedef BUFFER name; | |
114 | #define BUFFER(type, name)\ | |
115 | class name {\ | |
116 | private:\ | |
117 | Buffer_byte buf;\ | |
118 | public:\ | |
119 | name & reset() {buf.reset(); return *this;}\ | |
120 | name & append(type c) {buf.append(&c, sizeof(type)); return *this;}\ | |
121 | name & append_array(const type * p, int size) {buf.append(p, sizeof(type)*size); return *this;}\ | |
122 | type & operator [] (int i) { return ((type *) buf.buffer())[i];}\ | |
123 | operator type *(){return (type *) buf.buffer();} \ | |
124 | int content_size(){return buf.content_size() / sizeof(type);}\ | |
125 | } | |
126 | ||
127 | ||
128 | class Pick{ | |
129 | /* The Pick is the basic language generator element*/ | |
130 | public: | |
131 | // generate a string accroding the syntax | |
132 | // Return a null-terminated c-string. The buffer is owned by callee. | |
133 | virtual const char* next() = 0; | |
134 | virtual ~Pick(){}; | |
135 | }; | |
136 | ||
137 | //typedef BUFFER<char> Buffer_char; | |
138 | //typedef BUFFER<int> Buffer_int; | |
139 | //typedef BUFFER<Pick *> Buffer_pPick; | |
140 | BUFFER(char, Buffer_char); | |
141 | BUFFER(int, Buffer_int); | |
142 | BUFFER(Pick *, Buffer_pPick); | |
143 | ||
144 | class SymbolTable{ | |
145 | /* Helper class. | |
146 | * It's a mapping table between 'variable name' and its 'active Pick object' | |
147 | */ | |
148 | private: | |
149 | Buffer_char name_buffer; // var names storage space | |
150 | ||
151 | Buffer_int names; // points to name (offset in name_buffer) | |
152 | Buffer_pPick refs; // points to Pick | |
153 | ||
154 | int get_index(const char *const var_name){ | |
155 | int len = names.content_size(); | |
156 | for (int i=0; i< len; i++){ | |
157 | if (strcmp(var_name, name_buffer + names[i]) == 0){ | |
158 | return i; | |
159 | } | |
160 | } | |
161 | return -1; | |
162 | } | |
163 | ||
164 | public: | |
165 | enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF}; | |
166 | ||
167 | RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NULL /*[out] Pick* */){ | |
168 | if (!var_name) return EMPTY; // NULL name | |
169 | ||
170 | int i = get_index(var_name); | |
171 | if (i == -1){ | |
172 | return NO_VAR; // new name | |
173 | } | |
174 | if (!refs[i]){ // exist name, no ref | |
175 | return NO_REF; | |
176 | } else { | |
177 | if (ref) { | |
178 | *ref = refs[i]; | |
179 | } | |
180 | return HAS_REF; // exist name, has ref | |
181 | } | |
182 | } | |
183 | ||
184 | void put(const char *const var_name, Pick *const var_ref = NULL){ | |
185 | int i = get_index(var_name); | |
186 | switch(find(var_name)){ | |
187 | case EMPTY: // NULL name | |
188 | break; | |
189 | case NO_VAR: // new name | |
190 | int offset; | |
191 | offset = name_buffer.content_size(); | |
192 | name_buffer.append_array(var_name, strlen(var_name) + 1); | |
193 | names.append(offset); | |
194 | refs.append(var_ref); | |
195 | break; | |
196 | case NO_REF: // exist name, no ref | |
197 | refs[i] = var_ref; // link definition with variable | |
198 | break; | |
199 | case HAS_REF: // exist name, has ref | |
200 | if (var_ref){ | |
201 | refs[i] = var_ref; | |
202 | } | |
203 | break; | |
204 | default: | |
205 | ; // ASSERT(FALSE); | |
206 | } | |
207 | return; | |
208 | } | |
209 | ||
210 | UBool is_complete(){ | |
211 | int n = names.content_size(); | |
212 | for (int i=0; i<n; ++i){ | |
213 | if (refs[i] == NULL){ | |
214 | return FALSE; | |
215 | } | |
216 | } | |
217 | return TRUE; | |
218 | } | |
219 | ||
220 | void reset(){ | |
221 | names.reset(); | |
222 | name_buffer.reset(); | |
223 | ||
224 | // release memory here | |
225 | int s = refs.content_size(); | |
226 | for (int i=0; i < s; i++){ | |
227 | delete refs[i]; // TOFIX: point alias/recursion problem | |
228 | } | |
229 | refs.reset(); | |
230 | } | |
231 | ||
232 | ~SymbolTable(){ | |
233 | reset(); | |
234 | } | |
235 | }; | |
236 | ||
237 | ||
238 | /* | |
239 | // Document of class Escaper | |
240 | // | |
241 | // ATTENTION: | |
46f4442e | 242 | // From http://icu-project.org/userguide/Collate_Customization.html. |
73c04bcf A |
243 | // We get the precedence of escape/quote operations |
244 | // | |
245 | // (highest) 1. backslash \ | |
246 | // 2. two single quotes '' | |
247 | // 3. quoting ' ' | |
248 | // | |
249 | // ICU Collation should accept following as the same string. | |
250 | // | |
251 | // 1) 'ab'c _ | |
252 | // 2) a\bc \ | |
253 | // 3) a'b'\c |- They are equal. | |
254 | // 4) abc _/ | |
255 | // | |
256 | // From "two single quotes", we have following deductions | |
257 | // D1. empty quoting is illgal. (obviously) | |
258 | // D2. no contact operation between two quotings | |
259 | // '.''.' is not .. it is .'. | |
260 | // D3. "two single quotes" cannot contact two quoting simultaneously | |
261 | // '..''''.' is not ..'. it is ..''. | |
262 | // NOTICE: | |
263 | // "two single quotes" can contact before one quoting | |
264 | // '''.' is '. | |
265 | // "two single quotes" can literally contact after one quoting | |
266 | // But, from syntax, it's one quoting including a "two single quotes" | |
267 | // '.''' is .' | |
268 | // D4. "two single quotes" cannot solely be included in quoting | |
269 | // '''' is not ' it is '' | |
270 | // NOTICE: These are legal | |
271 | // '.''.' is .'. | |
272 | // '.''' is .' | |
273 | // | |
274 | // dicision | |
275 | // /\ | |
276 | // /__\ | |
277 | // output buffer input buffer | |
278 | // | |
279 | // To make our dicision (within an atom operation) without caring input and output buffer, | |
280 | // following calling pattern (within an atom operation) shall be avoided | |
281 | // | |
282 | // P1 open_quoting() then close_quoting() (direct violation) D1 | |
283 | // P2 close_quoting() then open_quoting() (direct violation) D2 | |
284 | // P3 empty open_quoting() (indirect violation) D1, D4 | |
285 | // P4 empty close_quoting() (indirect violation) D2, D3 | |
286 | // P5 open_quoting() then two single quotes (indirect violation) D4 | |
287 | // P6 close_quoting() then two single quotes (indirect violation) D3 | |
288 | // | |
289 | // two single quotes escaping will not open_ or close_ quoting() | |
290 | // The choice will not lose some quoing forms. | |
291 | // | |
292 | // For open_quoting(), | |
293 | // we may get this form quoting ''' P5 | |
294 | // It may raise a bug ''''x | |
295 | // If we expect | |
296 | // '''.' let the next char open the quoting | |
297 | // '.''.' the quoting is already opened by preceding char | |
298 | // | |
299 | // For close_quoting() | |
300 | // we will get this form quoting '.''' P6 | |
301 | // It may raise a bug '.''''.' | |
302 | // If we expect | |
303 | // '.'''\. let the next char close the quoting | |
304 | // '.''''.' the expectation is wrong! using '.'\''.' instead | |
305 | // | |
306 | // It's a hard work to re-adjust generation opportunity for various escaping form. | |
307 | // We just simply ignore it. | |
308 | */ | |
309 | class Escaper{ | |
310 | public: | |
311 | enum CHOICE {YES, NO, RAND}; | |
312 | enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC}; | |
313 | private: | |
314 | class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class | |
315 | private: | |
316 | const CHOICE tag; | |
317 | public: | |
318 | Bool(CHOICE flag=RAND):tag(flag){} | |
319 | operator UBool() { // conversion operator | |
320 | return tag == RAND ? rand()%2 : tag == YES; | |
321 | //if (tag == RAND){ | |
322 | // return rand()%2 == 1; | |
323 | //} else { | |
324 | // return tag == YES ? TRUE : FALSE; | |
325 | //} | |
326 | } | |
327 | }; | |
328 | public: | |
329 | Escaper(CHOICE escapeLiteral = RAND, | |
330 | CHOICE twoQuotesEscape = RAND, | |
331 | ESCAPE_FORM escapeForm = RAND_ESC): | |
332 | escape_form(escapeForm), | |
333 | escape_literal(escapeLiteral), | |
334 | two_quotes_escape(twoQuotesEscape), | |
335 | is_quoting(FALSE){} | |
336 | private: | |
337 | Buffer_char str; | |
338 | ESCAPE_FORM escape_form; | |
339 | Bool escape_literal; | |
340 | Bool two_quotes_escape; | |
341 | UBool quote_escape; | |
342 | UBool bslash_escape; | |
343 | UBool is_quoting; | |
344 | ||
345 | void set_options(){ | |
346 | ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : escape_form; | |
347 | switch (t){ | |
348 | case BSLASH_ONLY : | |
349 | bslash_escape = TRUE; quote_escape = FALSE; break; | |
350 | case QUOTE_ONLY: | |
351 | bslash_escape = FALSE;quote_escape = TRUE; break; | |
352 | case QUOTE_AND_BSLAH: | |
353 | bslash_escape = TRUE; quote_escape = TRUE; break; | |
354 | default: | |
355 | ;// error | |
356 | } | |
357 | } | |
358 | ||
359 | void reset(){ | |
360 | str.reset(); | |
361 | is_quoting = FALSE; | |
362 | } | |
363 | ||
364 | inline void open_quoting(){ | |
365 | if(is_quoting){ | |
366 | // do nothing | |
367 | } else { | |
368 | str.append('\''); | |
369 | is_quoting = TRUE; | |
370 | } | |
371 | } | |
372 | inline void close_quoting(){ | |
373 | if(is_quoting){ | |
374 | str.append('\''); | |
375 | is_quoting = FALSE; | |
376 | } else { | |
377 | // do nothing | |
378 | } | |
379 | } | |
380 | ||
381 | // str [in] null-terminated c-string | |
382 | void append(const char * strToAppend){ | |
383 | for(;*strToAppend != 0; strToAppend++){ | |
384 | append(*strToAppend); | |
385 | } | |
386 | } | |
387 | ||
388 | inline void append(const char c){ | |
389 | set_options(); | |
390 | ||
391 | if (c == '\\'){ | |
392 | quote_escape ? open_quoting() : close_quoting(); | |
393 | //bslash_escape always true here | |
394 | str.append('\\'); | |
395 | str.append('\\'); | |
396 | } else if (c == '\''){ | |
397 | if (two_quotes_escape){ // quoted using two single quotes | |
398 | // See documents in anonymous.design | |
399 | str.append('\''); | |
400 | str.append('\''); | |
401 | } else{ | |
402 | quote_escape ? open_quoting() : close_quoting(); | |
403 | //bslash_escape always true here | |
404 | str.append('\\'); | |
405 | str.append('\''); | |
406 | } | |
407 | } else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){ | |
408 | quote_escape ? open_quoting() : close_quoting(); | |
409 | if (bslash_escape) str.append('\\'); | |
410 | str.append(c); | |
411 | } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal | |
412 | if (escape_literal){ | |
413 | quote_escape ? open_quoting() : close_quoting(); | |
414 | if (bslash_escape) str.append('\\'); | |
415 | str.append(c); | |
416 | } else { | |
417 | close_quoting(); | |
418 | str.append(c); | |
419 | } | |
420 | } | |
421 | } | |
422 | ||
423 | public: | |
424 | // Return a null-terminate c-string. The buffer is owned by callee. | |
425 | char * operator()(const char * literal /*c-string*/){ | |
426 | str.reset(); | |
427 | for(;*literal != 0; literal++){ | |
428 | append(*literal); | |
429 | } | |
430 | close_quoting(); // P4 exception, to close whole quoting | |
431 | return str; | |
432 | } | |
433 | }; | |
434 | ||
435 | class WeightedRand{ | |
436 | // Return a random number in [0, size) | |
437 | // Every number has different chance (aka weight) to be selected. | |
438 | private: | |
439 | Buffer_int weights; | |
440 | double total; | |
441 | WeightedRand(const WeightedRand &); | |
442 | WeightedRand & operator = (const WeightedRand &); | |
443 | public: | |
444 | WeightedRand(Buffer_int * weight_list = NULL, int size = 0){ | |
445 | if ( weight_list == NULL){ | |
446 | for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT); | |
447 | } else { | |
448 | int s = weight_list->content_size(); | |
449 | if (s < size){ | |
450 | weights.append_array( (*weight_list),s); | |
451 | for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT); | |
452 | } else { // s >= size | |
453 | weights.append_array( (*weight_list),size); | |
454 | } | |
455 | } | |
456 | total = 0; | |
457 | int c = weights.content_size(); | |
458 | for (int i=0; i<c; ++i){ | |
459 | total += weights[i]; | |
460 | } | |
461 | } | |
462 | ||
463 | void append(int weight){ | |
464 | weights.append(weight); | |
465 | total += weight; | |
466 | } | |
467 | ||
468 | // Give a random number with the consideration of weight. | |
469 | // Every random number is associated with a weight. | |
470 | // It identifies the chance to be selected, | |
471 | // larger weight has more chance to be selected. | |
472 | // | |
473 | // | |
474 | // ______________________ every slot has equal chance | |
475 | // | |
476 | // [____][_][___][______] each item has different slots, hence different chance | |
477 | // | |
478 | // | |
479 | // The algorithms to generate the number is illustrated by preceding figure. | |
480 | // First, a slot is selected by rand(). Then we translate the slot to corresponding item. | |
481 | // | |
482 | int next(){ | |
483 | // get a random in [0,1] | |
484 | double reference_mark = (double)rand() / (double)RAND_MAX; | |
485 | ||
486 | // get the slot's index, 0 <= mark <= total; | |
487 | double mark = total * reference_mark; | |
488 | ||
489 | // translate the slot to corresponding item | |
490 | int i=0; | |
491 | for (;;){ | |
492 | mark -= weights[i]; // 0 <= mark <= total | |
493 | if (mark <= 0) | |
494 | break; | |
495 | i++; | |
496 | } | |
497 | return i; | |
498 | } | |
499 | }; | |
500 | ||
501 | /////////////////////////////////////////////////////////// | |
502 | // | |
503 | // The parser result nodes | |
504 | // | |
505 | ||
506 | class Literal : public Pick { | |
507 | public: | |
508 | virtual const char* next(){ | |
509 | return str; | |
510 | } | |
511 | Literal(const char * s /*c-string*/){ | |
512 | str.append_array(s, strlen(s) + 1); | |
513 | } | |
514 | private: | |
515 | Buffer_char str; //null-terminated c-string | |
516 | }; | |
517 | ||
518 | class Variable : public Pick { | |
519 | public: | |
520 | Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){ | |
521 | this->var_name.append_array(varName, strlen(varName) + 1); | |
522 | if ((symbol_table = symbols)){ | |
523 | symbol_table->put(varName, varRef); | |
524 | } | |
525 | } | |
526 | ||
527 | operator const char *(){ | |
528 | return var_name; | |
529 | } | |
530 | ||
531 | virtual const char* next(){ | |
532 | if (symbol_table){ | |
533 | Pick * var_ref = NULL; | |
534 | symbol_table->find(var_name, &var_ref); | |
535 | if (var_ref) { | |
536 | return var_ref->next(); | |
537 | } | |
538 | } | |
539 | return ""; // dumb string | |
540 | } | |
541 | private: | |
542 | Buffer_char var_name; | |
543 | SymbolTable * symbol_table; | |
544 | }; | |
545 | ||
546 | class Quote : public Pick{ | |
547 | public: | |
548 | Quote(Pick & base):item(base),e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ONLY){ | |
549 | } | |
550 | virtual const char* next(){ | |
551 | return e(item.next()); | |
552 | } | |
553 | private: | |
554 | Pick & item; | |
555 | Buffer_char str; | |
556 | Escaper e; | |
557 | }; | |
558 | ||
559 | ||
560 | class Morph : public Pick{ | |
561 | /* | |
562 | The difference between morph and an arbitrary random string is that | |
563 | a morph changes slowly. When we build collation rules, for example, | |
564 | it is a much better test if the strings we use are all in the same | |
565 | 'neighborhood'; they share many common characters. | |
566 | */ | |
567 | public: | |
568 | Morph(Pick & base):item(base){} | |
569 | ||
570 | virtual const char* next(){ | |
571 | current.reset(); | |
572 | const char * s = item.next(); | |
573 | current.append_array(s, strlen(s) + 1); | |
574 | if (last.content_size() == 0) { | |
575 | str.reset(); | |
576 | last.reset(); | |
577 | str.append_array(current, current.content_size()); | |
578 | last.append_array(current, current.content_size()); | |
579 | } else { | |
580 | morph(); | |
581 | } | |
582 | return str; | |
583 | } | |
584 | private: | |
585 | Pick & item; | |
586 | Buffer_char str; | |
587 | Buffer_char last; | |
588 | Buffer_char current; | |
589 | ||
590 | char * p_last; | |
591 | char * p_curr; | |
592 | ||
593 | void copy_curr(){ | |
594 | if (*p_curr) { | |
595 | str.append(*p_curr); | |
596 | p_curr++; | |
597 | } | |
598 | } | |
599 | ||
600 | void copy_last(){ | |
601 | if (*p_last) { | |
602 | str.append(*p_last); | |
603 | p_last++; | |
604 | } | |
605 | } | |
606 | ||
607 | // copy 0, 1, or 2 character(s) to str | |
608 | void copy(){ | |
609 | static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5); | |
610 | ||
611 | switch (wr.next()){ | |
612 | case 0: // copy last -- has 10 times chance than others | |
613 | copy_last(); | |
614 | break; | |
615 | case 1: // copy both | |
616 | copy_curr(); | |
617 | copy_last(); | |
618 | break; | |
619 | case 2: // copy both | |
620 | copy_last(); | |
621 | copy_curr(); | |
622 | break; | |
623 | case 3: | |
624 | copy_curr(); | |
625 | break; | |
626 | case 4: // copy nothing | |
627 | break; | |
628 | default: | |
629 | // ASSERT(FALSE); | |
630 | ; | |
631 | } | |
632 | } | |
633 | ||
634 | void morph(void){ | |
635 | int min = strlen(last); | |
636 | int max = strlen(current); | |
637 | if (min > max){ | |
638 | int temp = min; | |
639 | min = max; | |
640 | max = temp; | |
641 | } | |
642 | ||
643 | int len = min + rand()%(max - min + 1); // min + [0, diff] | |
644 | p_curr = current; | |
645 | p_last = last; | |
646 | str.reset(); | |
647 | ||
648 | for (; str.content_size()<len && *p_curr && *p_last;){ | |
649 | copy(); // copy 0, 1, or 2 character(s) to str | |
650 | } | |
651 | ||
652 | if (str.content_size() == len) { | |
653 | str.append(0); | |
654 | final(); | |
655 | return; | |
656 | } | |
657 | ||
658 | if (str.content_size() > len) { // if the last copy copied two characters | |
659 | str[len]=0; | |
660 | final(); | |
661 | return; | |
662 | } | |
663 | ||
664 | // str.content_size() < len | |
665 | if (*p_last) { | |
666 | for (; str.content_size() < len; copy_last()); | |
667 | } else if (*p_curr){ | |
668 | for (; str.content_size() < len; copy_curr()); | |
669 | } | |
670 | ||
671 | int last_len = last.content_size(); | |
672 | for (;str.content_size() < len;){ | |
673 | str.append(last[rand()%last_len]); | |
674 | } | |
675 | str.append(0); | |
676 | final(); | |
677 | } | |
678 | ||
679 | void final(){ | |
680 | last.reset(); | |
681 | last.append_array(current, current.content_size()); | |
682 | } | |
683 | }; | |
684 | ||
685 | class Sequence : public Pick { | |
686 | public: | |
687 | virtual const char* next(){ | |
688 | str.reset(); | |
689 | int s = items.content_size(); | |
690 | for(int i=0; i < s; i++){ | |
691 | const char * t = items[i]->next(); | |
692 | str.append_array(t, strlen(t)); | |
693 | } | |
694 | str.append(0); // terminal null | |
695 | return str; | |
696 | } | |
697 | ||
698 | void append (Pick * node){ | |
699 | items.append(node); | |
700 | } | |
701 | ||
702 | virtual ~Sequence(){ | |
703 | int s = items.content_size(); | |
704 | for(int i=0; i < s; i++){ | |
705 | //How can assure the item is got from heap? | |
706 | //Let's assume it. | |
707 | delete items[i]; // TOFIX: point alias/recursion problem | |
708 | items[i] = NULL; | |
709 | } | |
710 | } | |
711 | private: | |
712 | Buffer_pPick items; | |
713 | Buffer_char str; //null-terminated c-string | |
714 | }; | |
715 | ||
716 | class Repeat : public Pick { | |
717 | private: | |
718 | Pick * item; | |
719 | Buffer_char str; | |
720 | WeightedRand wr; | |
721 | int min; | |
722 | int max; | |
723 | int select_a_count(){ | |
724 | return min + wr.next(); | |
725 | } | |
726 | public: | |
727 | virtual const char* next(){ | |
728 | str.reset(); | |
729 | int c = select_a_count(); | |
730 | for(int i=0; i< c; i++){ | |
731 | const char * t = item->next(); | |
732 | str.append_array(t, strlen(t)); | |
733 | } | |
734 | str.append(0); | |
735 | return str; | |
736 | } | |
737 | ||
738 | Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights = NULL): | |
739 | wr(weights, maxCount-minCount +1) { | |
740 | this->item = base; | |
741 | this->min = minCount; | |
742 | this->max = maxCount; | |
743 | } | |
744 | virtual ~Repeat(){ | |
745 | delete item; // TOFIX: point alias/recursion problem | |
746 | item = NULL; | |
747 | } | |
748 | }; | |
749 | ||
750 | ||
751 | class Alternation : public Pick { | |
752 | public: | |
753 | virtual const char* next(){ | |
754 | str.reset(); | |
755 | int i = wr.next(); | |
756 | const char * t = items[i]->next(); | |
757 | str.append_array(t, strlen(t) + 1); | |
758 | return str; | |
759 | } | |
760 | virtual ~Alternation(){ | |
761 | int s = items.content_size(); | |
762 | for(int i=0; i < s; i++){ | |
763 | delete items[i]; // TOFIX: point alias/recursion problem | |
764 | items[i] = NULL; | |
765 | } | |
766 | } | |
767 | ||
768 | Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){ | |
769 | items.append(node); | |
770 | wr.append(weight); | |
771 | return *this; | |
772 | } | |
773 | private: | |
774 | Buffer_pPick items; | |
775 | Buffer_char str; // null-terminated c-string | |
776 | WeightedRand wr; | |
777 | }; | |
778 | ||
779 | /////////////////////////////////////////////////////////// | |
780 | // | |
781 | // The parser | |
782 | // | |
783 | ||
784 | enum TokenType {STRING, VAR, NUMBER, STREAM_END, ERROR, QUESTION, STAR, PLUS, LBRACE, RBRACE, LPAR, RPAR, SEMI, EQ, COMMA, BAR, AT, WAVE, PERCENT}; | |
785 | ||
786 | class Scanner{ | |
787 | friend int DumpScanner(Scanner & s, UBool dumb); | |
788 | private: | |
789 | const char * source; | |
790 | const char * working; | |
791 | const char * history; // for debug | |
792 | enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLASH, IN_BSLASH, IN_STRING, DONE}; | |
793 | StateType state; | |
794 | void terminated(TokenType t){ | |
795 | working--; // return the peeked character | |
796 | tokenType = t; | |
797 | token.append(0); // close buffer | |
798 | state = DONE; | |
799 | } | |
800 | public: | |
801 | // the buffer of "source" is owned by caller | |
802 | Scanner(const char *src/*[in] c-string*/ = NULL):source(src){ | |
803 | working = src; | |
804 | history = working; | |
805 | state = DONE; | |
806 | tokenType = ERROR; | |
807 | } | |
808 | ||
809 | //void setSource(const char *const src /*[in] c-string*/){ | |
810 | // *(&const_cast<const char *>(source)) = src; | |
811 | //} | |
812 | ||
813 | Buffer_char token; | |
814 | TokenType tokenType; | |
815 | ||
816 | TokenType getNextToken(){ | |
817 | token.reset(); | |
818 | state = START; | |
819 | history = working; // for debug | |
820 | while (state != DONE){ | |
821 | char c = *working++; | |
822 | if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE | |
823 | terminated(ERROR); | |
824 | break; // while | |
825 | } | |
826 | switch(state){ | |
827 | case START: | |
828 | tokenType = ERROR; | |
829 | switch(c){ | |
830 | case '?' : tokenType = QUESTION; break; | |
831 | case '*' : tokenType = STAR; break; | |
832 | case '+' : tokenType = PLUS; break; | |
833 | case '{' : tokenType = LBRACE; break; | |
834 | case '}' : tokenType = RBRACE; break; | |
835 | case '(' : tokenType = LPAR; break; | |
836 | case ')' : tokenType = RPAR; break; | |
837 | case ';' : tokenType = SEMI; break; | |
838 | case '=' : tokenType = EQ; break; | |
839 | case ',' : tokenType = COMMA; break; | |
840 | case '|' : tokenType = BAR; break; | |
841 | case '@' : tokenType = AT; break; | |
842 | case '~' : tokenType = WAVE; break; | |
843 | case '%' : tokenType = PERCENT; break; | |
844 | case 0 : tokenType = STREAM_END; working-- /*avoid buffer overflow*/; break; | |
845 | } | |
846 | if (tokenType != ERROR){ | |
847 | token.append(c); | |
848 | token.append(0); | |
849 | state = DONE; | |
850 | break; // START | |
851 | } | |
852 | switch(c){ | |
853 | case '$' : state = IN_VAR_FIRST; token.append(c); break; | |
854 | case '\'' : state = IN_QUOTE; break; | |
855 | case '\\' : state = IN_BSLASH; break; | |
856 | default: | |
857 | if (isWhiteSpace(c)){ // state = START; //do nothing | |
858 | } else if (isDigit(c)){ state = IN_NUM; token.append(c); | |
859 | } else if (isAlphabet(c)){ state = IN_STRING; token.append(c); | |
860 | } else {terminated(ERROR);} | |
861 | } | |
862 | break;//START | |
863 | case IN_NUM: | |
864 | if (isDigit(c)){ | |
865 | token.append(c); | |
866 | } else { | |
867 | terminated(NUMBER); | |
868 | } | |
869 | break;//IN_NUM | |
870 | case IN_VAR_FIRST: | |
871 | if (isAlphabet(c)){ | |
872 | token.append(c); | |
873 | state = IN_VAR; | |
874 | } else { | |
875 | terminated(ERROR); | |
876 | } | |
877 | break; // IN_VAR_FISRT | |
878 | case IN_VAR: | |
879 | if (isAlphabet(c) || isDigit(c)){ | |
880 | token.append(c); | |
881 | } else { | |
882 | terminated(VAR); | |
883 | } | |
884 | break;//IN_VAR | |
885 | case IN_STRING: | |
886 | // About the scanner's behavior for STRING, AT, and ESCAPE: | |
887 | // All of them can be contacted with each other. | |
888 | // This means the scanner will eat up as much as possible strings | |
889 | // (STRING, AT, and ESCAPE) at one time, with no regard of their | |
890 | // combining sequence. | |
891 | // | |
892 | if (c == '\''){ | |
893 | state = IN_QUOTE; // the first time we see single quote | |
894 | } else if (c =='\\'){ // back slash character | |
895 | state = IN_BSLASH; | |
896 | } else if (isAlphabet(c) || isDigit(c)){ | |
897 | token.append(c); | |
898 | } else{ | |
899 | terminated(STRING); | |
900 | } | |
901 | break;//IN_STRING | |
902 | case IN_QUOTE: | |
903 | if (c == '\''){ // the second time we see single quote | |
904 | state = IN_STRING; // see document in IN_STRING | |
905 | } else if ( c== '\\') { // backslah escape in quote | |
906 | state = IN_QUOTE_BSLASH; | |
907 | } else { | |
908 | token.append(c); // eat up everything, includes back slash | |
909 | } | |
910 | break;//IN_QUOTE | |
911 | case IN_QUOTE_BSLASH: | |
912 | case IN_BSLASH: | |
913 | switch (c){ | |
914 | case 'n' : token.append('\n'); break; | |
915 | case 'r' : token.append('\r'); break; | |
916 | case 't' : token.append('\t'); break; | |
917 | case '\'' : token.append('\''); break; | |
918 | case '\\' : token.append('\\'); break; | |
919 | default: token.append(c); // unknown escaping, treat it as literal | |
920 | } | |
921 | if (state == IN_BSLASH){ | |
922 | state = IN_STRING; // see document in IN_STRING | |
923 | } else { // state == IN_QUOTE_BSLASH | |
924 | state = IN_QUOTE; | |
925 | } | |
926 | break;//IN_BSLASH | |
927 | case DONE: /* should never happen */ | |
928 | default: | |
929 | working--; | |
930 | tokenType = ERROR; | |
931 | state = DONE; | |
932 | break; | |
933 | }//switch(state) | |
934 | }//while (state != DONE) | |
935 | ||
936 | return tokenType; | |
937 | } | |
938 | };//class Scanner | |
939 | ||
940 | class Parser{ | |
941 | friend UBool TestParser(); | |
942 | friend class TestParserT; | |
943 | friend class LanguageGenerator_impl; | |
944 | private: | |
945 | Scanner s; | |
946 | TokenType & token; | |
947 | int min_max; // for the evil infinite | |
948 | ||
949 | UBool match(TokenType expected){ | |
950 | if (token == expected) { | |
951 | token = s.getNextToken(); | |
952 | return TRUE; | |
953 | } else { | |
954 | //s.dumpCurrentPoint(); | |
955 | return FALSE; | |
956 | } | |
957 | } | |
958 | ||
959 | UBool weight(int & value){ | |
960 | if (token == NUMBER){ | |
961 | int temp = atoi(s.token); | |
962 | match(NUMBER); | |
963 | if (match(PERCENT)){ | |
964 | value = temp; | |
965 | return TRUE; | |
966 | } | |
967 | } | |
968 | return FALSE; | |
969 | } | |
970 | ||
971 | UBool repeat (Pick* &node /*in,out*/){ | |
972 | if (node == NULL) return FALSE; | |
973 | ||
974 | int count = -2; | |
975 | int min = -2; | |
976 | int max = -2; | |
977 | UBool question = FALSE; | |
978 | switch (token){ | |
979 | case QUESTION: | |
980 | match(QUESTION); | |
981 | min = 0; | |
982 | max = 1; | |
983 | count = 2; | |
984 | question = TRUE; | |
985 | break; | |
986 | case STAR: | |
987 | match(STAR); | |
988 | min = 0; | |
989 | max = -1; | |
990 | count = -1; | |
991 | break; | |
992 | case PLUS: | |
993 | match(PLUS); | |
994 | min = 1; | |
995 | max = -1; | |
996 | count = -1; | |
997 | break; | |
998 | case LBRACE: | |
999 | match(LBRACE); | |
1000 | if (token != NUMBER){ | |
1001 | return FALSE; | |
1002 | }else { | |
1003 | min = atoi(s.token); | |
1004 | match(NUMBER); | |
1005 | if (token == RBRACE){ | |
1006 | match(RBRACE); | |
1007 | max = min; | |
1008 | count = 1; | |
1009 | } else if (token == COMMA) { | |
1010 | match(COMMA); | |
1011 | if (token == RBRACE){ | |
1012 | match(RBRACE); | |
1013 | max = -1; | |
1014 | count = -1; | |
1015 | } else if (token == NUMBER) { | |
1016 | max = atoi(s.token); | |
1017 | match(NUMBER); | |
1018 | count = max - min + 1; | |
1019 | if (!match(RBRACE)) { | |
1020 | return FALSE; | |
1021 | } | |
1022 | } else { | |
1023 | return FALSE; | |
1024 | } | |
1025 | } else { | |
1026 | return FALSE; | |
1027 | } | |
1028 | } | |
1029 | break; | |
1030 | default: | |
1031 | return FALSE; | |
1032 | } | |
1033 | ||
1034 | if (count == -2 || min == -2 || max == -2){ | |
1035 | //ASSERT(FALSE); | |
1036 | return FALSE; | |
1037 | } | |
1038 | ||
1039 | // eat up following weights | |
1040 | Buffer_int weights; | |
1041 | int w; | |
1042 | while (weight(w)){ | |
1043 | weights.append(w); | |
1044 | } | |
1045 | ||
1046 | // for the evil infinite | |
1047 | min_max = min_max > min ? min_max : min; | |
1048 | min_max = min_max > max ? min_max : max; | |
1049 | if (min_max > PSEUDO_INFINIT){ | |
1050 | return FALSE; // PSEUDO_INFINIT is less than the real maximum | |
1051 | } | |
1052 | if (max == -1){ // the evil infinite | |
1053 | max = PSEUDO_INFINIT; | |
1054 | } | |
1055 | // for the strange question mark | |
1056 | if (question && weights.content_size() > 0){ | |
1057 | Buffer_int w2; | |
1058 | w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]); | |
1059 | node = new Repeat(node,min,max,&w2); | |
1060 | return TRUE; | |
1061 | } | |
1062 | node = new Repeat(node,min,max,&weights); | |
1063 | return TRUE; | |
1064 | } | |
1065 | ||
1066 | UBool core(Pick* &node /*out*/){ | |
1067 | if (node != NULL) return FALSE; //assert node == NULL | |
1068 | ||
1069 | switch(token){ | |
1070 | case LPAR: | |
1071 | match(LPAR); | |
1072 | if(defination(node) && match(RPAR)){ | |
1073 | return TRUE; | |
1074 | } | |
1075 | return FALSE; | |
1076 | case VAR: | |
1077 | node = new Variable(&symbols, s.token); | |
1078 | match(VAR); | |
1079 | return TRUE; | |
1080 | case STRING: | |
1081 | node = new Literal(s.token); | |
1082 | match(STRING); | |
1083 | return TRUE; | |
1084 | default: | |
1085 | return FALSE; | |
1086 | } | |
1087 | } | |
1088 | UBool modified(Pick* &node /*out*/){ | |
1089 | if (node != NULL) return FALSE; //assert node == NULL | |
1090 | ||
1091 | if (!core(node)) { | |
1092 | return FALSE; | |
1093 | } | |
1094 | ||
1095 | for (;;){ | |
1096 | switch(token){ | |
1097 | case WAVE: | |
1098 | match(WAVE); | |
1099 | node = new Morph(*node); | |
1100 | break; | |
1101 | case AT: | |
1102 | match(AT); | |
1103 | node = new Quote(*node); | |
1104 | break; | |
1105 | case QUESTION: | |
1106 | case STAR: | |
1107 | case PLUS: | |
1108 | case LBRACE: | |
1109 | if (!repeat(node)) return FALSE; | |
1110 | break; | |
1111 | case SEMI: // rule definiation closed | |
1112 | case RPAR: // within parenthesis (core closed) | |
1113 | case BAR: // in alternation | |
1114 | case NUMBER: // in alternation, with weight | |
1115 | case LPAR: // in sequence | |
1116 | case VAR: // in sequence | |
1117 | case STRING: // in sequence | |
1118 | return TRUE; | |
1119 | default: | |
1120 | return FALSE; | |
1121 | } | |
1122 | } | |
1123 | } | |
1124 | ||
1125 | ||
1126 | UBool sequence_list(Pick* &node /*in,out*/){ | |
1127 | if (node == NULL) return FALSE; // assert node != NULL | |
1128 | ||
1129 | Sequence* seq = new Sequence(); | |
1130 | Pick * n = node; | |
1131 | ||
1132 | while (token == VAR || token == STRING || token == LPAR){ | |
1133 | seq->append(n); | |
1134 | n = NULL; | |
1135 | if (modified(n)){ | |
1136 | // go on | |
1137 | } else { | |
1138 | goto FAIL; | |
1139 | } | |
1140 | } | |
1141 | ||
1142 | if (token == SEMI || token == RPAR || token == BAR){ | |
1143 | seq->append(n); | |
1144 | node = seq; | |
1145 | return TRUE; | |
1146 | } | |
1147 | FAIL: | |
1148 | delete seq; | |
1149 | return FALSE; | |
1150 | ||
1151 | } | |
1152 | ||
1153 | UBool sequence(Pick* &node /*out*/){ | |
1154 | if (node != NULL) return FALSE; //assert node == NULL | |
1155 | ||
1156 | if (!modified(node)) { | |
1157 | return FALSE; | |
1158 | } | |
1159 | ||
1160 | if (token == VAR || token == STRING || token == LPAR){ | |
1161 | return sequence_list(node); | |
1162 | } else { | |
1163 | return TRUE; // just a modified | |
1164 | } | |
1165 | } | |
1166 | ||
1167 | UBool alternation_list(Pick* &node /*in,out*/){ | |
1168 | if (node == NULL) return FALSE; // assert node != NULL | |
1169 | ||
1170 | Alternation * alt = new Alternation(); | |
1171 | Pick * n = node; | |
1172 | int w = DEFAULT_WEIGHT; | |
1173 | ||
1174 | while (token == NUMBER || token == BAR){ | |
1175 | if(token == NUMBER) { | |
1176 | if (weight(w)){ | |
1177 | if (token == BAR){ | |
1178 | // the middle item, go on | |
1179 | } else { | |
1180 | // the last item or encounter error | |
1181 | break; //while | |
1182 | } | |
1183 | } else { | |
1184 | goto FAIL; | |
1185 | } | |
1186 | } // else token == BAR | |
1187 | match(BAR); | |
1188 | alt->append(n,w); | |
1189 | ||
1190 | n = NULL; | |
1191 | w = DEFAULT_WEIGHT; | |
1192 | if (sequence(n)){ | |
1193 | // go on | |
1194 | } else { | |
1195 | goto FAIL; | |
1196 | } | |
1197 | } | |
1198 | ||
1199 | if (token == SEMI || token == RPAR) { | |
1200 | alt->append(n,w); | |
1201 | node = alt; | |
1202 | return TRUE; | |
1203 | } | |
1204 | FAIL: | |
1205 | delete alt; | |
1206 | return FALSE; | |
1207 | } | |
1208 | ||
1209 | UBool alternation(Pick* &node /*out*/){ | |
1210 | if (node != NULL) return FALSE; //assert node == NULL | |
1211 | ||
1212 | // 'sequence' has higher precedence than 'alternation' | |
1213 | if (!sequence(node)){ | |
1214 | return FALSE; | |
1215 | } | |
1216 | ||
1217 | if (token == BAR || token == NUMBER){ // find a real alternation1, create it. | |
1218 | return alternation_list(node); | |
1219 | } else { | |
1220 | return TRUE; // just a sequence_old | |
1221 | } | |
1222 | } | |
1223 | ||
1224 | ||
1225 | UBool defination(Pick* &node /*out*/){ | |
1226 | if (node != NULL) return FALSE; //assert node == NULL | |
1227 | return alternation(node); | |
1228 | } | |
1229 | ||
1230 | UBool rule(){ | |
1231 | if (token == VAR){ | |
1232 | Buffer_char name; | |
1233 | name.append_array(s.token, strlen(s.token) + 1); | |
1234 | match(VAR); | |
1235 | ||
1236 | if (match(EQ)){ | |
1237 | Pick * t = NULL; | |
1238 | if(defination(t)){ | |
1239 | symbols.put(name, t); | |
1240 | return match(SEMI); | |
1241 | } | |
1242 | } | |
1243 | } | |
1244 | return FALSE; | |
1245 | } | |
1246 | public: | |
1247 | UBool rules(){ | |
1248 | symbols.reset(); | |
1249 | token = s.getNextToken(); | |
1250 | while (rule()){ | |
1251 | } | |
1252 | if (token == STREAM_END){ | |
1253 | return TRUE; | |
1254 | } else { | |
1255 | //s.dumpCurrentPoint(); | |
1256 | return FALSE; | |
1257 | } | |
1258 | } | |
1259 | ||
1260 | public: | |
1261 | SymbolTable symbols; | |
1262 | ||
1263 | Parser(const char *const source):s(source), token(s.tokenType){ | |
1264 | min_max = -2; | |
1265 | } | |
1266 | UBool parse(){ | |
1267 | return rules(); | |
1268 | } | |
1269 | ||
1270 | }; // class Parser | |
1271 | ||
1272 | ||
1273 | /////////////////////////////////////////////////////////// | |
1274 | // | |
1275 | // | |
1276 | // | |
1277 | ||
1278 | int DumpScanner(Scanner & s, UBool dump = TRUE){ | |
1279 | int len = strlen(s.source); | |
1280 | int error_start_offset = s.history - s.source; | |
1281 | if (dump){ | |
1282 | printf("\n=================== DumpScanner ================\n"); | |
1283 | fwrite(s.source, len, 1, stdout); | |
1284 | printf("\n-----parsed-------------------------------------\n"); | |
1285 | fwrite(s.source, s.history - s.source, 1, stdout); | |
1286 | printf("\n-----current------------------------------------\n"); | |
1287 | fwrite(s.history, s.working - s.history, 1, stdout); | |
1288 | printf("\n-----unparsed-----------------------------------\n"); | |
1289 | fwrite(s.working, (s.source + len - s.working), 1, stdout); | |
1290 | printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"); | |
1291 | } | |
1292 | return error_start_offset; | |
1293 | } | |
1294 | ||
1295 | class LanguageGenerator_impl{ | |
1296 | public: | |
1297 | LanguageGenerator_impl(const char *const bnf_definition, const char *const top_node) | |
1298 | :par(bnf_definition), top_node_name(top_node){ | |
1299 | srand((unsigned)time( NULL )); | |
1300 | } | |
1301 | ||
1302 | LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){ | |
1303 | if (par.parse()){ | |
1304 | if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::HAS_REF) { | |
1305 | if (par.symbols.is_complete()) { | |
1306 | return LanguageGenerator::OK; | |
1307 | } else { | |
1308 | if (debug) printf("The bnf definition is incomplete.\n"); | |
1309 | return LanguageGenerator::INCOMPLETE; | |
1310 | } | |
1311 | } else { | |
1312 | if (debug) printf("No top node is found.\n"); | |
1313 | return LanguageGenerator::NO_TOP_NODE; | |
1314 | } | |
1315 | } else { | |
1316 | if(debug) { | |
1317 | printf("The bnf definition is wrong\n"); | |
1318 | DumpScanner(par.s, TRUE); | |
1319 | } | |
1320 | return LanguageGenerator::BNF_DEF_WRONG; | |
1321 | } | |
1322 | } | |
1323 | const char * next(){ | |
1324 | return top_node_ref->next(); | |
1325 | } | |
1326 | ||
1327 | private: | |
1328 | Parser par; | |
1329 | const char *const top_node_name; | |
1330 | Pick * top_node_ref; | |
1331 | }; | |
1332 | ||
1333 | LanguageGenerator::LanguageGenerator():lang_gen(NULL){ | |
1334 | } | |
1335 | ||
1336 | LanguageGenerator::~LanguageGenerator(){ | |
1337 | delete lang_gen; | |
1338 | } | |
1339 | ||
1340 | LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug){ | |
1341 | if (lang_gen){ | |
1342 | delete lang_gen; | |
1343 | } | |
1344 | lang_gen = new LanguageGenerator_impl(bnf_definition, top_node); | |
1345 | PARSE_RESULT r = lang_gen->parseBNF(debug); | |
1346 | if (r != OK){ | |
1347 | delete lang_gen; | |
1348 | lang_gen = NULL; | |
1349 | return r; | |
1350 | } else { | |
1351 | return r; | |
1352 | } | |
1353 | } | |
1354 | const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The buffer is owned by callee. | |
1355 | if (lang_gen){ | |
1356 | return lang_gen->next(); | |
1357 | }else { | |
1358 | return ""; | |
1359 | } | |
1360 | } | |
1361 | ||
1362 | /////////////////////////////////////////////////////////// | |
1363 | // | |
1364 | // The test code for WBNF | |
1365 | // | |
1366 | ||
1367 | #define CALL(fun) \ | |
1368 | if (fun()){ \ | |
1369 | printf("Pass: " #fun "\n");\ | |
1370 | } else { \ | |
1371 | printf("FAILED: !!! " #fun " !!!\n"); \ | |
1372 | } | |
1373 | ||
1374 | #define DUMP_R(fun, var, times) \ | |
1375 | {printf("\n========= " #fun " =============\n"); \ | |
1376 | for (int i=0; i<times; i++) { \ | |
1377 | const char * t = var.next();\ | |
1378 | fwrite(t,strlen(t),1,stdout); \ | |
1379 | printf("\n"); \ | |
1380 | } \ | |
1381 | printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");} | |
1382 | ||
1383 | ||
1384 | ||
1385 | #if TEST_WBNF_TEST | |
1386 | static UBool TestQuote(){ | |
1387 | const char *const str = "This ' A !,z| qq [] .new\tline"; | |
1388 | //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline"; | |
1389 | //// | |
1390 | //// :( we must quote our string to following C syntax | |
1391 | //// cannot type the literal here, it makes our code rather human unreadable | |
1392 | //// very very unconformable! | |
1393 | //// | |
1394 | ///* | |
1395 | //*/ | |
1396 | ||
1397 | //const char *const s1 = "ab'c"; | |
1398 | //const char (* s1_r1) [] = { "ab''c", // ab''c | |
1399 | // "ab\\'c", // ab\'c | |
1400 | // };// | |
1401 | ///* | |
1402 | // . '.' \. | |
1403 | // .. \.\. '.'\. '.'\. '..' // '.''.' wrong | |
1404 | //*/ | |
1405 | ||
1406 | //const char *const s2 = "a..'.b"; // a..'.b | |
1407 | //const char (*s2_r) [] = { "a'..''.'b" // a'..''.'b | |
1408 | // ,"a'..\\'.'b" // a'..\'.'b | |
1409 | // ,"a'..'\\''.'b" // a'..'\''.'b | |
1410 | // };// | |
1411 | ||
1412 | //const char *const s3 = "a..\\.b"; // a..\.b | |
1413 | //const char (*s3_r) [] = { "a'..\\\\.'b" // a'..\\.'b | |
1414 | // ,"a'..'\\\\'.'b" // a'..'\\'.'b | |
1415 | // };// | |
1416 | ||
1417 | // // no catact operation, no choice, must be compact | |
1418 | ||
1419 | srand((unsigned)time( NULL )); | |
1420 | ||
1421 | //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC); | |
1422 | Pick *p = new Literal(str); | |
1423 | Quote q(*p); | |
1424 | ||
1425 | DUMP_R(TestQuote, (*p), 1); | |
1426 | DUMP_R(TestQuote, q, 20); | |
1427 | return FALSE; | |
1428 | } | |
1429 | static UBool TestLiteral(){ | |
1430 | const char * s = "test string99."; | |
1431 | Literal n(s); | |
1432 | const char * r = n.next(); | |
1433 | return strcmp(s,r) == 0; | |
1434 | } | |
1435 | ||
1436 | static UBool TestSequence(){ | |
1437 | Sequence seq; | |
1438 | seq.append(new Literal("abc ")); | |
1439 | seq.append(new Literal(", s")); | |
1440 | ||
1441 | return strcmp(seq.next(), "abc , s") == 0; | |
1442 | } | |
1443 | static UBool TestAlternation(){ | |
1444 | srand((unsigned)time( NULL )); | |
1445 | Alternation alt; | |
1446 | alt.append(new Literal("aaa_10%"),10); | |
1447 | alt.append(new Literal("bbb_0%"),0); | |
1448 | alt.append(new Literal("ccc_10%"),10); | |
1449 | alt.append(new Literal("ddddddd_50%"),50); | |
1450 | ||
1451 | DUMP_R(TestAlternation, alt, 50); | |
1452 | ||
1453 | return FALSE; | |
1454 | } | |
1455 | ||
1456 | static UBool TestBuffer(){ | |
1457 | Buffer_int t; | |
1458 | t.append(1).append(0).append(5); | |
1459 | int s = t.content_size(); | |
1460 | for (int i=0; i<s; ++i){ | |
1461 | printf("%d\n", t[i]); | |
1462 | } | |
1463 | return FALSE; | |
1464 | } | |
1465 | ||
1466 | static UBool TestWeightedRand(){ | |
1467 | srand((unsigned)time( NULL )); | |
1468 | Buffer_int t; | |
1469 | t.append(1).append(0).append(5); | |
1470 | WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4); | |
1471 | // WeightedRand wr(&t,3); | |
1472 | for (int i=0; i< 50; ++i){ | |
1473 | printf("%d\n", wr.next()); | |
1474 | } | |
1475 | return FALSE; | |
1476 | } | |
1477 | ||
1478 | static UBool TestRepeat(){ | |
1479 | srand((unsigned)time( NULL )); | |
1480 | Repeat rep(new Literal("aaa1-5 "), 1, 5); | |
1481 | DUMP_R(TestRepeat, rep, 50); | |
1482 | ||
1483 | Repeat r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append(0).append(5)); | |
1484 | DUMP_R(TestRepeat, r2, 50); | |
1485 | ||
1486 | Repeat r3(new Literal("aaa5-5 "), 5, 5); | |
1487 | DUMP_R(TestRepeat, r3, 50); | |
1488 | ||
1489 | return FALSE; | |
1490 | } | |
1491 | ||
1492 | static UBool TestVariable(){ | |
1493 | SymbolTable tab; | |
1494 | Pick * value = new Literal("string1"); | |
1495 | Variable var1(&tab, "x", value); | |
1496 | ||
1497 | Variable var2(&tab, "y"); | |
1498 | // tab.put(var2, value); // TOFIX: point alias/recursion problem | |
1499 | Pick * value2 = new Literal("string2"); | |
1500 | tab.put(var2, value2); | |
1501 | ||
1502 | Pick * value3 = new Literal("string3"); | |
1503 | Variable var3(&tab, "z"); | |
1504 | tab.put("z", value3); | |
1505 | ||
1506 | UBool pass; | |
1507 | pass = strcmp(var1.next(), value->next()) == 0; | |
1508 | pass = pass && strcmp(var2.next(), value2->next()) == 0; | |
1509 | pass = pass && strcmp(var3.next(), value3->next()) == 0; | |
1510 | return pass; | |
1511 | } | |
1512 | ||
1513 | static UBool TestSymbolTable(){ | |
1514 | Literal * n1 = new Literal("string1"); | |
1515 | Literal * n2 = new Literal("string2"); | |
1516 | SymbolTable t; | |
1517 | t.put("abc", n1); | |
1518 | t.put("$aaa", n2); | |
1519 | // t.put("alias", n1); // TOFIX: point alias/recursion problem | |
1520 | t.put("bbb"); | |
1521 | ||
1522 | UBool pass; | |
1523 | pass = t.find(NULL) == SymbolTable::EMPTY; | |
1524 | pass = pass && t.find("ccc") == SymbolTable::NO_VAR; | |
1525 | pass = pass && t.find("bbb") == SymbolTable::NO_REF; | |
1526 | pass = pass && t.find("abc") == SymbolTable::HAS_REF; | |
1527 | pass = pass && t.find("$aaa") == SymbolTable::HAS_REF; | |
1528 | ||
1529 | t.reset(); | |
1530 | pass = pass && t.find("abc") == SymbolTable::NO_VAR; | |
1531 | return pass; | |
1532 | } | |
1533 | ||
1534 | ||
1535 | static UBool TestScanner(void){ | |
1536 | //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};"; | |
1537 | //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}", | |
1538 | // "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"}; | |
1539 | ||
1540 | const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;"; | |
1541 | const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25", "%", ";"}; | |
1542 | ||
1543 | const char *str = str2; | |
1544 | const char (*str_r)[20] = str2_r; | |
1545 | int tokenNum = sizeof(str2_r)/sizeof(char[20]); | |
1546 | ||
1547 | Scanner t(str); | |
1548 | UBool pass = TRUE; | |
1549 | t.getNextToken(); | |
1550 | int i = 0; | |
1551 | while (pass){ | |
1552 | if (t.tokenType == STREAM_END){ | |
1553 | pass = pass? i == tokenNum : FALSE; | |
1554 | break;//while | |
1555 | } else if (t.tokenType == ERROR){ | |
1556 | pass = FALSE; | |
1557 | break;//while | |
1558 | } else { | |
1559 | pass = strcmp( &(t.token[0]), str_r[i++]) == 0; | |
1560 | t.getNextToken(); | |
1561 | } | |
1562 | } | |
1563 | ||
1564 | //const char ts[] = "$commandList = '['" | |
1565 | //" ( alternate ' ' $alternateOptions" | |
1566 | //" | backwards ' 2'" | |
1567 | //" | normalization ' ' $onoff " | |
1568 | //" | caseLevel ' ' $onoff " | |
1569 | //" | hiraganaQ ' ' $onoff" | |
1570 | //" | caseFirst ' ' $caseFirstOptions" | |
1571 | //" | strength ' ' $strengthOptions" | |
1572 | //" ) ']';" ; | |
1573 | ||
1574 | //Scanner t2(ts); | |
1575 | //pass = TRUE; | |
1576 | //do { | |
1577 | // t2.getNextToken(); | |
1578 | // if (t2.tokenType == ERROR){ | |
1579 | // DumpScanner(t2); | |
1580 | // return FALSE; | |
1581 | // } | |
1582 | //}while (t.tokenType != STREAM_END); | |
1583 | ||
1584 | return pass; | |
1585 | } | |
1586 | ||
1587 | class TestParserT { | |
1588 | public: | |
1589 | UBool operator () (const char *const str, const int exp_error_offset = -1, const UBool dump = TRUE){ | |
1590 | Parser par(str); | |
1591 | if (par.rules()){ | |
1592 | if ( exp_error_offset == -1){ | |
1593 | return TRUE; | |
1594 | }else { | |
1595 | DumpScanner(par.s,dump); | |
1596 | return FALSE; | |
1597 | } | |
1598 | }else { | |
1599 | return DumpScanner(par.s, dump) == exp_error_offset; | |
1600 | } | |
1601 | } | |
1602 | }; | |
1603 | ||
1604 | UBool TestParser(){ | |
1605 | TestParserT test; | |
1606 | ||
1607 | UBool pass = TRUE; | |
1608 | pass = pass && test ("$s = ' ' ? 50%;"); | |
1609 | pass = pass && test("$x = ($var {1,2}) 3%;"); // legal | |
1610 | pass = pass && test("$x = $var {1,2} 3% | b 4%;"); // legal | |
1611 | pass = pass && test("$x = $var {1,2} 3%;"); // legal | |
1612 | pass = pass && test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal | |
1613 | pass = pass && test("$a = b ? 2% | c 5%;"); // legal | |
1614 | pass = pass && test("$x = A B 5% C 10% | D;", 8, FALSE); // illegal 5% | |
1615 | pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc | |
1616 | pass = pass && test("$x = (b 5%) (c 6%);"); // legal | |
1617 | pass = pass && test("$x = (b 5%) c 6%;", 13, FALSE); // illegal 6% | |
1618 | pass = pass && test("$x = b 5% (c 6%);", 9, FALSE); // illegal (c 6%) | |
1619 | pass = pass && test("$x = b 5% c 6%;", 9, FALSE); // illegal c 6% | |
1620 | pass = pass && test("$x = b 5%;"); // legal | |
1621 | pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc | |
1622 | pass = pass && test("$x = a | b | c 4% | d 5%;"); // legal | |
1623 | pass = pass && test("$s = ' ' ? 50% abc;"); // legal | |
1624 | pass = pass && test("$s = a | c d | e f;"); // legal | |
1625 | pass = pass && test( "$z = q 0% | p 1% | r 100%;"); // legal How to check parsed tree?? | |
1626 | ||
1627 | pass = pass && test("$s = ' ' ? 50%;"); | |
1628 | pass = pass && test("$relationList = '<' | '<<' | ';' | '<<<' | ',' | '=';"); | |
1629 | pass = pass && test("$p1 = ($string $s '|' $s)? 25%;"); | |
1630 | pass = pass && test("$p2 = (\\\\ $s $string $s)? 25%;"); | |
1631 | pass = pass && test("$rel2 = $p1 $string $s $p2;"); | |
1632 | pass = pass && test("$relation = $relationList $s ($rel1 | $rel2) $crlf;"); | |
1633 | pass = pass && test("$command = $commandList $crlf;"); | |
1634 | pass = pass && test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;"); | |
1635 | pass = pass && test("$mostRules = $command 1% | $reset 5% | $relation 25%;"); | |
1636 | pass = pass && test("$root = $command{0,5} $reset $mostRules{1,20};"); | |
1637 | ||
1638 | const char collationBNF[] = | |
1639 | "$s = ' '? 50%;" | |
1640 | "$crlf = '\r\n';" | |
1641 | ||
1642 | "$alternateOptions = non'-'ignorable | shifted;" | |
1643 | "$onoff = on | off;" | |
1644 | "$caseFirstOptions = off | upper | lower;" | |
1645 | "$strengthOptions = '1' | '2' | '3' | '4' | 'I';" | |
1646 | "$commandList = '['" | |
1647 | " ( alternate ' ' $alternateOptions" | |
1648 | " | backwards ' 2'" | |
1649 | " | normalization ' ' $onoff " | |
1650 | " | caseLevel ' ' $onoff " | |
1651 | " | hiraganaQ ' ' $onoff" | |
1652 | " | caseFirst ' ' $caseFirstOptions" | |
1653 | " | strength ' ' $strengthOptions" | |
1654 | " ) ']';" | |
1655 | "$command = $commandList $crlf;" | |
1656 | ||
1657 | "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;" | |
1658 | "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;" | |
1659 | "$positionList = '[' (first | last) ' ' $allTypes ']';" | |
1660 | ||
1661 | "$beforeList = '[before ' ('1' | '2' | '3') ']';" | |
1662 | ||
1663 | "$relationList = (" | |
1664 | " '<'" | |
1665 | " | '<<'" | |
1666 | " | ';'" | |
1667 | " | '<<<'" | |
1668 | " | ','" | |
1669 | " | '='" | |
1670 | ");" | |
1671 | "$string = $magic;" | |
1672 | "$rel1 = '[variable top]' $s;" | |
1673 | "$p1 = ($string $s '|' $s)? 25%;" | |
1674 | "$p2 = (\\\\ $s $string $s)? 25%;" | |
1675 | "$rel2 = $p1 $string $s $p2;" | |
1676 | "$relation = $relationList $s ($rel1 | $rel2) $crlf;" | |
1677 | ||
1678 | "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;" | |
1679 | "$mostRules = $command 1% | $reset 5% | $relation 25%;" | |
1680 | "$root = $command{0,5} $reset $mostRules{1,20};" | |
1681 | ; | |
1682 | ||
1683 | pass = pass && test(collationBNF); | |
1684 | ||
1685 | ||
1686 | return pass; | |
1687 | } | |
1688 | ||
1689 | static UBool TestMorph(){ | |
1690 | srand((unsigned)time( NULL )); | |
1691 | ||
1692 | Alternation * alt = new Alternation(); | |
1693 | ||
1694 | (*alt) | |
1695 | .append(new Literal("a")).append(new Literal("b")).append(new Literal("c")) | |
1696 | .append(new Literal("d")).append(new Literal("e")).append(new Literal("f")) | |
1697 | .append(new Literal("g")).append(new Literal("h")).append(new Literal("i")) | |
1698 | .append(new Literal("j")).append(new Literal("k")).append(new Literal("l")) | |
1699 | .append(new Literal("m")).append(new Literal("n")).append(new Literal("o")) | |
1700 | ; | |
1701 | ||
1702 | Repeat * rep = new Repeat( alt ,5,5 ); | |
1703 | Morph m( *rep); | |
1704 | ||
1705 | // DUMP_R(TestMorph,(*rep),20); | |
1706 | DUMP_R(TestMorph,m,100); | |
1707 | ||
1708 | return FALSE; | |
1709 | } | |
1710 | ||
1711 | #endif | |
1712 | ||
1713 | static UBool TestLanguageGenerator(){ | |
1714 | //LanguageGenerator g; | |
1715 | //const char *const s = "$s = p 0% | q 1%;"; | |
1716 | //g.parseBNF(s, "$s"); | |
1717 | UBool pass; | |
1718 | //= strcmp("q", g.next()) == 0; | |
1719 | ||
1720 | const char *const def = | |
1721 | //"$a = $b;" | |
1722 | //"$b = $c;" | |
1723 | //"$c = $t;" | |
1724 | //"$t = abc $z{1,2};" | |
1725 | //"$k = a | b | c | d | e | f | g ;" | |
1726 | //"$z = q 0% | p 1% | r 1%;" | |
1727 | "$x = a ? 0%;" | |
1728 | ; // end of string | |
1729 | // const char * s = "abczz"; | |
1730 | // | |
1731 | // | |
1732 | LanguageGenerator g; | |
1733 | pass = g.parseBNF(def, "$x",TRUE); | |
1734 | //// LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode()); | |
1735 | // | |
1736 | if (pass != LanguageGenerator::OK) return FALSE; | |
1737 | ||
1738 | DUMP_R(TestLanguageGenerator, g, 20); | |
1739 | return pass; | |
1740 | ||
1741 | ////UBool pass = strcmp(s,r) == 0; | |
1742 | ||
1743 | //if (pass){ | |
1744 | // printf("TestRandomLanguageGenerator passed.\n"); | |
1745 | //} else { | |
1746 | // printf("TestRandomLanguageGenerator FAILED!!!\n"); | |
1747 | //} | |
1748 | //return pass; | |
1749 | } | |
1750 | ||
1751 | void TestWbnf(void){ | |
1752 | srand((unsigned)time( NULL )); | |
1753 | ||
1754 | //CALL(TestLiteral); | |
1755 | //CALL(TestSequence); | |
1756 | //CALL(TestSymbolTable); | |
1757 | //CALL(TestVariable); | |
1758 | ||
1759 | //TestRepeat(); | |
1760 | //TestAlternation(); | |
1761 | //TestMorph(); | |
1762 | ||
1763 | //TestQuote(); | |
1764 | //TestBuffer(); | |
1765 | //TestWeightedRand(); | |
1766 | ||
1767 | //CALL(TestScanner); | |
1768 | //CALL(TestParser); | |
1769 | CALL(TestLanguageGenerator); | |
1770 | } | |
1771 |