]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genrb/parse.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / tools / genrb / parse.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1998-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File parse.cpp
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 05/26/99 stephen Creation.
17 * 02/25/00 weiv Overhaul to write udata
18 * 5/10/01 Ram removed ustdio dependency
19 * 06/10/2001 Dominic Ludlam <dom@recoil.org> Rewritten
20 *******************************************************************************
21 */
22
23 // Safer use of UnicodeString.
24 #ifndef UNISTR_FROM_CHAR_EXPLICIT
25 # define UNISTR_FROM_CHAR_EXPLICIT explicit
26 #endif
27
28 // Less important, but still a good idea.
29 #ifndef UNISTR_FROM_STRING_EXPLICIT
30 # define UNISTR_FROM_STRING_EXPLICIT explicit
31 #endif
32
33 #include <assert.h>
34 #include "parse.h"
35 #include "errmsg.h"
36 #include "uhash.h"
37 #include "cmemory.h"
38 #include "cstring.h"
39 #include "uinvchar.h"
40 #include "read.h"
41 #include "ustr.h"
42 #include "reslist.h"
43 #include "rbt_pars.h"
44 #include "genrb.h"
45 #include "unicode/stringpiece.h"
46 #include "unicode/unistr.h"
47 #include "unicode/ustring.h"
48 #include "unicode/uscript.h"
49 #include "unicode/utf16.h"
50 #include "unicode/putil.h"
51 #include "charstr.h"
52 #include "collationbuilder.h"
53 #include "collationdata.h"
54 #include "collationdatareader.h"
55 #include "collationdatawriter.h"
56 #include "collationfastlatinbuilder.h"
57 #include "collationinfo.h"
58 #include "collationroot.h"
59 #include "collationruleparser.h"
60 #include "collationtailoring.h"
61 #include <stdio.h>
62
63 /* Number of tokens to read ahead of the current stream position */
64 #define MAX_LOOKAHEAD 3
65
66 #define CR 0x000D
67 #define LF 0x000A
68 #define SPACE 0x0020
69 #define TAB 0x0009
70 #define ESCAPE 0x005C
71 #define HASH 0x0023
72 #define QUOTE 0x0027
73 #define ZERO 0x0030
74 #define STARTCOMMAND 0x005B
75 #define ENDCOMMAND 0x005D
76 #define OPENSQBRACKET 0x005B
77 #define CLOSESQBRACKET 0x005D
78
79 using icu::CharString;
80 using icu::LocalMemory;
81 using icu::LocalPointer;
82 using icu::LocalUCHARBUFPointer;
83 using icu::StringPiece;
84 using icu::UnicodeString;
85
86 struct Lookahead
87 {
88 enum ETokenType type;
89 struct UString value;
90 struct UString comment;
91 uint32_t line;
92 };
93
94 /* keep in sync with token defines in read.h */
95 const char *tokenNames[TOK_TOKEN_COUNT] =
96 {
97 "string", /* A string token, such as "MonthNames" */
98 "'{'", /* An opening brace character */
99 "'}'", /* A closing brace character */
100 "','", /* A comma */
101 "':'", /* A colon */
102
103 "<end of file>", /* End of the file has been reached successfully */
104 "<end of line>"
105 };
106
107 /* Just to store "TRUE" */
108 //static const UChar trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000};
109
110 typedef struct {
111 struct Lookahead lookahead[MAX_LOOKAHEAD + 1];
112 uint32_t lookaheadPosition;
113 UCHARBUF *buffer;
114 struct SRBRoot *bundle;
115 const char *inputdir;
116 uint32_t inputdirLength;
117 const char *outputdir;
118 uint32_t outputdirLength;
119 const char *filename;
120 UBool makeBinaryCollation;
121 UBool omitCollationRules;
122 } ParseState;
123
124 typedef struct SResource *
125 ParseResourceFunction(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status);
126
127 static struct SResource *parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status);
128
/* The nature of the lookahead buffer:
   There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer. This provides
   MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value.
   When getToken is called, it returns the token in the current slot, moves the
   current position to the next slot, and refills the slot that was handed out
   by the previous call with the next token from the reader (getNextToken).
   The token values are stored in the slots, which means that a token value does
   not survive the next call to getToken, i.e.

       UString *value;

       getToken(state, &value, NULL, NULL, status);
       getToken(state, NULL, NULL, NULL, status);   bad - value is now a different string
*/
142 static void
143 initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status)
144 {
145 static uint32_t initTypeStrings = 0;
146 uint32_t i;
147
148 if (!initTypeStrings)
149 {
150 initTypeStrings = 1;
151 }
152
153 state->lookaheadPosition = 0;
154 state->buffer = buf;
155
156 resetLineNumber();
157
158 for (i = 0; i < MAX_LOOKAHEAD; i++)
159 {
160 state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
161 if (U_FAILURE(*status))
162 {
163 return;
164 }
165 }
166
167 *status = U_ZERO_ERROR;
168 }
169
170 static void
171 cleanupLookahead(ParseState* state)
172 {
173 uint32_t i;
174 for (i = 0; i <= MAX_LOOKAHEAD; i++)
175 {
176 ustr_deinit(&state->lookahead[i].value);
177 ustr_deinit(&state->lookahead[i].comment);
178 }
179
180 }
181
182 static enum ETokenType
183 getToken(ParseState* state, struct UString **tokenValue, struct UString* comment, uint32_t *linenumber, UErrorCode *status)
184 {
185 enum ETokenType result;
186 uint32_t i;
187
188 result = state->lookahead[state->lookaheadPosition].type;
189
190 if (tokenValue != NULL)
191 {
192 *tokenValue = &state->lookahead[state->lookaheadPosition].value;
193 }
194
195 if (linenumber != NULL)
196 {
197 *linenumber = state->lookahead[state->lookaheadPosition].line;
198 }
199
200 if (comment != NULL)
201 {
202 ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
203 }
204
205 i = (state->lookaheadPosition + MAX_LOOKAHEAD) % (MAX_LOOKAHEAD + 1);
206 state->lookaheadPosition = (state->lookaheadPosition + 1) % (MAX_LOOKAHEAD + 1);
207 ustr_setlen(&state->lookahead[i].comment, 0, status);
208 ustr_setlen(&state->lookahead[i].value, 0, status);
209 state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
210
211 /* printf("getToken, returning %s\n", tokenNames[result]); */
212
213 return result;
214 }
215
216 static enum ETokenType
217 peekToken(ParseState* state, uint32_t lookaheadCount, struct UString **tokenValue, uint32_t *linenumber, struct UString *comment, UErrorCode *status)
218 {
219 uint32_t i = (state->lookaheadPosition + lookaheadCount) % (MAX_LOOKAHEAD + 1);
220
221 if (U_FAILURE(*status))
222 {
223 return TOK_ERROR;
224 }
225
226 if (lookaheadCount >= MAX_LOOKAHEAD)
227 {
228 *status = U_INTERNAL_PROGRAM_ERROR;
229 return TOK_ERROR;
230 }
231
232 if (tokenValue != NULL)
233 {
234 *tokenValue = &state->lookahead[i].value;
235 }
236
237 if (linenumber != NULL)
238 {
239 *linenumber = state->lookahead[i].line;
240 }
241
242 if(comment != NULL){
243 ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
244 }
245
246 return state->lookahead[i].type;
247 }
248
249 static void
250 expect(ParseState* state, enum ETokenType expectedToken, struct UString **tokenValue, struct UString *comment, uint32_t *linenumber, UErrorCode *status)
251 {
252 uint32_t line;
253
254 enum ETokenType token = getToken(state, tokenValue, comment, &line, status);
255
256 if (linenumber != NULL)
257 {
258 *linenumber = line;
259 }
260
261 if (U_FAILURE(*status))
262 {
263 return;
264 }
265
266 if (token != expectedToken)
267 {
268 *status = U_INVALID_FORMAT_ERROR;
269 error(line, "expecting %s, got %s", tokenNames[expectedToken], tokenNames[token]);
270 }
271 else
272 {
273 *status = U_ZERO_ERROR;
274 }
275 }
276
277 static char *getInvariantString(ParseState* state, uint32_t *line, struct UString *comment,
278 int32_t &stringLength, UErrorCode *status)
279 {
280 struct UString *tokenValue;
281 char *result;
282
283 expect(state, TOK_STRING, &tokenValue, comment, line, status);
284
285 if (U_FAILURE(*status))
286 {
287 return NULL;
288 }
289
290 if(!uprv_isInvariantUString(tokenValue->fChars, tokenValue->fLength)) {
291 *status = U_INVALID_FORMAT_ERROR;
292 error(*line, "invariant characters required for table keys, binary data, etc.");
293 return NULL;
294 }
295
296 result = static_cast<char *>(uprv_malloc(tokenValue->fLength+1));
297
298 if (result == NULL)
299 {
300 *status = U_MEMORY_ALLOCATION_ERROR;
301 return NULL;
302 }
303
304 u_UCharsToChars(tokenValue->fChars, result, tokenValue->fLength+1);
305 stringLength = tokenValue->fLength;
306 return result;
307 }
308
309 static struct SResource *
310 parseUCARules(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
311 {
312 struct SResource *result = NULL;
313 struct UString *tokenValue;
314 FileStream *file = NULL;
315 char filename[256] = { '\0' };
316 char cs[128] = { '\0' };
317 uint32_t line;
318 UBool quoted = FALSE;
319 UCHARBUF *ucbuf=NULL;
320 UChar32 c = 0;
321 const char* cp = NULL;
322 UChar *pTarget = NULL;
323 UChar *target = NULL;
324 UChar *targetLimit = NULL;
325 int32_t size = 0;
326
327 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
328
329 if(isVerbose()){
330 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
331 }
332
333 if (U_FAILURE(*status))
334 {
335 return NULL;
336 }
337 /* make the filename including the directory */
338 if (state->inputdir != NULL)
339 {
340 uprv_strcat(filename, state->inputdir);
341
342 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
343 {
344 uprv_strcat(filename, U_FILE_SEP_STRING);
345 }
346 }
347
348 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
349
350 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
351
352 if (U_FAILURE(*status))
353 {
354 return NULL;
355 }
356 uprv_strcat(filename, cs);
357
358 if(state->omitCollationRules) {
359 return res_none();
360 }
361
362 ucbuf = ucbuf_open(filename, &cp, getShowWarning(),FALSE, status);
363
364 if (U_FAILURE(*status)) {
365 error(line, "An error occurred while opening the input file %s\n", filename);
366 return NULL;
367 }
368
369 /* We allocate more space than actually required
370 * since the actual size needed for storing UChars
371 * is not known in UTF-8 byte stream
372 */
373 size = ucbuf_size(ucbuf) + 1;
374 pTarget = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * size);
375 uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
376 target = pTarget;
377 targetLimit = pTarget+size;
378
379 /* read the rules into the buffer */
380 while (target < targetLimit)
381 {
382 c = ucbuf_getc(ucbuf, status);
383 if(c == QUOTE) {
384 quoted = (UBool)!quoted;
385 }
386 /* weiv (06/26/2002): adding the following:
387 * - preserving spaces in commands [...]
388 * - # comments until the end of line
389 */
390 if (c == STARTCOMMAND && !quoted)
391 {
392 /* preserve commands
393 * closing bracket will be handled by the
394 * append at the end of the loop
395 */
396 while(c != ENDCOMMAND) {
397 U_APPEND_CHAR32_ONLY(c, target);
398 c = ucbuf_getc(ucbuf, status);
399 }
400 }
401 else if (c == HASH && !quoted) {
402 /* skip comments */
403 while(c != CR && c != LF) {
404 c = ucbuf_getc(ucbuf, status);
405 }
406 continue;
407 }
408 else if (c == ESCAPE)
409 {
410 c = unescape(ucbuf, status);
411
412 if (c == (UChar32)U_ERR)
413 {
414 uprv_free(pTarget);
415 T_FileStream_close(file);
416 return NULL;
417 }
418 }
419 else if (!quoted && (c == SPACE || c == TAB || c == CR || c == LF))
420 {
421 /* ignore spaces carriage returns
422 * and line feed unless in the form \uXXXX
423 */
424 continue;
425 }
426
427 /* Append UChar * after dissembling if c > 0xffff*/
428 if (c != (UChar32)U_EOF)
429 {
430 U_APPEND_CHAR32_ONLY(c, target);
431 }
432 else
433 {
434 break;
435 }
436 }
437
438 /* terminate the string */
439 if(target < targetLimit){
440 *target = 0x0000;
441 }
442
443 result = string_open(state->bundle, tag, pTarget, (int32_t)(target - pTarget), NULL, status);
444
445
446 ucbuf_close(ucbuf);
447 uprv_free(pTarget);
448 T_FileStream_close(file);
449
450 return result;
451 }
452
453 static struct SResource *
454 parseTransliterator(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
455 {
456 struct SResource *result = NULL;
457 struct UString *tokenValue;
458 FileStream *file = NULL;
459 char filename[256] = { '\0' };
460 char cs[128] = { '\0' };
461 uint32_t line;
462 UCHARBUF *ucbuf=NULL;
463 const char* cp = NULL;
464 UChar *pTarget = NULL;
465 const UChar *pSource = NULL;
466 int32_t size = 0;
467
468 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
469
470 if(isVerbose()){
471 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
472 }
473
474 if (U_FAILURE(*status))
475 {
476 return NULL;
477 }
478 /* make the filename including the directory */
479 if (state->inputdir != NULL)
480 {
481 uprv_strcat(filename, state->inputdir);
482
483 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
484 {
485 uprv_strcat(filename, U_FILE_SEP_STRING);
486 }
487 }
488
489 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
490
491 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
492
493 if (U_FAILURE(*status))
494 {
495 return NULL;
496 }
497 uprv_strcat(filename, cs);
498
499
500 ucbuf = ucbuf_open(filename, &cp, getShowWarning(),FALSE, status);
501
502 if (U_FAILURE(*status)) {
503 error(line, "An error occurred while opening the input file %s\n", filename);
504 return NULL;
505 }
506
507 /* We allocate more space than actually required
508 * since the actual size needed for storing UChars
509 * is not known in UTF-8 byte stream
510 */
511 pSource = ucbuf_getBuffer(ucbuf, &size, status);
512 pTarget = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (size + 1));
513 uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
514
515 #if !UCONFIG_NO_TRANSLITERATION
516 size = utrans_stripRules(pSource, size, pTarget, status);
517 #else
518 size = 0;
519 fprintf(stderr, " Warning: writing empty transliteration data ( UCONFIG_NO_TRANSLITERATION ) \n");
520 #endif
521 result = string_open(state->bundle, tag, pTarget, size, NULL, status);
522
523 ucbuf_close(ucbuf);
524 uprv_free(pTarget);
525 T_FileStream_close(file);
526
527 return result;
528 }
529 static ArrayResource* dependencyArray = NULL;
530
531 static struct SResource *
532 parseDependency(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
533 {
534 struct SResource *result = NULL;
535 struct SResource *elem = NULL;
536 struct UString *tokenValue;
537 uint32_t line;
538 char filename[256] = { '\0' };
539 char cs[128] = { '\0' };
540
541 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
542
543 if(isVerbose()){
544 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
545 }
546
547 if (U_FAILURE(*status))
548 {
549 return NULL;
550 }
551 /* make the filename including the directory */
552 if (state->outputdir != NULL)
553 {
554 uprv_strcat(filename, state->outputdir);
555
556 if (state->outputdir[state->outputdirLength - 1] != U_FILE_SEP_CHAR)
557 {
558 uprv_strcat(filename, U_FILE_SEP_STRING);
559 }
560 }
561
562 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
563
564 if (U_FAILURE(*status))
565 {
566 return NULL;
567 }
568 uprv_strcat(filename, cs);
569 if(!T_FileStream_file_exists(filename)){
570 if(isStrict()){
571 error(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
572 }else{
573 warning(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
574 }
575 }
576 if(dependencyArray==NULL){
577 dependencyArray = array_open(state->bundle, "%%DEPENDENCY", NULL, status);
578 }
579 if(tag!=NULL){
580 result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
581 }
582 elem = string_open(state->bundle, NULL, tokenValue->fChars, tokenValue->fLength, comment, status);
583
584 dependencyArray->add(elem);
585
586 if (U_FAILURE(*status))
587 {
588 return NULL;
589 }
590 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
591 return result;
592 }
593 static struct SResource *
594 parseString(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
595 {
596 struct UString *tokenValue;
597 struct SResource *result = NULL;
598
599 /* if (tag != NULL && uprv_strcmp(tag, "%%UCARULES") == 0)
600 {
601 return parseUCARules(tag, startline, status);
602 }*/
603 if(isVerbose()){
604 printf(" string %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
605 }
606 expect(state, TOK_STRING, &tokenValue, NULL, NULL, status);
607
608 if (U_SUCCESS(*status))
609 {
610 /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
611 doesn't survive expect either) */
612
613 result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
614 if(U_SUCCESS(*status) && result) {
615 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
616
617 if (U_FAILURE(*status))
618 {
619 res_close(result);
620 return NULL;
621 }
622 }
623 }
624
625 return result;
626 }
627
628 static struct SResource *
629 parseAlias(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
630 {
631 struct UString *tokenValue;
632 struct SResource *result = NULL;
633
634 expect(state, TOK_STRING, &tokenValue, NULL, NULL, status);
635
636 if(isVerbose()){
637 printf(" alias %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
638 }
639
640 if (U_SUCCESS(*status))
641 {
642 /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
643 doesn't survive expect either) */
644
645 result = alias_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
646
647 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
648
649 if (U_FAILURE(*status))
650 {
651 res_close(result);
652 return NULL;
653 }
654 }
655
656 return result;
657 }
658
659 #if !UCONFIG_NO_COLLATION
660
661 namespace {
662
663 static struct SResource* resLookup(struct SResource* res, const char* key){
664 if (res == res_none() || !res->isTable()) {
665 return NULL;
666 }
667
668 TableResource *list = static_cast<TableResource *>(res);
669 SResource *current = list->fFirst;
670 while (current != NULL) {
671 if (uprv_strcmp(((list->fRoot->fKeys) + (current->fKey)), key) == 0) {
672 return current;
673 }
674 current = current->fNext;
675 }
676 return NULL;
677 }
678
679 class GenrbImporter : public icu::CollationRuleParser::Importer {
680 public:
681 GenrbImporter(const char *in, const char *out) : inputDir(in), outputDir(out) {}
682 virtual ~GenrbImporter();
683 virtual void getRules(
684 const char *localeID, const char *collationType,
685 UnicodeString &rules,
686 const char *&errorReason, UErrorCode &errorCode);
687
688 private:
689 const char *inputDir;
690 const char *outputDir;
691 };
692
693 GenrbImporter::~GenrbImporter() {}
694
695 void
696 GenrbImporter::getRules(
697 const char *localeID, const char *collationType,
698 UnicodeString &rules,
699 const char *& /*errorReason*/, UErrorCode &errorCode) {
700 CharString filename(localeID, errorCode);
701 for(int32_t i = 0; i < filename.length(); i++){
702 if(filename[i] == '-'){
703 filename.data()[i] = '_';
704 }
705 }
706 filename.append(".txt", errorCode);
707 if (U_FAILURE(errorCode)) {
708 return;
709 }
710 CharString inputDirBuf;
711 CharString openFileName;
712 if(inputDir == NULL) {
713 const char *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR);
714 if (filenameBegin != NULL) {
715 /*
716 * When a filename ../../../data/root.txt is specified,
717 * we presume that the input directory is ../../../data
718 * This is very important when the resource file includes
719 * another file, like UCARules.txt or thaidict.brk.
720 */
721 StringPiece dir = filename.toStringPiece();
722 const char *filenameLimit = filename.data() + filename.length();
723 dir.remove_suffix((int32_t)(filenameLimit - filenameBegin));
724 inputDirBuf.append(dir, errorCode);
725 inputDir = inputDirBuf.data();
726 }
727 }else{
728 int32_t dirlen = (int32_t)uprv_strlen(inputDir);
729
730 if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) {
731 /*
732 * append the input dir to openFileName if the first char in
733 * filename is not file separator char and the last char input directory is not '.'.
734 * This is to support :
735 * genrb -s. /home/icu/data
736 * genrb -s. icu/data
737 * The user cannot mix notations like
738 * genrb -s. /icu/data --- the absolute path specified. -s redundant
739 * user should use
740 * genrb -s. icu/data --- start from CWD and look in icu/data dir
741 */
742 openFileName.append(inputDir, dirlen, errorCode);
743 if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) {
744 openFileName.append(U_FILE_SEP_CHAR, errorCode);
745 }
746 }
747 }
748 openFileName.append(filename, errorCode);
749 if(U_FAILURE(errorCode)) {
750 return;
751 }
752 // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data());
753 const char* cp = "";
754 LocalUCHARBUFPointer ucbuf(
755 ucbuf_open(openFileName.data(), &cp, getShowWarning(), TRUE, &errorCode));
756 if(errorCode == U_FILE_ACCESS_ERROR) {
757 fprintf(stderr, "couldn't open file %s\n", openFileName.data());
758 return;
759 }
760 if (ucbuf.isNull() || U_FAILURE(errorCode)) {
761 fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode));
762 return;
763 }
764
765 /* Parse the data into an SRBRoot */
766 LocalPointer<SRBRoot> data(
767 parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, &errorCode));
768 if (U_FAILURE(errorCode)) {
769 return;
770 }
771
772 struct SResource *root = data->fRoot;
773 struct SResource *collations = resLookup(root, "collations");
774 if (collations != NULL) {
775 struct SResource *collation = resLookup(collations, collationType);
776 if (collation != NULL) {
777 struct SResource *sequence = resLookup(collation, "Sequence");
778 if (sequence != NULL && sequence->isString()) {
779 // No string pointer aliasing so that we need not hold onto the resource bundle.
780 StringResource *sr = static_cast<StringResource *>(sequence);
781 rules = sr->fString;
782 }
783 }
784 }
785 }
786
787 // Quick-and-dirty escaping function.
788 // Assumes that we are on an ASCII-based platform.
789 static void
790 escape(const UChar *s, char *buffer) {
791 int32_t length = u_strlen(s);
792 int32_t i = 0;
793 for (;;) {
794 UChar32 c;
795 U16_NEXT(s, i, length, c);
796 if (c == 0) {
797 *buffer = 0;
798 return;
799 } else if (0x20 <= c && c <= 0x7e) {
800 // printable ASCII
801 *buffer++ = (char)c; // assumes ASCII-based platform
802 } else {
803 buffer += sprintf(buffer, "\\u%04X", (int)c);
804 }
805 }
806 }
807
808 } // namespace
809
810 #endif // !UCONFIG_NO_COLLATION
811
812 static TableResource *
813 addCollation(ParseState* state, TableResource *result, const char *collationType,
814 uint32_t startline, UErrorCode *status)
815 {
816 // TODO: Use LocalPointer for result, or make caller close it when there is a failure.
817 struct SResource *member = NULL;
818 struct UString *tokenValue;
819 struct UString comment;
820 enum ETokenType token;
821 char subtag[1024];
822 UnicodeString rules;
823 UBool haveRules = FALSE;
824 UVersionInfo version;
825 uint32_t line;
826
827 /* '{' . (name resource)* '}' */
828 version[0]=0; version[1]=0; version[2]=0; version[3]=0;
829
830 for (;;)
831 {
832 ustr_init(&comment);
833 token = getToken(state, &tokenValue, &comment, &line, status);
834
835 if (token == TOK_CLOSE_BRACE)
836 {
837 break;
838 }
839
840 if (token != TOK_STRING)
841 {
842 res_close(result);
843 *status = U_INVALID_FORMAT_ERROR;
844
845 if (token == TOK_EOF)
846 {
847 error(startline, "unterminated table");
848 }
849 else
850 {
851 error(line, "Unexpected token %s", tokenNames[token]);
852 }
853
854 return NULL;
855 }
856
857 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
858
859 if (U_FAILURE(*status))
860 {
861 res_close(result);
862 return NULL;
863 }
864
865 member = parseResource(state, subtag, NULL, status);
866
867 if (U_FAILURE(*status))
868 {
869 res_close(result);
870 return NULL;
871 }
872 if (result == NULL)
873 {
874 // Ignore the parsed resources, continue parsing.
875 }
876 else if (uprv_strcmp(subtag, "Version") == 0 && member->isString())
877 {
878 StringResource *sr = static_cast<StringResource *>(member);
879 char ver[40];
880 int32_t length = sr->length();
881
882 if (length >= UPRV_LENGTHOF(ver))
883 {
884 length = UPRV_LENGTHOF(ver) - 1;
885 }
886
887 sr->fString.extract(0, length, ver, UPRV_LENGTHOF(ver), US_INV);
888 u_versionFromString(version, ver);
889
890 result->add(member, line, *status);
891 member = NULL;
892 }
893 else if(uprv_strcmp(subtag, "%%CollationBin")==0)
894 {
895 /* discard duplicate %%CollationBin if any*/
896 }
897 else if (uprv_strcmp(subtag, "Sequence") == 0 && member->isString())
898 {
899 StringResource *sr = static_cast<StringResource *>(member);
900 rules = sr->fString;
901 haveRules = TRUE;
902 // Defer building the collator until we have seen
903 // all sub-elements of the collation table, including the Version.
904 /* in order to achieve smaller data files, we can direct genrb */
905 /* to omit collation rules */
906 if(!state->omitCollationRules) {
907 result->add(member, line, *status);
908 member = NULL;
909 }
910 }
911 else // Just copy non-special items.
912 {
913 result->add(member, line, *status);
914 member = NULL;
915 }
916 res_close(member); // TODO: use LocalPointer
917 if (U_FAILURE(*status))
918 {
919 res_close(result);
920 return NULL;
921 }
922 }
923
924 if (!haveRules) { return result; }
925
926 #if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO
927 warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h");
928 (void)collationType;
929 #else
930 // CLDR ticket #3949, ICU ticket #8082:
931 // Do not build collation binary data for for-import-only "private" collation rule strings.
932 if (uprv_strncmp(collationType, "private-", 8) == 0) {
933 if(isVerbose()) {
934 printf("Not building %s~%s collation binary\n", state->filename, collationType);
935 }
936 return result;
937 }
938
939 if(!state->makeBinaryCollation) {
940 if(isVerbose()) {
941 printf("Not building %s~%s collation binary\n", state->filename, collationType);
942 }
943 return result;
944 }
945 UErrorCode intStatus = U_ZERO_ERROR;
946 UParseError parseError;
947 uprv_memset(&parseError, 0, sizeof(parseError));
948 GenrbImporter importer(state->inputdir, state->outputdir);
949 const icu::CollationTailoring *base = icu::CollationRoot::getRoot(intStatus);
950 if(U_FAILURE(intStatus)) {
951 error(line, "failed to load root collator (ucadata.icu) - %s", u_errorName(intStatus));
952 res_close(result);
953 return NULL; // TODO: use LocalUResourceBundlePointer for result
954 }
955 icu::CollationBuilder builder(base, intStatus);
956 if(uprv_strncmp(collationType, "search", 6) == 0) {
957 builder.disableFastLatin(); // build fast-Latin table unless search collator
958 }
959 LocalPointer<icu::CollationTailoring> t(
960 builder.parseAndBuild(rules, version, &importer, &parseError, intStatus));
961 if(U_FAILURE(intStatus)) {
962 const char *reason = builder.getErrorReason();
963 if(reason == NULL) { reason = ""; }
964 error(line, "CollationBuilder failed at %s~%s/Sequence rule offset %ld: %s %s",
965 state->filename, collationType,
966 (long)parseError.offset, u_errorName(intStatus), reason);
967 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
968 // Print pre- and post-context.
969 char preBuffer[100], postBuffer[100];
970 escape(parseError.preContext, preBuffer);
971 escape(parseError.postContext, postBuffer);
972 error(line, " error context: \"...%s\" ! \"%s...\"", preBuffer, postBuffer);
973 }
974 if(isStrict() || t.isNull()) {
975 *status = intStatus;
976 res_close(result);
977 return NULL;
978 }
979 }
980 icu::LocalMemory<uint8_t> buffer;
981 int32_t capacity = 100000;
982 uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
983 if(dest == NULL) {
984 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
985 (long)capacity);
986 *status = U_MEMORY_ALLOCATION_ERROR;
987 res_close(result);
988 return NULL;
989 }
990 int32_t indexes[icu::CollationDataReader::IX_TOTAL_SIZE + 1];
991 int32_t totalSize = icu::CollationDataWriter::writeTailoring(
992 *t, *t->settings, indexes, dest, capacity, intStatus);
993 if(intStatus == U_BUFFER_OVERFLOW_ERROR) {
994 intStatus = U_ZERO_ERROR;
995 capacity = totalSize;
996 dest = buffer.allocateInsteadAndCopy(capacity);
997 if(dest == NULL) {
998 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
999 (long)capacity);
1000 *status = U_MEMORY_ALLOCATION_ERROR;
1001 res_close(result);
1002 return NULL;
1003 }
1004 totalSize = icu::CollationDataWriter::writeTailoring(
1005 *t, *t->settings, indexes, dest, capacity, intStatus);
1006 }
1007 if(U_FAILURE(intStatus)) {
1008 fprintf(stderr, "CollationDataWriter::writeTailoring() failed: %s\n",
1009 u_errorName(intStatus));
1010 res_close(result);
1011 return NULL;
1012 }
1013 if(isVerbose()) {
1014 printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
1015 icu::CollationInfo::printSizes(totalSize, indexes);
1016 if(t->settings->hasReordering()) {
1017 printf("%s~%s collation reordering ranges:\n", state->filename, collationType);
1018 icu::CollationInfo::printReorderRanges(
1019 *t->data, t->settings->reorderCodes, t->settings->reorderCodesLength);
1020 }
1021 #if 0 // debugging output
1022 } else {
1023 printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
1024 icu::CollationInfo::printSizes(totalSize, indexes);
1025 #endif
1026 }
1027 struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, NULL, NULL, status);
1028 result->add(collationBin, line, *status);
1029 if (U_FAILURE(*status)) {
1030 res_close(result);
1031 return NULL;
1032 }
1033 #endif
1034 return result;
1035 }
1036
1037 static UBool
1038 keepCollationType(const char * /*type*/) {
1039 return TRUE;
1040 }
1041
1042 static struct SResource *
1043 parseCollationElements(ParseState* state, char *tag, uint32_t startline, UBool newCollation, UErrorCode *status)
1044 {
1045 TableResource *result = NULL;
1046 struct SResource *member = NULL;
1047 struct UString *tokenValue;
1048 struct UString comment;
1049 enum ETokenType token;
1050 char subtag[1024], typeKeyword[1024];
1051 uint32_t line;
1052
1053 result = table_open(state->bundle, tag, NULL, status);
1054
1055 if (result == NULL || U_FAILURE(*status))
1056 {
1057 return NULL;
1058 }
1059 if(isVerbose()){
1060 printf(" collation elements %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1061 }
1062 if(!newCollation) {
1063 return addCollation(state, result, "(no type)", startline, status);
1064 }
1065 else {
1066 for(;;) {
1067 ustr_init(&comment);
1068 token = getToken(state, &tokenValue, &comment, &line, status);
1069
1070 if (token == TOK_CLOSE_BRACE)
1071 {
1072 return result;
1073 }
1074
1075 if (token != TOK_STRING)
1076 {
1077 res_close(result);
1078 *status = U_INVALID_FORMAT_ERROR;
1079
1080 if (token == TOK_EOF)
1081 {
1082 error(startline, "unterminated table");
1083 }
1084 else
1085 {
1086 error(line, "Unexpected token %s", tokenNames[token]);
1087 }
1088
1089 return NULL;
1090 }
1091
1092 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1093
1094 if (U_FAILURE(*status))
1095 {
1096 res_close(result);
1097 return NULL;
1098 }
1099
1100 if (uprv_strcmp(subtag, "default") == 0)
1101 {
1102 member = parseResource(state, subtag, NULL, status);
1103
1104 if (U_FAILURE(*status))
1105 {
1106 res_close(result);
1107 return NULL;
1108 }
1109
1110 result->add(member, line, *status);
1111 }
1112 else
1113 {
1114 token = peekToken(state, 0, &tokenValue, &line, &comment, status);
1115 /* this probably needs to be refactored or recursively use the parser */
1116 /* first we assume that our collation table won't have the explicit type */
1117 /* then, we cannot handle aliases */
1118 if(token == TOK_OPEN_BRACE) {
1119 token = getToken(state, &tokenValue, &comment, &line, status);
1120 TableResource *collationRes;
1121 if (keepCollationType(subtag)) {
1122 collationRes = table_open(state->bundle, subtag, NULL, status);
1123 } else {
1124 collationRes = NULL;
1125 }
1126 // need to parse the collation data regardless
1127 collationRes = addCollation(state, collationRes, subtag, startline, status);
1128 if (collationRes != NULL) {
1129 result->add(collationRes, startline, *status);
1130 }
1131 } else if(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */
1132 /* we could have a table too */
1133 token = peekToken(state, 1, &tokenValue, &line, &comment, status);
1134 u_UCharsToChars(tokenValue->fChars, typeKeyword, u_strlen(tokenValue->fChars) + 1);
1135 if(uprv_strcmp(typeKeyword, "alias") == 0) {
1136 member = parseResource(state, subtag, NULL, status);
1137 if (U_FAILURE(*status))
1138 {
1139 res_close(result);
1140 return NULL;
1141 }
1142
1143 result->add(member, line, *status);
1144 } else {
1145 res_close(result);
1146 *status = U_INVALID_FORMAT_ERROR;
1147 return NULL;
1148 }
1149 } else {
1150 res_close(result);
1151 *status = U_INVALID_FORMAT_ERROR;
1152 return NULL;
1153 }
1154 }
1155
1156 /*member = string_open(bundle, subtag, tokenValue->fChars, tokenValue->fLength, status);*/
1157
1158 /*expect(TOK_CLOSE_BRACE, NULL, NULL, status);*/
1159
1160 if (U_FAILURE(*status))
1161 {
1162 res_close(result);
1163 return NULL;
1164 }
1165 }
1166 }
1167 }
1168
1169 /* Necessary, because CollationElements requires the bundle->fRoot member to be present which,
1170 if this weren't special-cased, wouldn't be set until the entire file had been processed. */
1171 static struct SResource *
1172 realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status)
1173 {
1174 struct SResource *member = NULL;
1175 struct UString *tokenValue=NULL;
1176 struct UString comment;
1177 enum ETokenType token;
1178 char subtag[1024];
1179 uint32_t line;
1180 UBool readToken = FALSE;
1181
1182 /* '{' . (name resource)* '}' */
1183
1184 if(isVerbose()){
1185 printf(" parsing table %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1186 }
1187 for (;;)
1188 {
1189 ustr_init(&comment);
1190 token = getToken(state, &tokenValue, &comment, &line, status);
1191
1192 if (token == TOK_CLOSE_BRACE)
1193 {
1194 if (!readToken) {
1195 warning(startline, "Encountered empty table");
1196 }
1197 return table;
1198 }
1199
1200 if (token != TOK_STRING)
1201 {
1202 *status = U_INVALID_FORMAT_ERROR;
1203
1204 if (token == TOK_EOF)
1205 {
1206 error(startline, "unterminated table");
1207 }
1208 else
1209 {
1210 error(line, "unexpected token %s", tokenNames[token]);
1211 }
1212
1213 return NULL;
1214 }
1215
1216 if(uprv_isInvariantUString(tokenValue->fChars, -1)) {
1217 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1218 } else {
1219 *status = U_INVALID_FORMAT_ERROR;
1220 error(line, "invariant characters required for table keys");
1221 return NULL;
1222 }
1223
1224 if (U_FAILURE(*status))
1225 {
1226 error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status));
1227 return NULL;
1228 }
1229
1230 member = parseResource(state, subtag, &comment, status);
1231
1232 if (member == NULL || U_FAILURE(*status))
1233 {
1234 error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status));
1235 return NULL;
1236 }
1237
1238 table->add(member, line, *status);
1239
1240 if (U_FAILURE(*status))
1241 {
1242 error(line, "parse error. Stopped parsing table with %s", u_errorName(*status));
1243 return NULL;
1244 }
1245 readToken = TRUE;
1246 ustr_deinit(&comment);
1247 }
1248
1249 /* not reached */
1250 /* A compiler warning will appear if all paths don't contain a return statement. */
1251 /* *status = U_INTERNAL_PROGRAM_ERROR;
1252 return NULL;*/
1253 }
1254
1255 static struct SResource *
1256 parseTable(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1257 {
1258 if (tag != NULL && uprv_strcmp(tag, "CollationElements") == 0)
1259 {
1260 return parseCollationElements(state, tag, startline, FALSE, status);
1261 }
1262 if (tag != NULL && uprv_strcmp(tag, "collations") == 0)
1263 {
1264 return parseCollationElements(state, tag, startline, TRUE, status);
1265 }
1266 if(isVerbose()){
1267 printf(" table %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1268 }
1269
1270 TableResource *result = table_open(state->bundle, tag, comment, status);
1271
1272 if (result == NULL || U_FAILURE(*status))
1273 {
1274 return NULL;
1275 }
1276 return realParseTable(state, result, tag, startline, status);
1277 }
1278
1279 static struct SResource *
1280 parseArray(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1281 {
1282 struct SResource *member = NULL;
1283 struct UString *tokenValue;
1284 struct UString memberComments;
1285 enum ETokenType token;
1286 UBool readToken = FALSE;
1287
1288 ArrayResource *result = array_open(state->bundle, tag, comment, status);
1289
1290 if (result == NULL || U_FAILURE(*status))
1291 {
1292 return NULL;
1293 }
1294 if(isVerbose()){
1295 printf(" array %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1296 }
1297
1298 ustr_init(&memberComments);
1299
1300 /* '{' . resource [','] '}' */
1301 for (;;)
1302 {
1303 /* reset length */
1304 ustr_setlen(&memberComments, 0, status);
1305
1306 /* check for end of array, but don't consume next token unless it really is the end */
1307 token = peekToken(state, 0, &tokenValue, NULL, &memberComments, status);
1308
1309
1310 if (token == TOK_CLOSE_BRACE)
1311 {
1312 getToken(state, NULL, NULL, NULL, status);
1313 if (!readToken) {
1314 warning(startline, "Encountered empty array");
1315 }
1316 break;
1317 }
1318
1319 if (token == TOK_EOF)
1320 {
1321 res_close(result);
1322 *status = U_INVALID_FORMAT_ERROR;
1323 error(startline, "unterminated array");
1324 return NULL;
1325 }
1326
1327 /* string arrays are a special case */
1328 if (token == TOK_STRING)
1329 {
1330 getToken(state, &tokenValue, &memberComments, NULL, status);
1331 member = string_open(state->bundle, NULL, tokenValue->fChars, tokenValue->fLength, &memberComments, status);
1332 }
1333 else
1334 {
1335 member = parseResource(state, NULL, &memberComments, status);
1336 }
1337
1338 if (member == NULL || U_FAILURE(*status))
1339 {
1340 res_close(result);
1341 return NULL;
1342 }
1343
1344 result->add(member);
1345
1346 /* eat optional comma if present */
1347 token = peekToken(state, 0, NULL, NULL, NULL, status);
1348
1349 if (token == TOK_COMMA)
1350 {
1351 getToken(state, NULL, NULL, NULL, status);
1352 }
1353
1354 if (U_FAILURE(*status))
1355 {
1356 res_close(result);
1357 return NULL;
1358 }
1359 readToken = TRUE;
1360 }
1361
1362 ustr_deinit(&memberComments);
1363 return result;
1364 }
1365
1366 static struct SResource *
1367 parseIntVector(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1368 {
1369 enum ETokenType token;
1370 char *string;
1371 int32_t value;
1372 UBool readToken = FALSE;
1373 char *stopstring;
1374 struct UString memberComments;
1375
1376 IntVectorResource *result = intvector_open(state->bundle, tag, comment, status);
1377
1378 if (result == NULL || U_FAILURE(*status))
1379 {
1380 return NULL;
1381 }
1382
1383 if(isVerbose()){
1384 printf(" vector %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1385 }
1386 ustr_init(&memberComments);
1387 /* '{' . string [','] '}' */
1388 for (;;)
1389 {
1390 ustr_setlen(&memberComments, 0, status);
1391
1392 /* check for end of array, but don't consume next token unless it really is the end */
1393 token = peekToken(state, 0, NULL, NULL,&memberComments, status);
1394
1395 if (token == TOK_CLOSE_BRACE)
1396 {
1397 /* it's the end, consume the close brace */
1398 getToken(state, NULL, NULL, NULL, status);
1399 if (!readToken) {
1400 warning(startline, "Encountered empty int vector");
1401 }
1402 ustr_deinit(&memberComments);
1403 return result;
1404 }
1405
1406 int32_t stringLength;
1407 string = getInvariantString(state, NULL, NULL, stringLength, status);
1408
1409 if (U_FAILURE(*status))
1410 {
1411 res_close(result);
1412 return NULL;
1413 }
1414
1415 /* For handling illegal char in the Intvector */
1416 value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/
1417 int32_t len = (int32_t)(stopstring-string);
1418
1419 if(len==stringLength)
1420 {
1421 result->add(value, *status);
1422 uprv_free(string);
1423 token = peekToken(state, 0, NULL, NULL, NULL, status);
1424 }
1425 else
1426 {
1427 uprv_free(string);
1428 *status=U_INVALID_CHAR_FOUND;
1429 }
1430
1431 if (U_FAILURE(*status))
1432 {
1433 res_close(result);
1434 return NULL;
1435 }
1436
1437 /* the comma is optional (even though it is required to prevent the reader from concatenating
1438 consecutive entries) so that a missing comma on the last entry isn't an error */
1439 if (token == TOK_COMMA)
1440 {
1441 getToken(state, NULL, NULL, NULL, status);
1442 }
1443 readToken = TRUE;
1444 }
1445
1446 /* not reached */
1447 /* A compiler warning will appear if all paths don't contain a return statement. */
1448 /* intvector_close(result, status);
1449 *status = U_INTERNAL_PROGRAM_ERROR;
1450 return NULL;*/
1451 }
1452
1453 static struct SResource *
1454 parseBinary(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1455 {
1456 uint32_t line;
1457 int32_t stringLength;
1458 LocalMemory<char> string(getInvariantString(state, &line, NULL, stringLength, status));
1459 if (string.isNull() || U_FAILURE(*status))
1460 {
1461 return NULL;
1462 }
1463
1464 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1465 if (U_FAILURE(*status))
1466 {
1467 return NULL;
1468 }
1469
1470 if(isVerbose()){
1471 printf(" binary %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1472 }
1473
1474 LocalMemory<uint8_t> value;
1475 int32_t count = 0;
1476 if (stringLength > 0 && value.allocateInsteadAndCopy(stringLength) == NULL)
1477 {
1478 *status = U_MEMORY_ALLOCATION_ERROR;
1479 return NULL;
1480 }
1481
1482 char toConv[3] = {'\0', '\0', '\0'};
1483 for (int32_t i = 0; i < stringLength;)
1484 {
1485 // Skip spaces (which may have been line endings).
1486 char c0 = string[i++];
1487 if (c0 == ' ') { continue; }
1488 if (i == stringLength) {
1489 *status=U_INVALID_CHAR_FOUND;
1490 error(line, "Encountered invalid binary value (odd number of hex digits)");
1491 return NULL;
1492 }
1493 toConv[0] = c0;
1494 toConv[1] = string[i++];
1495
1496 char *stopstring;
1497 value[count++] = (uint8_t) uprv_strtoul(toConv, &stopstring, 16);
1498 uint32_t len=(uint32_t)(stopstring-toConv);
1499
1500 if(len!=2)
1501 {
1502 *status=U_INVALID_CHAR_FOUND;
1503 error(line, "Encountered invalid binary value (not all pairs of hex digits)");
1504 return NULL;
1505 }
1506 }
1507
1508 if (count == 0) {
1509 warning(startline, "Encountered empty binary value");
1510 return bin_open(state->bundle, tag, 0, NULL, "", comment, status);
1511 } else {
1512 return bin_open(state->bundle, tag, count, value.getAlias(), NULL, comment, status);
1513 }
1514 }
1515
1516 static struct SResource *
1517 parseInteger(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1518 {
1519 struct SResource *result = NULL;
1520 int32_t value;
1521 char *string;
1522 char *stopstring;
1523
1524 int32_t stringLength;
1525 string = getInvariantString(state, NULL, NULL, stringLength, status);
1526
1527 if (string == NULL || U_FAILURE(*status))
1528 {
1529 return NULL;
1530 }
1531
1532 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1533
1534 if (U_FAILURE(*status))
1535 {
1536 uprv_free(string);
1537 return NULL;
1538 }
1539
1540 if(isVerbose()){
1541 printf(" integer %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1542 }
1543
1544 if (stringLength == 0)
1545 {
1546 warning(startline, "Encountered empty integer. Default value is 0.");
1547 }
1548
1549 /* Allow integer support for hexdecimal, octal digit and decimal*/
1550 /* and handle illegal char in the integer*/
1551 value = uprv_strtoul(string, &stopstring, 0);
1552 int32_t len = (int32_t)(stopstring-string);
1553 if(len==stringLength)
1554 {
1555 result = int_open(state->bundle, tag, value, comment, status);
1556 }
1557 else
1558 {
1559 *status=U_INVALID_CHAR_FOUND;
1560 }
1561 uprv_free(string);
1562
1563 return result;
1564 }
1565
1566 static struct SResource *
1567 parseImport(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
1568 {
1569 uint32_t line;
1570 int32_t stringLength;
1571 LocalMemory<char> filename(getInvariantString(state, &line, NULL, stringLength, status));
1572 if (U_FAILURE(*status))
1573 {
1574 return NULL;
1575 }
1576
1577 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1578
1579 if (U_FAILURE(*status))
1580 {
1581 return NULL;
1582 }
1583
1584 if(isVerbose()){
1585 printf(" import %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1586 }
1587
1588 /* Open the input file for reading */
1589 CharString fullname;
1590 if (state->inputdir != NULL) {
1591 fullname.append(state->inputdir, *status);
1592 }
1593 fullname.appendPathPart(filename.getAlias(), *status);
1594 if (U_FAILURE(*status)) {
1595 return NULL;
1596 }
1597
1598 FileStream *file = T_FileStream_open(fullname.data(), "rb");
1599 if (file == NULL)
1600 {
1601 error(line, "couldn't open input file %s", filename.getAlias());
1602 *status = U_FILE_ACCESS_ERROR;
1603 return NULL;
1604 }
1605
1606 int32_t len = T_FileStream_size(file);
1607 LocalMemory<uint8_t> data;
1608 if(data.allocateInsteadAndCopy(len) == NULL)
1609 {
1610 *status = U_MEMORY_ALLOCATION_ERROR;
1611 T_FileStream_close (file);
1612 return NULL;
1613 }
1614
1615 /* int32_t numRead = */ T_FileStream_read(file, data.getAlias(), len);
1616 T_FileStream_close (file);
1617
1618 return bin_open(state->bundle, tag, len, data.getAlias(), fullname.data(), comment, status);
1619 }
1620
1621 static struct SResource *
1622 parseInclude(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
1623 {
1624 struct SResource *result;
1625 int32_t len=0;
1626 char *filename;
1627 uint32_t line;
1628 UChar *pTarget = NULL;
1629
1630 UCHARBUF *ucbuf;
1631 char *fullname = NULL;
1632 const char* cp = NULL;
1633 const UChar* uBuffer = NULL;
1634
1635 int32_t stringLength;
1636 filename = getInvariantString(state, &line, NULL, stringLength, status);
1637
1638 if (U_FAILURE(*status))
1639 {
1640 return NULL;
1641 }
1642
1643 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1644
1645 if (U_FAILURE(*status))
1646 {
1647 uprv_free(filename);
1648 return NULL;
1649 }
1650
1651 if(isVerbose()){
1652 printf(" include %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1653 }
1654
1655 fullname = (char *) uprv_malloc(state->inputdirLength + stringLength + 2);
1656 /* test for NULL */
1657 if(fullname == NULL)
1658 {
1659 *status = U_MEMORY_ALLOCATION_ERROR;
1660 uprv_free(filename);
1661 return NULL;
1662 }
1663
1664 if(state->inputdir!=NULL){
1665 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
1666 {
1667
1668 uprv_strcpy(fullname, state->inputdir);
1669
1670 fullname[state->inputdirLength] = U_FILE_SEP_CHAR;
1671 fullname[state->inputdirLength + 1] = '\0';
1672
1673 uprv_strcat(fullname, filename);
1674 }
1675 else
1676 {
1677 uprv_strcpy(fullname, state->inputdir);
1678 uprv_strcat(fullname, filename);
1679 }
1680 }else{
1681 uprv_strcpy(fullname,filename);
1682 }
1683
1684 ucbuf = ucbuf_open(fullname, &cp,getShowWarning(),FALSE,status);
1685
1686 if (U_FAILURE(*status)) {
1687 error(line, "couldn't open input file %s\n", filename);
1688 return NULL;
1689 }
1690
1691 uBuffer = ucbuf_getBuffer(ucbuf,&len,status);
1692 result = string_open(state->bundle, tag, uBuffer, len, comment, status);
1693
1694 ucbuf_close(ucbuf);
1695
1696 uprv_free(pTarget);
1697
1698 uprv_free(filename);
1699 uprv_free(fullname);
1700
1701 return result;
1702 }
1703
1704
1705
1706
1707
1708 U_STRING_DECL(k_type_string, "string", 6);
1709 U_STRING_DECL(k_type_binary, "binary", 6);
1710 U_STRING_DECL(k_type_bin, "bin", 3);
1711 U_STRING_DECL(k_type_table, "table", 5);
1712 U_STRING_DECL(k_type_table_no_fallback, "table(nofallback)", 17);
1713 U_STRING_DECL(k_type_int, "int", 3);
1714 U_STRING_DECL(k_type_integer, "integer", 7);
1715 U_STRING_DECL(k_type_array, "array", 5);
1716 U_STRING_DECL(k_type_alias, "alias", 5);
1717 U_STRING_DECL(k_type_intvector, "intvector", 9);
1718 U_STRING_DECL(k_type_import, "import", 6);
1719 U_STRING_DECL(k_type_include, "include", 7);
1720
1721 /* Various non-standard processing plugins that create one or more special resources. */
1722 U_STRING_DECL(k_type_plugin_uca_rules, "process(uca_rules)", 18);
1723 U_STRING_DECL(k_type_plugin_collation, "process(collation)", 18);
1724 U_STRING_DECL(k_type_plugin_transliterator, "process(transliterator)", 23);
1725 U_STRING_DECL(k_type_plugin_dependency, "process(dependency)", 19);
1726
/*
 * Resource types known to the parser.  The numeric values are used as
 * indexes into gResourceTypes, so the order of the enumerators must match
 * the order of that table.
 */
typedef enum EResourceType
{
    RESTYPE_UNKNOWN = 0,            /* no explicit type given (yet) */
    RESTYPE_STRING,
    RESTYPE_BINARY,
    RESTYPE_TABLE,
    RESTYPE_TABLE_NO_FALLBACK,
    RESTYPE_INTEGER,
    RESTYPE_ARRAY,
    RESTYPE_ALIAS,
    RESTYPE_INTVECTOR,
    RESTYPE_IMPORT,
    RESTYPE_INCLUDE,
    RESTYPE_PROCESS_UCA_RULES,
    RESTYPE_PROCESS_COLLATION,
    RESTYPE_PROCESS_TRANSLITERATOR,
    RESTYPE_PROCESS_DEPENDENCY,
    RESTYPE_RESERVED                /* end marker; never a valid type */
} EResourceType;
1746
1747 static struct {
1748 const char *nameChars; /* only used for debugging */
1749 const UChar *nameUChars;
1750 ParseResourceFunction *parseFunction;
1751 } gResourceTypes[] = {
1752 {"Unknown", NULL, NULL},
1753 {"string", k_type_string, parseString},
1754 {"binary", k_type_binary, parseBinary},
1755 {"table", k_type_table, parseTable},
1756 {"table(nofallback)", k_type_table_no_fallback, NULL}, /* parseFunction will never be called */
1757 {"integer", k_type_integer, parseInteger},
1758 {"array", k_type_array, parseArray},
1759 {"alias", k_type_alias, parseAlias},
1760 {"intvector", k_type_intvector, parseIntVector},
1761 {"import", k_type_import, parseImport},
1762 {"include", k_type_include, parseInclude},
1763 {"process(uca_rules)", k_type_plugin_uca_rules, parseUCARules},
1764 {"process(collation)", k_type_plugin_collation, NULL /* not implemented yet */},
1765 {"process(transliterator)", k_type_plugin_transliterator, parseTransliterator},
1766 {"process(dependency)", k_type_plugin_dependency, parseDependency},
1767 {"reserved", NULL, NULL}
1768 };
1769
1770 void initParser()
1771 {
1772 U_STRING_INIT(k_type_string, "string", 6);
1773 U_STRING_INIT(k_type_binary, "binary", 6);
1774 U_STRING_INIT(k_type_bin, "bin", 3);
1775 U_STRING_INIT(k_type_table, "table", 5);
1776 U_STRING_INIT(k_type_table_no_fallback, "table(nofallback)", 17);
1777 U_STRING_INIT(k_type_int, "int", 3);
1778 U_STRING_INIT(k_type_integer, "integer", 7);
1779 U_STRING_INIT(k_type_array, "array", 5);
1780 U_STRING_INIT(k_type_alias, "alias", 5);
1781 U_STRING_INIT(k_type_intvector, "intvector", 9);
1782 U_STRING_INIT(k_type_import, "import", 6);
1783 U_STRING_INIT(k_type_include, "include", 7);
1784
1785 U_STRING_INIT(k_type_plugin_uca_rules, "process(uca_rules)", 18);
1786 U_STRING_INIT(k_type_plugin_collation, "process(collation)", 18);
1787 U_STRING_INIT(k_type_plugin_transliterator, "process(transliterator)", 23);
1788 U_STRING_INIT(k_type_plugin_dependency, "process(dependency)", 19);
1789 }
1790
1791 static inline UBool isTable(enum EResourceType type) {
1792 return (UBool)(type==RESTYPE_TABLE || type==RESTYPE_TABLE_NO_FALLBACK);
1793 }
1794
1795 static enum EResourceType
1796 parseResourceType(ParseState* state, UErrorCode *status)
1797 {
1798 struct UString *tokenValue;
1799 struct UString comment;
1800 enum EResourceType result = RESTYPE_UNKNOWN;
1801 uint32_t line=0;
1802 ustr_init(&comment);
1803 expect(state, TOK_STRING, &tokenValue, &comment, &line, status);
1804
1805 if (U_FAILURE(*status))
1806 {
1807 return RESTYPE_UNKNOWN;
1808 }
1809
1810 *status = U_ZERO_ERROR;
1811
1812 /* Search for normal types */
1813 result=RESTYPE_UNKNOWN;
1814 while ((result=(EResourceType)(result+1)) < RESTYPE_RESERVED) {
1815 if (u_strcmp(tokenValue->fChars, gResourceTypes[result].nameUChars) == 0) {
1816 break;
1817 }
1818 }
1819 /* Now search for the aliases */
1820 if (u_strcmp(tokenValue->fChars, k_type_int) == 0) {
1821 result = RESTYPE_INTEGER;
1822 }
1823 else if (u_strcmp(tokenValue->fChars, k_type_bin) == 0) {
1824 result = RESTYPE_BINARY;
1825 }
1826 else if (result == RESTYPE_RESERVED) {
1827 char tokenBuffer[1024];
1828 u_austrncpy(tokenBuffer, tokenValue->fChars, sizeof(tokenBuffer));
1829 tokenBuffer[sizeof(tokenBuffer) - 1] = 0;
1830 *status = U_INVALID_FORMAT_ERROR;
1831 error(line, "unknown resource type '%s'", tokenBuffer);
1832 }
1833
1834 return result;
1835 }
1836
1837 /* parse a non-top-level resource */
1838 static struct SResource *
1839 parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status)
1840 {
1841 enum ETokenType token;
1842 enum EResourceType resType = RESTYPE_UNKNOWN;
1843 ParseResourceFunction *parseFunction = NULL;
1844 struct UString *tokenValue;
1845 uint32_t startline;
1846 uint32_t line;
1847
1848
1849 token = getToken(state, &tokenValue, NULL, &startline, status);
1850
1851 if(isVerbose()){
1852 printf(" resource %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1853 }
1854
1855 /* name . [ ':' type ] '{' resource '}' */
1856 /* This function parses from the colon onwards. If the colon is present, parse the
1857 type then try to parse a resource of that type. If there is no explicit type,
1858 work it out using the lookahead tokens. */
1859 switch (token)
1860 {
1861 case TOK_EOF:
1862 *status = U_INVALID_FORMAT_ERROR;
1863 error(startline, "Unexpected EOF encountered");
1864 return NULL;
1865
1866 case TOK_ERROR:
1867 *status = U_INVALID_FORMAT_ERROR;
1868 return NULL;
1869
1870 case TOK_COLON:
1871 resType = parseResourceType(state, status);
1872 expect(state, TOK_OPEN_BRACE, &tokenValue, NULL, &startline, status);
1873
1874 if (U_FAILURE(*status))
1875 {
1876 return NULL;
1877 }
1878
1879 break;
1880
1881 case TOK_OPEN_BRACE:
1882 break;
1883
1884 default:
1885 *status = U_INVALID_FORMAT_ERROR;
1886 error(startline, "syntax error while reading a resource, expected '{' or ':'");
1887 return NULL;
1888 }
1889
1890
1891 if (resType == RESTYPE_UNKNOWN)
1892 {
1893 /* No explicit type, so try to work it out. At this point, we've read the first '{'.
1894 We could have any of the following:
1895 { { => array (nested)
1896 { :/} => array
1897 { string , => string array
1898
1899 { string { => table
1900
1901 { string :/{ => table
1902 { string } => string
1903 */
1904
1905 token = peekToken(state, 0, NULL, &line, NULL,status);
1906
1907 if (U_FAILURE(*status))
1908 {
1909 return NULL;
1910 }
1911
1912 if (token == TOK_OPEN_BRACE || token == TOK_COLON ||token ==TOK_CLOSE_BRACE )
1913 {
1914 resType = RESTYPE_ARRAY;
1915 }
1916 else if (token == TOK_STRING)
1917 {
1918 token = peekToken(state, 1, NULL, &line, NULL, status);
1919
1920 if (U_FAILURE(*status))
1921 {
1922 return NULL;
1923 }
1924
1925 switch (token)
1926 {
1927 case TOK_COMMA: resType = RESTYPE_ARRAY; break;
1928 case TOK_OPEN_BRACE: resType = RESTYPE_TABLE; break;
1929 case TOK_CLOSE_BRACE: resType = RESTYPE_STRING; break;
1930 case TOK_COLON: resType = RESTYPE_TABLE; break;
1931 default:
1932 *status = U_INVALID_FORMAT_ERROR;
1933 error(line, "Unexpected token after string, expected ',', '{' or '}'");
1934 return NULL;
1935 }
1936 }
1937 else
1938 {
1939 *status = U_INVALID_FORMAT_ERROR;
1940 error(line, "Unexpected token after '{'");
1941 return NULL;
1942 }
1943
1944 /* printf("Type guessed as %s\n", resourceNames[resType]); */
1945 } else if(resType == RESTYPE_TABLE_NO_FALLBACK) {
1946 *status = U_INVALID_FORMAT_ERROR;
1947 error(startline, "error: %s resource type not valid except on top bundle level", gResourceTypes[resType].nameChars);
1948 return NULL;
1949 }
1950
1951
1952 /* We should now know what we need to parse next, so call the appropriate parser
1953 function and return. */
1954 parseFunction = gResourceTypes[resType].parseFunction;
1955 if (parseFunction != NULL) {
1956 return parseFunction(state, tag, startline, comment, status);
1957 }
1958 else {
1959 *status = U_INTERNAL_PROGRAM_ERROR;
1960 error(startline, "internal error: %s resource type found and not handled", gResourceTypes[resType].nameChars);
1961 }
1962
1963 return NULL;
1964 }
1965
1966 /* parse the top-level resource */
1967 struct SRBRoot *
1968 parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename,
1969 UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status)
1970 {
1971 struct UString *tokenValue;
1972 struct UString comment;
1973 uint32_t line;
1974 enum EResourceType bundleType;
1975 enum ETokenType token;
1976 ParseState state;
1977 uint32_t i;
1978
1979
1980 for (i = 0; i < MAX_LOOKAHEAD + 1; i++)
1981 {
1982 ustr_init(&state.lookahead[i].value);
1983 ustr_init(&state.lookahead[i].comment);
1984 }
1985
1986 initLookahead(&state, buf, status);
1987
1988 state.inputdir = inputDir;
1989 state.inputdirLength = (state.inputdir != NULL) ? (uint32_t)uprv_strlen(state.inputdir) : 0;
1990 state.outputdir = outputDir;
1991 state.outputdirLength = (state.outputdir != NULL) ? (uint32_t)uprv_strlen(state.outputdir) : 0;
1992 state.filename = filename;
1993 state.makeBinaryCollation = makeBinaryCollation;
1994 state.omitCollationRules = omitCollationRules;
1995
1996 ustr_init(&comment);
1997 expect(&state, TOK_STRING, &tokenValue, &comment, NULL, status);
1998
1999 state.bundle = new SRBRoot(&comment, FALSE, *status);
2000
2001 if (state.bundle == NULL || U_FAILURE(*status))
2002 {
2003 delete state.bundle;
2004
2005 return NULL;
2006 }
2007
2008
2009 state.bundle->setLocale(tokenValue->fChars, *status);
2010
2011 /* The following code is to make Empty bundle work no matter with :table specifer or not */
2012 token = getToken(&state, NULL, NULL, &line, status);
2013 if(token==TOK_COLON) {
2014 *status=U_ZERO_ERROR;
2015 bundleType=parseResourceType(&state, status);
2016
2017 if(isTable(bundleType))
2018 {
2019 expect(&state, TOK_OPEN_BRACE, NULL, NULL, &line, status);
2020 }
2021 else
2022 {
2023 *status=U_PARSE_ERROR;
2024 error(line, "parse error. Stopped parsing with %s", u_errorName(*status));
2025 }
2026 }
2027 else
2028 {
2029 /* not a colon */
2030 if(token==TOK_OPEN_BRACE)
2031 {
2032 *status=U_ZERO_ERROR;
2033 bundleType=RESTYPE_TABLE;
2034 }
2035 else
2036 {
2037 /* neither colon nor open brace */
2038 *status=U_PARSE_ERROR;
2039 bundleType=RESTYPE_UNKNOWN;
2040 error(line, "parse error, did not find open-brace '{' or colon ':', stopped with %s", u_errorName(*status));
2041 }
2042 }
2043
2044 if (U_FAILURE(*status))
2045 {
2046 delete state.bundle;
2047 return NULL;
2048 }
2049
2050 if(bundleType==RESTYPE_TABLE_NO_FALLBACK) {
2051 /*
2052 * Parse a top-level table with the table(nofallback) declaration.
2053 * This is the same as a regular table, but also sets the
2054 * URES_ATT_NO_FALLBACK flag in indexes[URES_INDEX_ATTRIBUTES] .
2055 */
2056 state.bundle->fNoFallback=TRUE;
2057 }
2058 /* top-level tables need not handle special table names like "collations" */
2059 assert(!state.bundle->fIsPoolBundle);
2060 assert(state.bundle->fRoot->fType == URES_TABLE);
2061 TableResource *rootTable = static_cast<TableResource *>(state.bundle->fRoot);
2062 realParseTable(&state, rootTable, NULL, line, status);
2063 if(dependencyArray!=NULL){
2064 rootTable->add(dependencyArray, 0, *status);
2065 dependencyArray = NULL;
2066 }
2067 if (U_FAILURE(*status))
2068 {
2069 delete state.bundle;
2070 res_close(dependencyArray);
2071 return NULL;
2072 }
2073
2074 if (getToken(&state, NULL, NULL, &line, status) != TOK_EOF)
2075 {
2076 warning(line, "extraneous text after resource bundle (perhaps unmatched braces)");
2077 if(isStrict()){
2078 *status = U_INVALID_FORMAT_ERROR;
2079 return NULL;
2080 }
2081 }
2082
2083 cleanupLookahead(&state);
2084 ustr_deinit(&comment);
2085 return state.bundle;
2086 }