icuSources/tools/genrb/read.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1998-2011, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *
   9 * File read.c
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   05/26/99    stephen     Creation.
  15 *   5/10/01     Ram         removed ustdio dependency
  16 *******************************************************************************
  17 */
  18
  19 #include "read.h"
  20 #include "errmsg.h"
  21 #include "unicode/ustring.h"
  22
  23 #define OPENBRACE    0x007B
  24 #define CLOSEBRACE   0x007D
  25 #define COMMA        0x002C
  26 #define QUOTE        0x0022
  27 #define ESCAPE       0x005C
  28 #define SLASH        0x002F
  29 #define ASTERISK     0x002A
  30 #define SPACE        0x0020
  31 #define COLON        0x003A
  32 #define BADBOM       0xFFFE
  33 #define CR           0x000D
  34 #define LF           0x000A
  35
  36 static int32_t lineCount;
  37
  38 /* Protos */
  39 static enum ETokenType getStringToken(UCHARBUF *buf,
  40                                       UChar32 initialChar,
  41                                       struct UString *token,
  42                                       UErrorCode *status);
  43
  44 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
  45 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
  46 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
  47 static UBool   isWhitespace          (UChar32 c);
  48 static UBool   isNewline             (UChar32 c);
  49
  50 U_CFUNC void resetLineNumber() {
  51     lineCount = 1;
  52 }
  53
  54 /* Read and return the next token from the stream.  If the token is of
  55    type eString, fill in the token parameter with the token.  If the
  56    token is eError, then the status parameter will contain the
  57    specific error.  This will be eItemNotFound at the end of file,
  58    indicating that all tokens have been returned.  This method will
  59    never return eString twice in a row; instead, multiple adjacent
  60    string tokens will be merged into one, with no intervening
  61    space. */
  62 U_CFUNC enum ETokenType
  63 getNextToken(UCHARBUF* buf,
  64              struct UString *token,
  65              uint32_t *linenumber, /* out: linenumber of token */
  66              struct UString *comment,
  67              UErrorCode *status) {
  68     enum ETokenType result;
  69     UChar32         c;
  70
  71     if (U_FAILURE(*status)) {
  72         return TOK_ERROR;
  73     }
  74
  75     /* Skip whitespace */
  76     c = getNextChar(buf, TRUE, comment, status);
  77
  78     if (U_FAILURE(*status)) {
  79         return TOK_ERROR;
  80     }
  81
  82     *linenumber = lineCount;
  83
  84     switch(c) {
  85     case BADBOM:
  86         return TOK_ERROR;
  87     case OPENBRACE:
  88         return TOK_OPEN_BRACE;
  89     case CLOSEBRACE:
  90         return TOK_CLOSE_BRACE;
  91     case COMMA:
  92         return TOK_COMMA;
  93     case U_EOF:
  94         return TOK_EOF;
  95     case COLON:
  96         return TOK_COLON;
  97
  98     default:
  99         result = getStringToken(buf, c, token, status);
 100     }
 101
 102     *linenumber = lineCount;
 103     return result;
 104 }
 105
 106 /* Copy a string token into the given UnicodeString.  Upon entry, we
 107    have already read the first character of the string token, which is
 108    not a whitespace character (but may be a QUOTE or ESCAPE). This
 109    function reads all subsequent characters that belong with this
 110    string, and copy them into the token parameter. The other
 111    important, and slightly convoluted purpose of this function is to
 112    merge adjacent strings.  It looks forward a bit, and if the next
 113    non comment, non whitespace item is a string, it reads it in as
 114    well.  If two adjacent strings are quoted, they are merged without
 115    intervening space.  Otherwise a single SPACE character is
 116    inserted. */
 117 static enum ETokenType getStringToken(UCHARBUF* buf,
 118                                       UChar32 initialChar,
 119                                       struct UString *token,
 120                                       UErrorCode *status) {
 121     UBool    lastStringWasQuoted;
 122     UChar32  c;
 123     UChar    target[3] = { '\0' };
 124     UChar    *pTarget   = target;
 125     int      len=0;
 126     UBool    isFollowingCharEscaped=FALSE;
 127     UBool    isNLUnescaped = FALSE;
 128     UChar32  prevC=0;
 129
 130     /* We are guaranteed on entry that initialChar is not a whitespace
 131        character. If we are at the EOF, or have some other problem, it
 132        doesn't matter; we still want to validly return the initialChar
 133        (if nothing else) as a string token. */
 134
 135     if (U_FAILURE(*status)) {
 136         return TOK_ERROR;
 137     }
 138
 139     /* setup */
 140     lastStringWasQuoted = FALSE;
 141     c = initialChar;
 142     ustr_setlen(token, 0, status);
 143
 144     if (U_FAILURE(*status)) {
 145         return TOK_ERROR;
 146     }
 147
 148     for (;;) {
 149         if (c == QUOTE) {
 150             if (!lastStringWasQuoted && token->fLength > 0) {
 151                 ustr_ucat(token, SPACE, status);
 152
 153                 if (U_FAILURE(*status)) {
 154                     return TOK_ERROR;
 155                 }
 156             }
 157
 158             lastStringWasQuoted = TRUE;
 159
 160             for (;;) {
 161                 c = ucbuf_getc(buf,status);
 162
 163                 /* EOF reached */
 164                 if (c == U_EOF) {
 165                     return TOK_EOF;
 166                 }
 167
 168                 /* Unterminated quoted strings */
 169                 if (U_FAILURE(*status)) {
 170                     return TOK_ERROR;
 171                 }
 172
 173                 if (c == QUOTE && !isFollowingCharEscaped) {
 174                     break;
 175                 }
 176
 177                 if (c == ESCAPE  && !isFollowingCharEscaped) {
 178                     pTarget = target;
 179                     c       = unescape(buf, status);
 180
 181                     if (c == U_ERR) {
 182                         return TOK_ERROR;
 183                     }
 184                     if(c == CR || c == LF){
 185                         isNLUnescaped = TRUE;
 186                     }
 187                 }
 188
 189                 if(c==ESCAPE && !isFollowingCharEscaped){
 190                     isFollowingCharEscaped = TRUE;
 191                 }else{
 192                     U_APPEND_CHAR32(c, pTarget,len);
 193                     pTarget = target;
 194                     ustr_uscat(token, pTarget,len, status);
 195                     isFollowingCharEscaped = FALSE;
 196                     len=0;
 197                     if(c == CR || c == LF){
 198                         if(isNLUnescaped == FALSE && prevC!=CR){
 199                             lineCount++;
 200                         }
 201                         isNLUnescaped = FALSE;
 202                     }
 203                 }
 204
 205                 if (U_FAILURE(*status)) {
 206                     return TOK_ERROR;
 207                 }
 208                 prevC = c;
 209             }
 210         } else {
 211             if (token->fLength > 0) {
 212                 ustr_ucat(token, SPACE, status);
 213
 214                 if (U_FAILURE(*status)) {
 215                     return TOK_ERROR;
 216                 }
 217             }
 218
 219             if(lastStringWasQuoted){
 220                 if(getShowWarning()){
 221                     warning(lineCount, "Mixing quoted and unquoted strings");
 222                 }
 223                 if(isStrict()){
 224                     return TOK_ERROR;
 225                 }
 226
 227             }
 228
 229             lastStringWasQuoted = FALSE;
 230
 231             /* if we reach here we are mixing
 232              * quoted and unquoted strings
 233              * warn in normal mode and error in
 234              * pedantic mode
 235              */
 236
 237             if (c == ESCAPE) {
 238                 pTarget = target;
 239                 c       = unescape(buf, status);
 240
 241                 /* EOF reached */
 242                 if (c == U_EOF) {
 243                     return TOK_ERROR;
 244                 }
 245             }
 246
 247             U_APPEND_CHAR32(c, pTarget,len);
 248             pTarget = target;
 249             ustr_uscat(token, pTarget,len, status);
 250             len=0;
 251
 252             if (U_FAILURE(*status)) {
 253                 return TOK_ERROR;
 254             }
 255
 256             for (;;) {
 257                 /* DON'T skip whitespace */
 258                 c = getNextChar(buf, FALSE, NULL, status);
 259
 260                 /* EOF reached */
 261                 if (c == U_EOF) {
 262                     ucbuf_ungetc(c, buf);
 263                     return TOK_STRING;
 264                 }
 265
 266                 if (U_FAILURE(*status)) {
 267                     return TOK_STRING;
 268                 }
 269
 270                 if (c == QUOTE
 271                         || c == OPENBRACE
 272                         || c == CLOSEBRACE
 273                         || c == COMMA
 274                         || c == COLON) {
 275                     ucbuf_ungetc(c, buf);
 276                     break;
 277                 }
 278
 279                 if (isWhitespace(c)) {
 280                     break;
 281                 }
 282
 283                 if (c == ESCAPE) {
 284                     pTarget = target;
 285                     c       = unescape(buf, status);
 286
 287                     if (c == U_ERR) {
 288                         return TOK_ERROR;
 289                     }
 290                 }
 291
 292                 U_APPEND_CHAR32(c, pTarget,len);
 293                 pTarget = target;
 294                 ustr_uscat(token, pTarget,len, status);
 295                 len=0;
 296                 if (U_FAILURE(*status)) {
 297                     return TOK_ERROR;
 298                 }
 299             }
 300         }
 301
 302         /* DO skip whitespace */
 303         c = getNextChar(buf, TRUE, NULL, status);
 304
 305         if (U_FAILURE(*status)) {
 306             return TOK_STRING;
 307         }
 308
 309         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
 310             ucbuf_ungetc(c, buf);
 311             return TOK_STRING;
 312         }
 313     }
 314 }
 315
 316 /* Retrieve the next character.  If skipwhite is
 317    true, whitespace is skipped as well. */
 318 static UChar32 getNextChar(UCHARBUF* buf,
 319                            UBool skipwhite,
 320                            struct UString *token,
 321                            UErrorCode *status) {
 322     UChar32 c, c2;
 323
 324     if (U_FAILURE(*status)) {
 325         return U_EOF;
 326     }
 327
 328     for (;;) {
 329         c = ucbuf_getc(buf,status);
 330
 331         if (c == U_EOF) {
 332             return U_EOF;
 333         }
 334
 335         if (skipwhite && isWhitespace(c)) {
 336             continue;
 337         }
 338
 339         /* This also handles the get() failing case */
 340         if (c != SLASH) {
 341             return c;
 342         }
 343
 344         c = ucbuf_getc(buf,status); /* "/c" */
 345
 346         if (c == U_EOF) {
 347             return U_EOF;
 348         }
 349
 350         switch (c) {
 351         case SLASH:  /* "//" */
 352             seekUntilNewline(buf, NULL, status);
 353             break;
 354
 355         case ASTERISK:  /* " / * " */
 356             c2 = ucbuf_getc(buf, status); /* "/ * c" */
 357             if(c2 == ASTERISK){  /* "/ * *" */
 358                 /* parse multi-line comment and store it in token*/
 359                 seekUntilEndOfComment(buf, token, status);
 360             } else {
 361                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
 362                 seekUntilEndOfComment(buf, NULL, status);
 363             }
 364             break;
 365
 366         default:
 367             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
 368             /* If get() failed this is a NOP */
 369             return SLASH;
 370         }
 371
 372     }
 373 }
 374
 375 static void seekUntilNewline(UCHARBUF* buf,
 376                              struct UString *token,
 377                              UErrorCode *status) {
 378     UChar32 c;
 379
 380     if (U_FAILURE(*status)) {
 381         return;
 382     }
 383
 384     do {
 385         c = ucbuf_getc(buf,status);
 386         /* add the char to token */
 387         if(token!=NULL){
 388             ustr_u32cat(token, c, status);
 389         }
 390     } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
 391 }
 392
 393 static void seekUntilEndOfComment(UCHARBUF *buf,
 394                                   struct UString *token,
 395                                   UErrorCode *status) {
 396     UChar32  c, d;
 397     uint32_t line;
 398
 399     if (U_FAILURE(*status)) {
 400         return;
 401     }
 402
 403     line = lineCount;
 404
 405     do {
 406         c = ucbuf_getc(buf, status);
 407
 408         if (c == ASTERISK) {
 409             d = ucbuf_getc(buf, status);
 410
 411             if (d != SLASH) {
 412                 ucbuf_ungetc(d, buf);
 413             } else {
 414                 break;
 415             }
 416         }
 417         /* add the char to token */
 418         if(token!=NULL){
 419             ustr_u32cat(token, c, status);
 420         }
 421         /* increment the lineCount */
 422         isNewline(c);
 423
 424     } while (c != U_EOF && *status == U_ZERO_ERROR);
 425
 426     if (c == U_EOF) {
 427         *status = U_INVALID_FORMAT_ERROR;
 428         error(line, "unterminated comment detected");
 429     }
 430 }
 431
 432 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
 433     if (U_FAILURE(*status)) {
 434         return U_EOF;
 435     }
 436
 437     /* We expect to be called after the ESCAPE has been seen, but
 438      * u_fgetcx needs an ESCAPE to do its magic. */
 439     ucbuf_ungetc(ESCAPE, buf);
 440
 441     return ucbuf_getcx32(buf, status);
 442 }
 443
 444 static UBool isWhitespace(UChar32 c) {
 445     switch (c) {
 446         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
 447     case 0x000A:
 448     case 0x2029:
 449         lineCount++;
 450     case 0x000D:
 451     case 0x0020:
 452     case 0x0009:
 453     case 0xFEFF:
 454         return TRUE;
 455
 456     default:
 457         return FALSE;
 458     }
 459 }
 460
 461 static UBool isNewline(UChar32 c) {
 462     switch (c) {
 463         /* '\n', '\r', 0x2029 */
 464     case 0x000A:
 465     case 0x2029:
 466         lineCount++;
 467     case 0x000D:
 468         return TRUE;
 469
 470     default:
 471         return FALSE;
 472     }
 473 }