icuSources/tools/genrb/read.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1998-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *
   9 * File read.c
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   05/26/99    stephen     Creation.
  15 *   5/10/01     Ram         removed ustdio dependency
  16 *******************************************************************************
  17 */
  18
  19 #include "read.h"
  20 #include "errmsg.h"
  21 #include "unicode/ustring.h"
  22
/* Code points the tokenizer compares input characters against. */
#define OPENBRACE    0x007B  /* '{' */
#define CLOSEBRACE   0x007D  /* '}' */
#define COMMA        0x002C  /* ',' */
#define QUOTE        0x0022  /* '"' */
#define ESCAPE       0x005C  /* backslash */
#define SLASH        0x002F  /* '/' */
#define ASTERISK     0x002A  /* '*' */
#define SPACE        0x0020  /* ' ' */
#define COLON        0x003A  /* ':' */
#define BADBOM       0xFFFE  /* 0xFEFF with its two bytes swapped; getNextToken() returns TOK_ERROR when it reads this */
#define CR           0x000D  /* carriage return */
#define LF           0x000A  /* line feed */
  35
/* Running line number of the input being tokenized.  resetLineNumber()
   sets it to 1; isWhitespace(), isNewline() and the quoted-string loop
   of getStringToken() increment it as line breaks are consumed, and
   getNextToken() copies it out to its caller. */
static int32_t lineCount;
  37
/* Prototypes for the file-local helpers defined further down. */
static enum ETokenType getStringToken(UCHARBUF *buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status);

static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
/* Both classifiers below also increment lineCount for LF and U+2029. */
static UBool   isWhitespace          (UChar32 c);
static UBool   isNewline             (UChar32 c);
  49
  50 void resetLineNumber() {
  51     lineCount = 1;
  52 }
  53
  54 /* Read and return the next token from the stream.  If the token is of
  55    type eString, fill in the token parameter with the token.  If the
  56    token is eError, then the status parameter will contain the
  57    specific error.  This will be eItemNotFound at the end of file,
  58    indicating that all tokens have been returned.  This method will
  59    never return eString twice in a row; instead, multiple adjacent
  60    string tokens will be merged into one, with no intervening
  61    space. */
  62 enum ETokenType getNextToken(UCHARBUF* buf,
  63                              struct UString *token,
  64                              uint32_t *linenumber, /* out: linenumber of token */
  65                              struct UString *comment,
  66                              UErrorCode *status) {
  67     enum ETokenType result;
  68     UChar32         c;
  69
  70     if (U_FAILURE(*status)) {
  71         return TOK_ERROR;
  72     }
  73
  74     /* Skip whitespace */
  75     c = getNextChar(buf, TRUE, comment, status);
  76
  77     if (U_FAILURE(*status)) {
  78         return TOK_ERROR;
  79     }
  80
  81     *linenumber = lineCount;
  82
  83     switch(c) {
  84     case BADBOM:
  85         return TOK_ERROR;
  86     case OPENBRACE:
  87         return TOK_OPEN_BRACE;
  88     case CLOSEBRACE:
  89         return TOK_CLOSE_BRACE;
  90     case COMMA:
  91         return TOK_COMMA;
  92     case U_EOF:
  93         return TOK_EOF;
  94     case COLON:
  95         return TOK_COLON;
  96
  97     default:
  98         result = getStringToken(buf, c, token, status);
  99     }
 100
 101     *linenumber = lineCount;
 102     return result;
 103 }
 104
 105 /* Copy a string token into the given UnicodeString.  Upon entry, we
 106    have already read the first character of the string token, which is
 107    not a whitespace character (but may be a QUOTE or ESCAPE). This
 108    function reads all subsequent characters that belong with this
 109    string, and copy them into the token parameter. The other
 110    important, and slightly convoluted purpose of this function is to
 111    merge adjacent strings.  It looks forward a bit, and if the next
 112    non comment, non whitespace item is a string, it reads it in as
 113    well.  If two adjacent strings are quoted, they are merged without
 114    intervening space.  Otherwise a single SPACE character is
 115    inserted. */
 116 static enum ETokenType getStringToken(UCHARBUF* buf,
 117                                       UChar32 initialChar,
 118                                       struct UString *token,
 119                                       UErrorCode *status) {
 120     UBool    lastStringWasQuoted;
 121     UChar32  c;
 122     UChar    target[3] = { '\0' };
 123     UChar    *pTarget   = target;
 124     int      len=0;
 125     UBool    isFollowingCharEscaped=FALSE;
 126     UBool    isNLUnescaped = FALSE;
 127     UChar32  prevC=0;
 128
 129     /* We are guaranteed on entry that initialChar is not a whitespace
 130        character. If we are at the EOF, or have some other problem, it
 131        doesn't matter; we still want to validly return the initialChar
 132        (if nothing else) as a string token. */
 133
 134     if (U_FAILURE(*status)) {
 135         return TOK_ERROR;
 136     }
 137
 138     /* setup */
 139     lastStringWasQuoted = FALSE;
 140     c = initialChar;
 141     ustr_setlen(token, 0, status);
 142
 143     if (U_FAILURE(*status)) {
 144         return TOK_ERROR;
 145     }
 146
 147     for (;;) {
 148         if (c == QUOTE) {
 149             if (!lastStringWasQuoted && token->fLength > 0) {
 150                 ustr_ucat(token, SPACE, status);
 151
 152                 if (U_FAILURE(*status)) {
 153                     return TOK_ERROR;
 154                 }
 155             }
 156
 157             lastStringWasQuoted = TRUE;
 158
 159             for (;;) {
 160                 c = ucbuf_getc(buf,status);
 161
 162                 /* EOF reached */
 163                 if (c == U_EOF) {
 164                     return TOK_EOF;
 165                 }
 166
 167                 /* Unterminated quoted strings */
 168                 if (U_FAILURE(*status)) {
 169                     return TOK_ERROR;
 170                 }
 171
 172                 if (c == QUOTE && !isFollowingCharEscaped) {
 173                     break;
 174                 }
 175
 176                 if (c == ESCAPE  && !isFollowingCharEscaped) {
 177                     pTarget = target;
 178                     c       = unescape(buf, status);
 179
 180                     if (c == U_ERR) {
 181                         return TOK_ERROR;
 182                     }
 183                     if(c == CR || c == LF){
 184                         isNLUnescaped = TRUE;
 185                     }
 186                 }
 187
 188                 if(c==ESCAPE && !isFollowingCharEscaped){
 189                     isFollowingCharEscaped = TRUE;
 190                 }else{
 191                     U_APPEND_CHAR32(c, pTarget,len);
 192                     pTarget = target;
 193                     ustr_uscat(token, pTarget,len, status);
 194                     isFollowingCharEscaped = FALSE;
 195                     len=0;
 196                     if(c == CR || c == LF){
 197                         if(isNLUnescaped == FALSE && prevC!=CR){
 198                             lineCount++;
 199                         }
 200                         isNLUnescaped = FALSE;
 201                     }
 202                 }
 203
 204                 if (U_FAILURE(*status)) {
 205                     return TOK_ERROR;
 206                 }
 207                 prevC = c;
 208             }
 209         } else {
 210             if (token->fLength > 0) {
 211                 ustr_ucat(token, SPACE, status);
 212
 213                 if (U_FAILURE(*status)) {
 214                     return TOK_ERROR;
 215                 }
 216             }
 217
 218             if(lastStringWasQuoted){
 219                 if(getShowWarning()){
 220                     warning(lineCount, "Mixing quoted and unquoted strings");
 221                 }
 222                 if(isStrict()){
 223                     return TOK_ERROR;
 224                 }
 225
 226             }
 227
 228             lastStringWasQuoted = FALSE;
 229
 230             /* if we reach here we are mixing
 231              * quoted and unquoted strings
 232              * warn in normal mode and error in
 233              * pedantic mode
 234              */
 235
 236             if (c == ESCAPE) {
 237                 pTarget = target;
 238                 c       = unescape(buf, status);
 239
 240                 /* EOF reached */
 241                 if (c == U_EOF) {
 242                     return TOK_ERROR;
 243                 }
 244             }
 245
 246             U_APPEND_CHAR32(c, pTarget,len);
 247             pTarget = target;
 248             ustr_uscat(token, pTarget,len, status);
 249             len=0;
 250
 251             if (U_FAILURE(*status)) {
 252                 return TOK_ERROR;
 253             }
 254
 255             for (;;) {
 256                 /* DON'T skip whitespace */
 257                 c = getNextChar(buf, FALSE, NULL, status);
 258
 259                 /* EOF reached */
 260                 if (c == U_EOF) {
 261                     ucbuf_ungetc(c, buf);
 262                     return TOK_STRING;
 263                 }
 264
 265                 if (U_FAILURE(*status)) {
 266                     return TOK_STRING;
 267                 }
 268
 269                 if (c == QUOTE
 270                         || c == OPENBRACE
 271                         || c == CLOSEBRACE
 272                         || c == COMMA
 273                         || c == COLON) {
 274                     ucbuf_ungetc(c, buf);
 275                     break;
 276                 }
 277
 278                 if (isWhitespace(c)) {
 279                     break;
 280                 }
 281
 282                 if (c == ESCAPE) {
 283                     pTarget = target;
 284                     c       = unescape(buf, status);
 285
 286                     if (c == U_ERR) {
 287                         return TOK_ERROR;
 288                     }
 289                 }
 290
 291                 U_APPEND_CHAR32(c, pTarget,len);
 292                 pTarget = target;
 293                 ustr_uscat(token, pTarget,len, status);
 294                 len=0;
 295                 if (U_FAILURE(*status)) {
 296                     return TOK_ERROR;
 297                 }
 298             }
 299         }
 300
 301         /* DO skip whitespace */
 302         c = getNextChar(buf, TRUE, NULL, status);
 303
 304         if (U_FAILURE(*status)) {
 305             return TOK_STRING;
 306         }
 307
 308         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
 309             ucbuf_ungetc(c, buf);
 310             return TOK_STRING;
 311         }
 312     }
 313 }
 314
 315 /* Retrieve the next character.  If skipwhite is
 316    true, whitespace is skipped as well. */
 317 static UChar32 getNextChar(UCHARBUF* buf,
 318                            UBool skipwhite,
 319                            struct UString *token,
 320                            UErrorCode *status) {
 321     UChar32 c, c2;
 322
 323     if (U_FAILURE(*status)) {
 324         return U_EOF;
 325     }
 326
 327     for (;;) {
 328         c = ucbuf_getc(buf,status);
 329
 330         if (c == U_EOF) {
 331             return U_EOF;
 332         }
 333
 334         if (skipwhite && isWhitespace(c)) {
 335             continue;
 336         }
 337
 338         /* This also handles the get() failing case */
 339         if (c != SLASH) {
 340             return c;
 341         }
 342
 343         c = ucbuf_getc(buf,status); /* "/c" */
 344
 345         if (c == U_EOF) {
 346             return U_EOF;
 347         }
 348
 349         switch (c) {
 350         case SLASH:  /* "//" */
 351             seekUntilNewline(buf, NULL, status);
 352             break;
 353
 354         case ASTERISK:  /* "/*" */
 355             c2 = ucbuf_getc(buf, status); /* "/*c" */
 356             if(c2 == ASTERISK){  /* "/**" */
 357                 /* parse multi-line comment and store it in token*/
 358                 seekUntilEndOfComment(buf, token, status);
 359             } else {
 360                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/*".  Include c2  back in buffer.  */
 361                 seekUntilEndOfComment(buf, NULL, status);
 362             }
 363             break;
 364
 365         default:
 366             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
 367             /* If get() failed this is a NOP */
 368             return SLASH;
 369         }
 370
 371     }
 372 }
 373
 374 static void seekUntilNewline(UCHARBUF* buf,
 375                              struct UString *token,
 376                              UErrorCode *status) {
 377     UChar32 c;
 378
 379     if (U_FAILURE(*status)) {
 380         return;
 381     }
 382
 383     do {
 384         c = ucbuf_getc(buf,status);
 385         /* add the char to token */
 386         if(token!=NULL){
 387             ustr_u32cat(token, c, status);
 388         }
 389     } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
 390 }
 391
 392 static void seekUntilEndOfComment(UCHARBUF *buf,
 393                                   struct UString *token,
 394                                   UErrorCode *status) {
 395     UChar32  c, d;
 396     uint32_t line;
 397
 398     if (U_FAILURE(*status)) {
 399         return;
 400     }
 401
 402     line = lineCount;
 403
 404     do {
 405         c = ucbuf_getc(buf, status);
 406
 407         if (c == ASTERISK) {
 408             d = ucbuf_getc(buf, status);
 409
 410             if (d != SLASH) {
 411                 ucbuf_ungetc(d, buf);
 412             } else {
 413                 break;
 414             }
 415         }
 416         /* add the char to token */
 417         if(token!=NULL){
 418             ustr_u32cat(token, c, status);
 419         }
 420         /* increment the lineCount */
 421         isNewline(c);
 422
 423     } while (c != U_EOF && *status == U_ZERO_ERROR);
 424
 425     if (c == U_EOF) {
 426         *status = U_INVALID_FORMAT_ERROR;
 427         error(line, "unterminated comment detected");
 428     }
 429 }
 430
 431 UChar32 unescape(UCHARBUF *buf,
 432                  UErrorCode *status) {
 433     if (U_FAILURE(*status)) {
 434         return U_EOF;
 435     }
 436
 437     /* We expect to be called after the ESCAPE has been seen, but
 438      * u_fgetcx needs an ESCAPE to do its magic. */
 439     ucbuf_ungetc(ESCAPE, buf);
 440
 441     return ucbuf_getcx32(buf, status);
 442 }
 443
 444 static UBool isWhitespace(UChar32 c) {
 445     switch (c) {
 446         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
 447     case 0x000A:
 448     case 0x2029:
 449         lineCount++;
 450     case 0x000D:
 451     case 0x0020:
 452     case 0x0009:
 453     case 0xFEFF:
 454         return TRUE;
 455
 456     default:
 457         return FALSE;
 458     }
 459 }
 460
 461 static UBool isNewline(UChar32 c) {
 462     switch (c) {
 463         /* '\n', '\r', 0x2029 */
 464     case 0x000A:
 465     case 0x2029:
 466         lineCount++;
 467     case 0x000D:
 468         return TRUE;
 469
 470     default:
 471         return FALSE;
 472     }
 473 }