icuSources/tools/genrb/read.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1998-2012, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *
   9 * File read.c
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   05/26/99    stephen     Creation.
  15 *   5/10/01     Ram         removed ustdio dependency
  16 *******************************************************************************
  17 */
  18
  19 #include "read.h"
  20 #include "errmsg.h"
  21 #include "unicode/ustring.h"
  22 #include "unicode/utf16.h"
  23
  24 #define OPENBRACE    0x007B
  25 #define CLOSEBRACE   0x007D
  26 #define COMMA        0x002C
  27 #define QUOTE        0x0022
  28 #define ESCAPE       0x005C
  29 #define SLASH        0x002F
  30 #define ASTERISK     0x002A
  31 #define SPACE        0x0020
  32 #define COLON        0x003A
  33 #define BADBOM       0xFFFE
  34 #define CR           0x000D
  35 #define LF           0x000A
  36
  37 static int32_t lineCount;
  38
  39 /* Protos */
  40 static enum ETokenType getStringToken(UCHARBUF *buf,
  41                                       UChar32 initialChar,
  42                                       struct UString *token,
  43                                       UErrorCode *status);
  44
  45 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
  46 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
  47 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
  48 static UBool   isWhitespace          (UChar32 c);
  49 static UBool   isNewline             (UChar32 c);
  50
  51 U_CFUNC void resetLineNumber() {
  52     lineCount = 1;
  53 }
  54
  55 /* Read and return the next token from the stream.  If the token is of
  56    type eString, fill in the token parameter with the token.  If the
  57    token is eError, then the status parameter will contain the
  58    specific error.  This will be eItemNotFound at the end of file,
  59    indicating that all tokens have been returned.  This method will
  60    never return eString twice in a row; instead, multiple adjacent
  61    string tokens will be merged into one, with no intervening
  62    space. */
  63 U_CFUNC enum ETokenType
  64 getNextToken(UCHARBUF* buf,
  65              struct UString *token,
  66              uint32_t *linenumber, /* out: linenumber of token */
  67              struct UString *comment,
  68              UErrorCode *status) {
  69     enum ETokenType result;
  70     UChar32         c;
  71
  72     if (U_FAILURE(*status)) {
  73         return TOK_ERROR;
  74     }
  75
  76     /* Skip whitespace */
  77     c = getNextChar(buf, TRUE, comment, status);
  78
  79     if (U_FAILURE(*status)) {
  80         return TOK_ERROR;
  81     }
  82
  83     *linenumber = lineCount;
  84
  85     switch(c) {
  86     case BADBOM:
  87         return TOK_ERROR;
  88     case OPENBRACE:
  89         return TOK_OPEN_BRACE;
  90     case CLOSEBRACE:
  91         return TOK_CLOSE_BRACE;
  92     case COMMA:
  93         return TOK_COMMA;
  94     case U_EOF:
  95         return TOK_EOF;
  96     case COLON:
  97         return TOK_COLON;
  98
  99     default:
 100         result = getStringToken(buf, c, token, status);
 101     }
 102
 103     *linenumber = lineCount;
 104     return result;
 105 }
 106
 107 /* Copy a string token into the given UnicodeString.  Upon entry, we
 108    have already read the first character of the string token, which is
 109    not a whitespace character (but may be a QUOTE or ESCAPE). This
 110    function reads all subsequent characters that belong with this
 111    string, and copy them into the token parameter. The other
 112    important, and slightly convoluted purpose of this function is to
 113    merge adjacent strings.  It looks forward a bit, and if the next
 114    non comment, non whitespace item is a string, it reads it in as
 115    well.  If two adjacent strings are quoted, they are merged without
 116    intervening space.  Otherwise a single SPACE character is
 117    inserted. */
 118 static enum ETokenType getStringToken(UCHARBUF* buf,
 119                                       UChar32 initialChar,
 120                                       struct UString *token,
 121                                       UErrorCode *status) {
 122     UBool    lastStringWasQuoted;
 123     UChar32  c;
 124     UChar    target[3] = { '\0' };
 125     UChar    *pTarget   = target;
 126     int      len=0;
 127     UBool    isFollowingCharEscaped=FALSE;
 128     UBool    isNLUnescaped = FALSE;
 129     UChar32  prevC=0;
 130
 131     /* We are guaranteed on entry that initialChar is not a whitespace
 132        character. If we are at the EOF, or have some other problem, it
 133        doesn't matter; we still want to validly return the initialChar
 134        (if nothing else) as a string token. */
 135
 136     if (U_FAILURE(*status)) {
 137         return TOK_ERROR;
 138     }
 139
 140     /* setup */
 141     lastStringWasQuoted = FALSE;
 142     c = initialChar;
 143     ustr_setlen(token, 0, status);
 144
 145     if (U_FAILURE(*status)) {
 146         return TOK_ERROR;
 147     }
 148
 149     for (;;) {
 150         if (c == QUOTE) {
 151             if (!lastStringWasQuoted && token->fLength > 0) {
 152                 ustr_ucat(token, SPACE, status);
 153
 154                 if (U_FAILURE(*status)) {
 155                     return TOK_ERROR;
 156                 }
 157             }
 158
 159             lastStringWasQuoted = TRUE;
 160
 161             for (;;) {
 162                 c = ucbuf_getc(buf,status);
 163
 164                 /* EOF reached */
 165                 if (c == U_EOF) {
 166                     return TOK_EOF;
 167                 }
 168
 169                 /* Unterminated quoted strings */
 170                 if (U_FAILURE(*status)) {
 171                     return TOK_ERROR;
 172                 }
 173
 174                 if (c == QUOTE && !isFollowingCharEscaped) {
 175                     break;
 176                 }
 177
 178                 if (c == ESCAPE  && !isFollowingCharEscaped) {
 179                     pTarget = target;
 180                     c       = unescape(buf, status);
 181
 182                     if (c == U_ERR) {
 183                         return TOK_ERROR;
 184                     }
 185                     if(c == CR || c == LF){
 186                         isNLUnescaped = TRUE;
 187                     }
 188                 }
 189
 190                 if(c==ESCAPE && !isFollowingCharEscaped){
 191                     isFollowingCharEscaped = TRUE;
 192                 }else{
 193                     U_APPEND_CHAR32(c, pTarget,len);
 194                     pTarget = target;
 195                     ustr_uscat(token, pTarget,len, status);
 196                     isFollowingCharEscaped = FALSE;
 197                     len=0;
 198                     if(c == CR || c == LF){
 199                         if(isNLUnescaped == FALSE && prevC!=CR){
 200                             lineCount++;
 201                         }
 202                         isNLUnescaped = FALSE;
 203                     }
 204                 }
 205
 206                 if (U_FAILURE(*status)) {
 207                     return TOK_ERROR;
 208                 }
 209                 prevC = c;
 210             }
 211         } else {
 212             if (token->fLength > 0) {
 213                 ustr_ucat(token, SPACE, status);
 214
 215                 if (U_FAILURE(*status)) {
 216                     return TOK_ERROR;
 217                 }
 218             }
 219
 220             if(lastStringWasQuoted){
 221                 if(getShowWarning()){
 222                     warning(lineCount, "Mixing quoted and unquoted strings");
 223                 }
 224                 if(isStrict()){
 225                     return TOK_ERROR;
 226                 }
 227
 228             }
 229
 230             lastStringWasQuoted = FALSE;
 231
 232             /* if we reach here we are mixing
 233              * quoted and unquoted strings
 234              * warn in normal mode and error in
 235              * pedantic mode
 236              */
 237
 238             if (c == ESCAPE) {
 239                 pTarget = target;
 240                 c       = unescape(buf, status);
 241
 242                 /* EOF reached */
 243                 if (c == U_EOF) {
 244                     return TOK_ERROR;
 245                 }
 246             }
 247
 248             U_APPEND_CHAR32(c, pTarget,len);
 249             pTarget = target;
 250             ustr_uscat(token, pTarget,len, status);
 251             len=0;
 252
 253             if (U_FAILURE(*status)) {
 254                 return TOK_ERROR;
 255             }
 256
 257             for (;;) {
 258                 /* DON'T skip whitespace */
 259                 c = getNextChar(buf, FALSE, NULL, status);
 260
 261                 /* EOF reached */
 262                 if (c == U_EOF) {
 263                     ucbuf_ungetc(c, buf);
 264                     return TOK_STRING;
 265                 }
 266
 267                 if (U_FAILURE(*status)) {
 268                     return TOK_STRING;
 269                 }
 270
 271                 if (c == QUOTE
 272                         || c == OPENBRACE
 273                         || c == CLOSEBRACE
 274                         || c == COMMA
 275                         || c == COLON) {
 276                     ucbuf_ungetc(c, buf);
 277                     break;
 278                 }
 279
 280                 if (isWhitespace(c)) {
 281                     break;
 282                 }
 283
 284                 if (c == ESCAPE) {
 285                     pTarget = target;
 286                     c       = unescape(buf, status);
 287
 288                     if (c == U_ERR) {
 289                         return TOK_ERROR;
 290                     }
 291                 }
 292
 293                 U_APPEND_CHAR32(c, pTarget,len);
 294                 pTarget = target;
 295                 ustr_uscat(token, pTarget,len, status);
 296                 len=0;
 297                 if (U_FAILURE(*status)) {
 298                     return TOK_ERROR;
 299                 }
 300             }
 301         }
 302
 303         /* DO skip whitespace */
 304         c = getNextChar(buf, TRUE, NULL, status);
 305
 306         if (U_FAILURE(*status)) {
 307             return TOK_STRING;
 308         }
 309
 310         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
 311             ucbuf_ungetc(c, buf);
 312             return TOK_STRING;
 313         }
 314     }
 315 }
 316
 317 /* Retrieve the next character.  If skipwhite is
 318    true, whitespace is skipped as well. */
 319 static UChar32 getNextChar(UCHARBUF* buf,
 320                            UBool skipwhite,
 321                            struct UString *token,
 322                            UErrorCode *status) {
 323     UChar32 c, c2;
 324
 325     if (U_FAILURE(*status)) {
 326         return U_EOF;
 327     }
 328
 329     for (;;) {
 330         c = ucbuf_getc(buf,status);
 331
 332         if (c == U_EOF) {
 333             return U_EOF;
 334         }
 335
 336         if (skipwhite && isWhitespace(c)) {
 337             continue;
 338         }
 339
 340         /* This also handles the get() failing case */
 341         if (c != SLASH) {
 342             return c;
 343         }
 344
 345         c = ucbuf_getc(buf,status); /* "/c" */
 346
 347         if (c == U_EOF) {
 348             return U_EOF;
 349         }
 350
 351         switch (c) {
 352         case SLASH:  /* "//" */
 353             seekUntilNewline(buf, NULL, status);
 354             break;
 355
 356         case ASTERISK:  /* " / * " */
 357             c2 = ucbuf_getc(buf, status); /* "/ * c" */
 358             if(c2 == ASTERISK){  /* "/ * *" */
 359                 /* parse multi-line comment and store it in token*/
 360                 seekUntilEndOfComment(buf, token, status);
 361             } else {
 362                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
 363                 seekUntilEndOfComment(buf, NULL, status);
 364             }
 365             break;
 366
 367         default:
 368             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
 369             /* If get() failed this is a NOP */
 370             return SLASH;
 371         }
 372
 373     }
 374 }
 375
 376 static void seekUntilNewline(UCHARBUF* buf,
 377                              struct UString *token,
 378                              UErrorCode *status) {
 379     UChar32 c;
 380
 381     if (U_FAILURE(*status)) {
 382         return;
 383     }
 384
 385     do {
 386         c = ucbuf_getc(buf,status);
 387         /* add the char to token */
 388         if(token!=NULL){
 389             ustr_u32cat(token, c, status);
 390         }
 391     } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
 392 }
 393
 394 static void seekUntilEndOfComment(UCHARBUF *buf,
 395                                   struct UString *token,
 396                                   UErrorCode *status) {
 397     UChar32  c, d;
 398     uint32_t line;
 399
 400     if (U_FAILURE(*status)) {
 401         return;
 402     }
 403
 404     line = lineCount;
 405
 406     do {
 407         c = ucbuf_getc(buf, status);
 408
 409         if (c == ASTERISK) {
 410             d = ucbuf_getc(buf, status);
 411
 412             if (d != SLASH) {
 413                 ucbuf_ungetc(d, buf);
 414             } else {
 415                 break;
 416             }
 417         }
 418         /* add the char to token */
 419         if(token!=NULL){
 420             ustr_u32cat(token, c, status);
 421         }
 422         /* increment the lineCount */
 423         isNewline(c);
 424
 425     } while (c != U_EOF && *status == U_ZERO_ERROR);
 426
 427     if (c == U_EOF) {
 428         *status = U_INVALID_FORMAT_ERROR;
 429         error(line, "unterminated comment detected");
 430     }
 431 }
 432
 433 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
 434     if (U_FAILURE(*status)) {
 435         return U_EOF;
 436     }
 437
 438     /* We expect to be called after the ESCAPE has been seen, but
 439      * u_fgetcx needs an ESCAPE to do its magic. */
 440     ucbuf_ungetc(ESCAPE, buf);
 441
 442     return ucbuf_getcx32(buf, status);
 443 }
 444
 445 static UBool isWhitespace(UChar32 c) {
 446     switch (c) {
 447         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
 448     case 0x000A:
 449     case 0x2029:
 450         lineCount++;
 451     case 0x000D:
 452     case 0x0020:
 453     case 0x0009:
 454     case 0xFEFF:
 455         return TRUE;
 456
 457     default:
 458         return FALSE;
 459     }
 460 }
 461
 462 static UBool isNewline(UChar32 c) {
 463     switch (c) {
 464         /* '\n', '\r', 0x2029 */
 465     case 0x000A:
 466     case 0x2029:
 467         lineCount++;
 468     case 0x000D:
 469         return TRUE;
 470
 471     default:
 472         return FALSE;
 473     }
 474 }