kjs/lexer.cpp

   1 // -*- c-basic-offset: 2 -*-
   2 /*
   3  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   4  *  Copyright (C) 2006, 2007, 2008 Apple Inc. All Rights Reserved.
   5  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
   6  *
   7  *  This library is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Library General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2 of the License, or (at your option) any later version.
  11  *
  12  *  This library is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Library General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Library General Public License
  18  *  along with this library; see the file COPYING.LIB.  If not, write to
  19  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  20  *  Boston, MA 02110-1301, USA.
  21  *
  22  */
  23
  24 #include "config.h"
  25 #include "lexer.h"
  26
  27 #include "dtoa.h"
  28 #include "function.h"
  29 #include "nodes.h"
  30 #include "NodeInfo.h"
  31 #include <ctype.h>
  32 #include <limits.h>
  33 #include <string.h>
  34 #include <wtf/Assertions.h>
  35 #include <wtf/unicode/Unicode.h>
  36
  37 using namespace WTF;
  38 using namespace Unicode;
  39
  40 // we can't specify the namespace in yacc's C output, so do it here
  41 using namespace KJS;
  42
  43 #ifndef KDE_USE_FINAL
  44 #include "grammar.h"
  45 #endif
  46
  47 #include "lookup.h"
  48 #include "lexer.lut.h"
  49
  50 extern YYLTYPE kjsyylloc; // global bison variable holding token info
  51
  52 // a bridge for yacc from the C world to C++
  53 int kjsyylex()
  54 {
  55   return lexer().lex();
  56 }
  57
  58 namespace KJS {
  59
  60 static bool isDecimalDigit(int);
  61
  62 static const size_t initialReadBufferCapacity = 32;
  63 static const size_t initialStringTableCapacity = 64;
  64
  65 Lexer& lexer()
  66 {
  67     ASSERT(JSLock::currentThreadIsHoldingLock());
  68
  69     // FIXME: We'd like to avoid calling new here, but we don't currently
  70     // support tearing down the Lexer at app quit time, since that would involve
  71     // tearing down its UString data members without holding the JSLock.
  72     static Lexer* staticLexer = new Lexer;
  73     return *staticLexer;
  74 }
  75
  76 Lexer::Lexer()
  77     : yylineno(1)
  78     , restrKeyword(false)
  79     , eatNextIdentifier(false)
  80     , stackToken(-1)
  81     , lastToken(-1)
  82     , pos(0)
  83     , code(0)
  84     , length(0)
  85     , atLineStart(true)
  86     , current(0)
  87     , next1(0)
  88     , next2(0)
  89     , next3(0)
  90 {
  91     m_buffer8.reserveCapacity(initialReadBufferCapacity);
  92     m_buffer16.reserveCapacity(initialReadBufferCapacity);
  93     m_strings.reserveCapacity(initialStringTableCapacity);
  94     m_identifiers.reserveCapacity(initialStringTableCapacity);
  95 }
  96
  97 void Lexer::setCode(int startingLineNumber, const KJS::UChar *c, unsigned int len)
  98 {
  99     yylineno = 1 + startingLineNumber;
 100     restrKeyword = false;
 101     delimited = false;
 102     eatNextIdentifier = false;
 103     stackToken = -1;
 104     lastToken = -1;
 105     pos = 0;
 106     code = c;
 107     length = len;
 108     skipLF = false;
 109     skipCR = false;
 110     error = false;
 111     atLineStart = true;
 112
 113     // read first characters
 114     shift(4);
 115 }
 116
 117 void Lexer::shift(unsigned p)
 118 {
 119     // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM,
 120     // see <https://bugs.webkit.org/show_bug.cgi?id=4931>.
 121
 122     while (p--) {
 123         current = next1;
 124         next1 = next2;
 125         next2 = next3;
 126         do {
 127             if (pos >= length) {
 128                 next3 = -1;
 129                 break;
 130             }
 131             next3 = code[pos++].uc;
 132         } while (next3 == 0xFEFF);
 133     }
 134 }
 135
 136 // called on each new line
 137 void Lexer::nextLine()
 138 {
 139   yylineno++;
 140   atLineStart = true;
 141 }
 142
 143 void Lexer::setDone(State s)
 144 {
 145   state = s;
 146   done = true;
 147 }
 148
 149 int Lexer::lex()
 150 {
 151   int token = 0;
 152   state = Start;
 153   unsigned short stringType = 0; // either single or double quotes
 154   m_buffer8.clear();
 155   m_buffer16.clear();
 156   done = false;
 157   terminator = false;
 158   skipLF = false;
 159   skipCR = false;
 160
 161   // did we push a token on the stack previously ?
 162   // (after an automatic semicolon insertion)
 163   if (stackToken >= 0) {
 164     setDone(Other);
 165     token = stackToken;
 166     stackToken = 0;
 167   }
 168
 169   while (!done) {
 170     if (skipLF && current != '\n') // found \r but not \n afterwards
 171         skipLF = false;
 172     if (skipCR && current != '\r') // found \n but not \r afterwards
 173         skipCR = false;
 174     if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one
 175     {
 176         skipLF = false;
 177         skipCR = false;
 178         shift(1);
 179     }
 180     switch (state) {
 181     case Start:
 182       if (isWhiteSpace()) {
 183         // do nothing
 184       } else if (current == '/' && next1 == '/') {
 185         shift(1);
 186         state = InSingleLineComment;
 187       } else if (current == '/' && next1 == '*') {
 188         shift(1);
 189         state = InMultiLineComment;
 190       } else if (current == -1) {
 191         if (!terminator && !delimited) {
 192           // automatic semicolon insertion if program incomplete
 193           token = ';';
 194           stackToken = 0;
 195           setDone(Other);
 196         } else
 197           setDone(Eof);
 198       } else if (isLineTerminator()) {
 199         nextLine();
 200         terminator = true;
 201         if (restrKeyword) {
 202           token = ';';
 203           setDone(Other);
 204         }
 205       } else if (current == '"' || current == '\'') {
 206         state = InString;
 207         stringType = static_cast<unsigned short>(current);
 208       } else if (isIdentStart(current)) {
 209         record16(current);
 210         state = InIdentifierOrKeyword;
 211       } else if (current == '\\') {
 212         state = InIdentifierStartUnicodeEscapeStart;
 213       } else if (current == '0') {
 214         record8(current);
 215         state = InNum0;
 216       } else if (isDecimalDigit(current)) {
 217         record8(current);
 218         state = InNum;
 219       } else if (current == '.' && isDecimalDigit(next1)) {
 220         record8(current);
 221         state = InDecimal;
 222         // <!-- marks the beginning of a line comment (for www usage)
 223       } else if (current == '<' && next1 == '!' &&
 224                  next2 == '-' && next3 == '-') {
 225         shift(3);
 226         state = InSingleLineComment;
 227         // same for -->
 228       } else if (atLineStart && current == '-' && next1 == '-' &&  next2 == '>') {
 229         shift(2);
 230         state = InSingleLineComment;
 231       } else {
 232         token = matchPunctuator(current, next1, next2, next3);
 233         if (token != -1) {
 234           setDone(Other);
 235         } else {
 236           //      cerr << "encountered unknown character" << endl;
 237           setDone(Bad);
 238         }
 239       }
 240       break;
 241     case InString:
 242       if (current == stringType) {
 243         shift(1);
 244         setDone(String);
 245       } else if (isLineTerminator() || current == -1) {
 246         setDone(Bad);
 247       } else if (current == '\\') {
 248         state = InEscapeSequence;
 249       } else {
 250         record16(current);
 251       }
 252       break;
 253     // Escape Sequences inside of strings
 254     case InEscapeSequence:
 255       if (isOctalDigit(current)) {
 256         if (current >= '0' && current <= '3' &&
 257             isOctalDigit(next1) && isOctalDigit(next2)) {
 258           record16(convertOctal(current, next1, next2));
 259           shift(2);
 260           state = InString;
 261         } else if (isOctalDigit(current) && isOctalDigit(next1)) {
 262           record16(convertOctal('0', current, next1));
 263           shift(1);
 264           state = InString;
 265         } else if (isOctalDigit(current)) {
 266           record16(convertOctal('0', '0', current));
 267           state = InString;
 268         } else {
 269           setDone(Bad);
 270         }
 271       } else if (current == 'x')
 272         state = InHexEscape;
 273       else if (current == 'u')
 274         state = InUnicodeEscape;
 275       else if (isLineTerminator()) {
 276         nextLine();
 277         state = InString;
 278       } else {
 279         record16(singleEscape(static_cast<unsigned short>(current)));
 280         state = InString;
 281       }
 282       break;
 283     case InHexEscape:
 284       if (isHexDigit(current) && isHexDigit(next1)) {
 285         state = InString;
 286         record16(convertHex(current, next1));
 287         shift(1);
 288       } else if (current == stringType) {
 289         record16('x');
 290         shift(1);
 291         setDone(String);
 292       } else {
 293         record16('x');
 294         record16(current);
 295         state = InString;
 296       }
 297       break;
 298     case InUnicodeEscape:
 299       if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
 300         record16(convertUnicode(current, next1, next2, next3));
 301         shift(3);
 302         state = InString;
 303       } else if (current == stringType) {
 304         record16('u');
 305         shift(1);
 306         setDone(String);
 307       } else {
 308         setDone(Bad);
 309       }
 310       break;
 311     case InSingleLineComment:
 312       if (isLineTerminator()) {
 313         nextLine();
 314         terminator = true;
 315         if (restrKeyword) {
 316           token = ';';
 317           setDone(Other);
 318         } else
 319           state = Start;
 320       } else if (current == -1) {
 321         setDone(Eof);
 322       }
 323       break;
 324     case InMultiLineComment:
 325       if (current == -1) {
 326         setDone(Bad);
 327       } else if (isLineTerminator()) {
 328         nextLine();
 329       } else if (current == '*' && next1 == '/') {
 330         state = Start;
 331         shift(1);
 332       }
 333       break;
 334     case InIdentifierOrKeyword:
 335     case InIdentifier:
 336       if (isIdentPart(current))
 337         record16(current);
 338       else if (current == '\\')
 339         state = InIdentifierPartUnicodeEscapeStart;
 340       else
 341         setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
 342       break;
 343     case InNum0:
 344       if (current == 'x' || current == 'X') {
 345         record8(current);
 346         state = InHex;
 347       } else if (current == '.') {
 348         record8(current);
 349         state = InDecimal;
 350       } else if (current == 'e' || current == 'E') {
 351         record8(current);
 352         state = InExponentIndicator;
 353       } else if (isOctalDigit(current)) {
 354         record8(current);
 355         state = InOctal;
 356       } else if (isDecimalDigit(current)) {
 357         record8(current);
 358         state = InDecimal;
 359       } else {
 360         setDone(Number);
 361       }
 362       break;
 363     case InHex:
 364       if (isHexDigit(current)) {
 365         record8(current);
 366       } else {
 367         setDone(Hex);
 368       }
 369       break;
 370     case InOctal:
 371       if (isOctalDigit(current)) {
 372         record8(current);
 373       }
 374       else if (isDecimalDigit(current)) {
 375         record8(current);
 376         state = InDecimal;
 377       } else
 378         setDone(Octal);
 379       break;
 380     case InNum:
 381       if (isDecimalDigit(current)) {
 382         record8(current);
 383       } else if (current == '.') {
 384         record8(current);
 385         state = InDecimal;
 386       } else if (current == 'e' || current == 'E') {
 387         record8(current);
 388         state = InExponentIndicator;
 389       } else
 390         setDone(Number);
 391       break;
 392     case InDecimal:
 393       if (isDecimalDigit(current)) {
 394         record8(current);
 395       } else if (current == 'e' || current == 'E') {
 396         record8(current);
 397         state = InExponentIndicator;
 398       } else
 399         setDone(Number);
 400       break;
 401     case InExponentIndicator:
 402       if (current == '+' || current == '-') {
 403         record8(current);
 404       } else if (isDecimalDigit(current)) {
 405         record8(current);
 406         state = InExponent;
 407       } else
 408         setDone(Bad);
 409       break;
 410     case InExponent:
 411       if (isDecimalDigit(current)) {
 412         record8(current);
 413       } else
 414         setDone(Number);
 415       break;
 416     case InIdentifierStartUnicodeEscapeStart:
 417       if (current == 'u')
 418         state = InIdentifierStartUnicodeEscape;
 419       else
 420         setDone(Bad);
 421       break;
 422     case InIdentifierPartUnicodeEscapeStart:
 423       if (current == 'u')
 424         state = InIdentifierPartUnicodeEscape;
 425       else
 426         setDone(Bad);
 427       break;
 428     case InIdentifierStartUnicodeEscape:
 429       if (!isHexDigit(current) || !isHexDigit(next1) || !isHexDigit(next2) || !isHexDigit(next3)) {
 430         setDone(Bad);
 431         break;
 432       }
 433       token = convertUnicode(current, next1, next2, next3).uc;
 434       shift(3);
 435       if (!isIdentStart(token)) {
 436         setDone(Bad);
 437         break;
 438       }
 439       record16(token);
 440       state = InIdentifier;
 441       break;
 442     case InIdentifierPartUnicodeEscape:
 443       if (!isHexDigit(current) || !isHexDigit(next1) || !isHexDigit(next2) || !isHexDigit(next3)) {
 444         setDone(Bad);
 445         break;
 446       }
 447       token = convertUnicode(current, next1, next2, next3).uc;
 448       shift(3);
 449       if (!isIdentPart(token)) {
 450         setDone(Bad);
 451         break;
 452       }
 453       record16(token);
 454       state = InIdentifier;
 455       break;
 456     default:
 457       ASSERT(!"Unhandled state in switch statement");
 458     }
 459
 460     // move on to the next character
 461     if (!done)
 462       shift(1);
 463     if (state != Start && state != InSingleLineComment)
 464       atLineStart = false;
 465   }
 466
 467   // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
 468   if ((state == Number || state == Octal || state == Hex) && isIdentStart(current))
 469     state = Bad;
 470
 471   // terminate string
 472   m_buffer8.append('\0');
 473
 474 #ifdef KJS_DEBUG_LEX
 475   fprintf(stderr, "line: %d ", lineNo());
 476   fprintf(stderr, "yytext (%x): ", m_buffer8[0]);
 477   fprintf(stderr, "%s ", buffer8.data());
 478 #endif
 479
 480   double dval = 0;
 481   if (state == Number) {
 482     dval = kjs_strtod(m_buffer8.data(), 0L);
 483   } else if (state == Hex) { // scan hex numbers
 484     const char* p = m_buffer8.data() + 2;
 485     while (char c = *p++) {
 486       dval *= 16;
 487       dval += convertHex(c);
 488     }
 489
 490     if (dval >= mantissaOverflowLowerBound)
 491       dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 16);
 492
 493     state = Number;
 494   } else if (state == Octal) {   // scan octal number
 495     const char* p = m_buffer8.data() + 1;
 496     while (char c = *p++) {
 497       dval *= 8;
 498       dval += c - '0';
 499     }
 500
 501     if (dval >= mantissaOverflowLowerBound)
 502       dval = parseIntOverflow(m_buffer8.data() + 1, p - (m_buffer8.data() + 2), 8);
 503
 504     state = Number;
 505   }
 506
 507 #ifdef KJS_DEBUG_LEX
 508   switch (state) {
 509   case Eof:
 510     printf("(EOF)\n");
 511     break;
 512   case Other:
 513     printf("(Other)\n");
 514     break;
 515   case Identifier:
 516     printf("(Identifier)/(Keyword)\n");
 517     break;
 518   case String:
 519     printf("(String)\n");
 520     break;
 521   case Number:
 522     printf("(Number)\n");
 523     break;
 524   default:
 525     printf("(unknown)");
 526   }
 527 #endif
 528
 529   if (state != Identifier && eatNextIdentifier)
 530     eatNextIdentifier = false;
 531
 532   restrKeyword = false;
 533   delimited = false;
 534   kjsyylloc.first_line = yylineno; // ???
 535   kjsyylloc.last_line = yylineno;
 536
 537   switch (state) {
 538   case Eof:
 539     token = 0;
 540     break;
 541   case Other:
 542     if(token == '}' || token == ';') {
 543       delimited = true;
 544     }
 545     break;
 546   case IdentifierOrKeyword:
 547     if ((token = Lookup::find(&mainTable, m_buffer16.data(), m_buffer16.size())) < 0) {
 548   case Identifier:
 549       // Lookup for keyword failed, means this is an identifier
 550       // Apply anonymous-function hack below (eat the identifier)
 551       if (eatNextIdentifier) {
 552         eatNextIdentifier = false;
 553         token = lex();
 554         break;
 555       }
 556       kjsyylval.ident = makeIdentifier(m_buffer16);
 557       token = IDENT;
 558       break;
 559     }
 560
 561     eatNextIdentifier = false;
 562     // Hack for "f = function somename() { ... }", too hard to get into the grammar
 563     if (token == FUNCTION && lastToken == '=' )
 564       eatNextIdentifier = true;
 565
 566     if (token == CONTINUE || token == BREAK ||
 567         token == RETURN || token == THROW)
 568       restrKeyword = true;
 569     break;
 570   case String:
 571     kjsyylval.string = makeUString(m_buffer16);
 572     token = STRING;
 573     break;
 574   case Number:
 575     kjsyylval.doubleValue = dval;
 576     token = NUMBER;
 577     break;
 578   case Bad:
 579 #ifdef KJS_DEBUG_LEX
 580     fprintf(stderr, "yylex: ERROR.\n");
 581 #endif
 582     error = true;
 583     return -1;
 584   default:
 585     ASSERT(!"unhandled numeration value in switch");
 586     error = true;
 587     return -1;
 588   }
 589   lastToken = token;
 590   return token;
 591 }
 592
 593 bool Lexer::isWhiteSpace() const
 594 {
 595   return current == '\t' || current == 0x0b || current == 0x0c || isSeparatorSpace(current);
 596 }
 597
 598 bool Lexer::isLineTerminator()
 599 {
 600   bool cr = (current == '\r');
 601   bool lf = (current == '\n');
 602   if (cr)
 603       skipLF = true;
 604   else if (lf)
 605       skipCR = true;
 606   return cr || lf || current == 0x2028 || current == 0x2029;
 607 }
 608
 609 bool Lexer::isIdentStart(int c)
 610 {
 611   return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other))
 612     || c == '$' || c == '_';
 613 }
 614
 615 bool Lexer::isIdentPart(int c)
 616 {
 617   return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
 618         | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector))
 619     || c == '$' || c == '_';
 620 }
 621
 622 static bool isDecimalDigit(int c)
 623 {
 624   return (c >= '0' && c <= '9');
 625 }
 626
 627 bool Lexer::isHexDigit(int c)
 628 {
 629   return (c >= '0' && c <= '9' ||
 630           c >= 'a' && c <= 'f' ||
 631           c >= 'A' && c <= 'F');
 632 }
 633
 634 bool Lexer::isOctalDigit(int c)
 635 {
 636   return (c >= '0' && c <= '7');
 637 }
 638
 639 int Lexer::matchPunctuator(int c1, int c2, int c3, int c4)
 640 {
 641   if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
 642     shift(4);
 643     return URSHIFTEQUAL;
 644   } else if (c1 == '=' && c2 == '=' && c3 == '=') {
 645     shift(3);
 646     return STREQ;
 647   } else if (c1 == '!' && c2 == '=' && c3 == '=') {
 648     shift(3);
 649     return STRNEQ;
 650    } else if (c1 == '>' && c2 == '>' && c3 == '>') {
 651     shift(3);
 652     return URSHIFT;
 653   } else if (c1 == '<' && c2 == '<' && c3 == '=') {
 654     shift(3);
 655     return LSHIFTEQUAL;
 656   } else if (c1 == '>' && c2 == '>' && c3 == '=') {
 657     shift(3);
 658     return RSHIFTEQUAL;
 659   } else if (c1 == '<' && c2 == '=') {
 660     shift(2);
 661     return LE;
 662   } else if (c1 == '>' && c2 == '=') {
 663     shift(2);
 664     return GE;
 665   } else if (c1 == '!' && c2 == '=') {
 666     shift(2);
 667     return NE;
 668   } else if (c1 == '+' && c2 == '+') {
 669     shift(2);
 670     if (terminator)
 671       return AUTOPLUSPLUS;
 672     else
 673       return PLUSPLUS;
 674   } else if (c1 == '-' && c2 == '-') {
 675     shift(2);
 676     if (terminator)
 677       return AUTOMINUSMINUS;
 678     else
 679       return MINUSMINUS;
 680   } else if (c1 == '=' && c2 == '=') {
 681     shift(2);
 682     return EQEQ;
 683   } else if (c1 == '+' && c2 == '=') {
 684     shift(2);
 685     return PLUSEQUAL;
 686   } else if (c1 == '-' && c2 == '=') {
 687     shift(2);
 688     return MINUSEQUAL;
 689   } else if (c1 == '*' && c2 == '=') {
 690     shift(2);
 691     return MULTEQUAL;
 692   } else if (c1 == '/' && c2 == '=') {
 693     shift(2);
 694     return DIVEQUAL;
 695   } else if (c1 == '&' && c2 == '=') {
 696     shift(2);
 697     return ANDEQUAL;
 698   } else if (c1 == '^' && c2 == '=') {
 699     shift(2);
 700     return XOREQUAL;
 701   } else if (c1 == '%' && c2 == '=') {
 702     shift(2);
 703     return MODEQUAL;
 704   } else if (c1 == '|' && c2 == '=') {
 705     shift(2);
 706     return OREQUAL;
 707   } else if (c1 == '<' && c2 == '<') {
 708     shift(2);
 709     return LSHIFT;
 710   } else if (c1 == '>' && c2 == '>') {
 711     shift(2);
 712     return RSHIFT;
 713   } else if (c1 == '&' && c2 == '&') {
 714     shift(2);
 715     return AND;
 716   } else if (c1 == '|' && c2 == '|') {
 717     shift(2);
 718     return OR;
 719   }
 720
 721   switch(c1) {
 722     case '=':
 723     case '>':
 724     case '<':
 725     case ',':
 726     case '!':
 727     case '~':
 728     case '?':
 729     case ':':
 730     case '.':
 731     case '+':
 732     case '-':
 733     case '*':
 734     case '/':
 735     case '&':
 736     case '|':
 737     case '^':
 738     case '%':
 739     case '(':
 740     case ')':
 741     case '{':
 742     case '}':
 743     case '[':
 744     case ']':
 745     case ';':
 746       shift(1);
 747       return static_cast<int>(c1);
 748     default:
 749       return -1;
 750   }
 751 }
 752
 753 unsigned short Lexer::singleEscape(unsigned short c)
 754 {
 755   switch(c) {
 756   case 'b':
 757     return 0x08;
 758   case 't':
 759     return 0x09;
 760   case 'n':
 761     return 0x0A;
 762   case 'v':
 763     return 0x0B;
 764   case 'f':
 765     return 0x0C;
 766   case 'r':
 767     return 0x0D;
 768   case '"':
 769     return 0x22;
 770   case '\'':
 771     return 0x27;
 772   case '\\':
 773     return 0x5C;
 774   default:
 775     return c;
 776   }
 777 }
 778
 779 unsigned short Lexer::convertOctal(int c1, int c2, int c3)
 780 {
 781   return static_cast<unsigned short>((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
 782 }
 783
 784 unsigned char Lexer::convertHex(int c)
 785 {
 786   if (c >= '0' && c <= '9')
 787     return static_cast<unsigned char>(c - '0');
 788   if (c >= 'a' && c <= 'f')
 789     return static_cast<unsigned char>(c - 'a' + 10);
 790   return static_cast<unsigned char>(c - 'A' + 10);
 791 }
 792
 793 unsigned char Lexer::convertHex(int c1, int c2)
 794 {
 795   return ((convertHex(c1) << 4) + convertHex(c2));
 796 }
 797
 798 KJS::UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
 799 {
 800   return KJS::UChar((convertHex(c1) << 4) + convertHex(c2),
 801                (convertHex(c3) << 4) + convertHex(c4));
 802 }
 803
 804 void Lexer::record8(int c)
 805 {
 806     ASSERT(c >= 0);
 807     ASSERT(c <= 0xff);
 808     m_buffer8.append(static_cast<char>(c));
 809 }
 810
 811 void Lexer::record16(int c)
 812 {
 813     ASSERT(c >= 0);
 814     ASSERT(c <= USHRT_MAX);
 815     record16(UChar(static_cast<unsigned short>(c)));
 816 }
 817
 818 void Lexer::record16(KJS::UChar c)
 819 {
 820     m_buffer16.append(c);
 821 }
 822
 823 bool Lexer::scanRegExp()
 824 {
 825   m_buffer16.clear();
 826   bool lastWasEscape = false;
 827   bool inBrackets = false;
 828
 829   while (1) {
 830     if (isLineTerminator() || current == -1)
 831       return false;
 832     else if (current != '/' || lastWasEscape == true || inBrackets == true)
 833     {
 834         // keep track of '[' and ']'
 835         if (!lastWasEscape) {
 836           if ( current == '[' && !inBrackets )
 837             inBrackets = true;
 838           if ( current == ']' && inBrackets )
 839             inBrackets = false;
 840         }
 841         record16(current);
 842         lastWasEscape =
 843             !lastWasEscape && (current == '\\');
 844     } else { // end of regexp
 845         m_pattern = UString(m_buffer16);
 846         m_buffer16.clear();
 847         shift(1);
 848         break;
 849     }
 850     shift(1);
 851   }
 852
 853   while (isIdentPart(current)) {
 854     record16(current);
 855     shift(1);
 856   }
 857   m_flags = UString(m_buffer16);
 858
 859   return true;
 860 }
 861
 862 void Lexer::clear()
 863 {
 864     deleteAllValues(m_strings);
 865     Vector<UString*> newStrings;
 866     newStrings.reserveCapacity(initialStringTableCapacity);
 867     m_strings.swap(newStrings);
 868
 869     deleteAllValues(m_identifiers);
 870     Vector<KJS::Identifier*> newIdentifiers;
 871     newIdentifiers.reserveCapacity(initialStringTableCapacity);
 872     m_identifiers.swap(newIdentifiers);
 873
 874     Vector<char> newBuffer8;
 875     newBuffer8.reserveCapacity(initialReadBufferCapacity);
 876     m_buffer8.swap(newBuffer8);
 877
 878     Vector<UChar> newBuffer16;
 879     newBuffer16.reserveCapacity(initialReadBufferCapacity);
 880     m_buffer16.swap(newBuffer16);
 881
 882     m_pattern = 0;
 883     m_flags = 0;
 884 }
 885
 886 Identifier* Lexer::makeIdentifier(const Vector<KJS::UChar>& buffer)
 887 {
 888     KJS::Identifier* identifier = new KJS::Identifier(buffer.data(), buffer.size());
 889     m_identifiers.append(identifier);
 890     return identifier;
 891 }
 892
 893 UString* Lexer::makeUString(const Vector<KJS::UChar>& buffer)
 894 {
 895     UString* string = new UString(buffer);
 896     m_strings.append(string);
 897     return string;
 898 }
 899
 900 } // namespace KJS