kjs/lexer.cpp

   1 // -*- c-basic-offset: 2 -*-
   2 /*
   3  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   4  *  Copyright (C) 2006, 2007 Apple Inc. All Rights Reserved.
   5  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
   6  *
   7  *  This library is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Library General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2 of the License, or (at your option) any later version.
  11  *
  12  *  This library is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Library General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Library General Public License
  18  *  along with this library; see the file COPYING.LIB.  If not, write to
  19  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  20  *  Boston, MA 02110-1301, USA.
  21  *
  22  */
  23
  24 #include "config.h"
  25 #include "lexer.h"
  26
  27 #include "dtoa.h"
  28 #include "function.h"
  29 #include "nodes.h"
  30 #include "NodeInfo.h"
  31 #include <ctype.h>
  32 #include <limits.h>
  33 #include <string.h>
  34 #include <wtf/Assertions.h>
  35 #include <wtf/unicode/Unicode.h>
  36
  37 using namespace WTF;
  38 using namespace Unicode;
  39
  40 // we can't specify the namespace in yacc's C output, so do it here
  41 using namespace KJS;
  42
  43 #ifndef KDE_USE_FINAL
  44 #include "grammar.h"
  45 #endif
  46
  47 #include "lookup.h"
  48 #include "lexer.lut.h"
  49
  50 extern YYLTYPE kjsyylloc; // global bison variable holding token info
  51
  52 // a bridge for yacc from the C world to C++
  53 int kjsyylex()
  54 {
  55   return lexer().lex();
  56 }
  57
  58 namespace KJS {
  59
  60 static bool isDecimalDigit(int);
  61
  62 static const size_t initialReadBufferCapacity = 32;
  63 static const size_t initialStringTableCapacity = 64;
  64
  65 Lexer& lexer()
  66 {
  67     ASSERT(JSLock::currentThreadIsHoldingLock());
  68
  69     // FIXME: We'd like to avoid calling new here, but we don't currently
  70     // support tearing down the Lexer at app quit time, since that would involve
  71     // tearing down its UString data members without holding the JSLock.
  72     static Lexer* staticLexer = new Lexer;
  73     return *staticLexer;
  74 }
  75
  76 Lexer::Lexer()
  77     : yylineno(1)
  78     , restrKeyword(false)
  79     , eatNextIdentifier(false)
  80     , stackToken(-1)
  81     , lastToken(-1)
  82     , pos(0)
  83     , code(0)
  84     , length(0)
  85     , atLineStart(true)
  86     , current(0)
  87     , next1(0)
  88     , next2(0)
  89     , next3(0)
  90 {
  91     m_buffer8.reserveCapacity(initialReadBufferCapacity);
  92     m_buffer16.reserveCapacity(initialReadBufferCapacity);
  93     m_strings.reserveCapacity(initialStringTableCapacity);
  94     m_identifiers.reserveCapacity(initialStringTableCapacity);
  95 }
  96
  97 void Lexer::setCode(int startingLineNumber, const KJS::UChar *c, unsigned int len)
  98 {
  99   yylineno = 1 + startingLineNumber;
 100   restrKeyword = false;
 101   delimited = false;
 102   eatNextIdentifier = false;
 103   stackToken = -1;
 104   lastToken = -1;
 105   pos = 0;
 106   code = c;
 107   length = len;
 108   skipLF = false;
 109   skipCR = false;
 110   error = false;
 111   atLineStart = true;
 112
 113   // read first characters
 114   current = (length > 0) ? code[0].uc : -1;
 115   next1 = (length > 1) ? code[1].uc : -1;
 116   next2 = (length > 2) ? code[2].uc : -1;
 117   next3 = (length > 3) ? code[3].uc : -1;
 118 }
 119
 120 void Lexer::shift(unsigned int p)
 121 {
 122   // Here would be a good place to strip Cf characters, but that has caused compatibility problems:
 123   // <http://bugs.webkit.org/show_bug.cgi?id=10183>.
 124   while (p--) {
 125     pos++;
 126     current = next1;
 127     next1 = next2;
 128     next2 = next3;
 129     next3 = (pos + 3 < length) ? code[pos + 3].uc : -1;
 130   }
 131 }
 132
 133 // called on each new line
 134 void Lexer::nextLine()
 135 {
 136   yylineno++;
 137   atLineStart = true;
 138 }
 139
 140 void Lexer::setDone(State s)
 141 {
 142   state = s;
 143   done = true;
 144 }
 145
 146 int Lexer::lex()
 147 {
 148   int token = 0;
 149   state = Start;
 150   unsigned short stringType = 0; // either single or double quotes
 151   m_buffer8.clear();
 152   m_buffer16.clear();
 153   done = false;
 154   terminator = false;
 155   skipLF = false;
 156   skipCR = false;
 157
 158   // did we push a token on the stack previously ?
 159   // (after an automatic semicolon insertion)
 160   if (stackToken >= 0) {
 161     setDone(Other);
 162     token = stackToken;
 163     stackToken = 0;
 164   }
 165
 166   while (!done) {
 167     if (skipLF && current != '\n') // found \r but not \n afterwards
 168         skipLF = false;
 169     if (skipCR && current != '\r') // found \n but not \r afterwards
 170         skipCR = false;
 171     if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one
 172     {
 173         skipLF = false;
 174         skipCR = false;
 175         shift(1);
 176     }
 177     switch (state) {
 178     case Start:
 179       if (isWhiteSpace()) {
 180         // do nothing
 181       } else if (current == '/' && next1 == '/') {
 182         shift(1);
 183         state = InSingleLineComment;
 184       } else if (current == '/' && next1 == '*') {
 185         shift(1);
 186         state = InMultiLineComment;
 187       } else if (current == -1) {
 188         if (!terminator && !delimited) {
 189           // automatic semicolon insertion if program incomplete
 190           token = ';';
 191           stackToken = 0;
 192           setDone(Other);
 193         } else
 194           setDone(Eof);
 195       } else if (isLineTerminator()) {
 196         nextLine();
 197         terminator = true;
 198         if (restrKeyword) {
 199           token = ';';
 200           setDone(Other);
 201         }
 202       } else if (current == '"' || current == '\'') {
 203         state = InString;
 204         stringType = static_cast<unsigned short>(current);
 205       } else if (isIdentStart(current)) {
 206         record16(current);
 207         state = InIdentifierOrKeyword;
 208       } else if (current == '\\') {
 209         state = InIdentifierStartUnicodeEscapeStart;
 210       } else if (current == '0') {
 211         record8(current);
 212         state = InNum0;
 213       } else if (isDecimalDigit(current)) {
 214         record8(current);
 215         state = InNum;
 216       } else if (current == '.' && isDecimalDigit(next1)) {
 217         record8(current);
 218         state = InDecimal;
 219         // <!-- marks the beginning of a line comment (for www usage)
 220       } else if (current == '<' && next1 == '!' &&
 221                  next2 == '-' && next3 == '-') {
 222         shift(3);
 223         state = InSingleLineComment;
 224         // same for -->
 225       } else if (atLineStart && current == '-' && next1 == '-' &&  next2 == '>') {
 226         shift(2);
 227         state = InSingleLineComment;
 228       } else {
 229         token = matchPunctuator(current, next1, next2, next3);
 230         if (token != -1) {
 231           setDone(Other);
 232         } else {
 233           //      cerr << "encountered unknown character" << endl;
 234           setDone(Bad);
 235         }
 236       }
 237       break;
 238     case InString:
 239       if (current == stringType) {
 240         shift(1);
 241         setDone(String);
 242       } else if (isLineTerminator() || current == -1) {
 243         setDone(Bad);
 244       } else if (current == '\\') {
 245         state = InEscapeSequence;
 246       } else {
 247         record16(current);
 248       }
 249       break;
 250     // Escape Sequences inside of strings
 251     case InEscapeSequence:
 252       if (isOctalDigit(current)) {
 253         if (current >= '0' && current <= '3' &&
 254             isOctalDigit(next1) && isOctalDigit(next2)) {
 255           record16(convertOctal(current, next1, next2));
 256           shift(2);
 257           state = InString;
 258         } else if (isOctalDigit(current) && isOctalDigit(next1)) {
 259           record16(convertOctal('0', current, next1));
 260           shift(1);
 261           state = InString;
 262         } else if (isOctalDigit(current)) {
 263           record16(convertOctal('0', '0', current));
 264           state = InString;
 265         } else {
 266           setDone(Bad);
 267         }
 268       } else if (current == 'x')
 269         state = InHexEscape;
 270       else if (current == 'u')
 271         state = InUnicodeEscape;
 272       else if (isLineTerminator()) {
 273         nextLine();
 274         state = InString;
 275       } else {
 276         record16(singleEscape(static_cast<unsigned short>(current)));
 277         state = InString;
 278       }
 279       break;
 280     case InHexEscape:
 281       if (isHexDigit(current) && isHexDigit(next1)) {
 282         state = InString;
 283         record16(convertHex(current, next1));
 284         shift(1);
 285       } else if (current == stringType) {
 286         record16('x');
 287         shift(1);
 288         setDone(String);
 289       } else {
 290         record16('x');
 291         record16(current);
 292         state = InString;
 293       }
 294       break;
 295     case InUnicodeEscape:
 296       if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
 297         record16(convertUnicode(current, next1, next2, next3));
 298         shift(3);
 299         state = InString;
 300       } else if (current == stringType) {
 301         record16('u');
 302         shift(1);
 303         setDone(String);
 304       } else {
 305         setDone(Bad);
 306       }
 307       break;
 308     case InSingleLineComment:
 309       if (isLineTerminator()) {
 310         nextLine();
 311         terminator = true;
 312         if (restrKeyword) {
 313           token = ';';
 314           setDone(Other);
 315         } else
 316           state = Start;
 317       } else if (current == -1) {
 318         setDone(Eof);
 319       }
 320       break;
 321     case InMultiLineComment:
 322       if (current == -1) {
 323         setDone(Bad);
 324       } else if (isLineTerminator()) {
 325         nextLine();
 326       } else if (current == '*' && next1 == '/') {
 327         state = Start;
 328         shift(1);
 329       }
 330       break;
 331     case InIdentifierOrKeyword:
 332     case InIdentifier:
 333       if (isIdentPart(current))
 334         record16(current);
 335       else if (current == '\\')
 336         state = InIdentifierPartUnicodeEscapeStart;
 337       else
 338         setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
 339       break;
 340     case InNum0:
 341       if (current == 'x' || current == 'X') {
 342         record8(current);
 343         state = InHex;
 344       } else if (current == '.') {
 345         record8(current);
 346         state = InDecimal;
 347       } else if (current == 'e' || current == 'E') {
 348         record8(current);
 349         state = InExponentIndicator;
 350       } else if (isOctalDigit(current)) {
 351         record8(current);
 352         state = InOctal;
 353       } else if (isDecimalDigit(current)) {
 354         record8(current);
 355         state = InDecimal;
 356       } else {
 357         setDone(Number);
 358       }
 359       break;
 360     case InHex:
 361       if (isHexDigit(current)) {
 362         record8(current);
 363       } else {
 364         setDone(Hex);
 365       }
 366       break;
 367     case InOctal:
 368       if (isOctalDigit(current)) {
 369         record8(current);
 370       }
 371       else if (isDecimalDigit(current)) {
 372         record8(current);
 373         state = InDecimal;
 374       } else
 375         setDone(Octal);
 376       break;
 377     case InNum:
 378       if (isDecimalDigit(current)) {
 379         record8(current);
 380       } else if (current == '.') {
 381         record8(current);
 382         state = InDecimal;
 383       } else if (current == 'e' || current == 'E') {
 384         record8(current);
 385         state = InExponentIndicator;
 386       } else
 387         setDone(Number);
 388       break;
 389     case InDecimal:
 390       if (isDecimalDigit(current)) {
 391         record8(current);
 392       } else if (current == 'e' || current == 'E') {
 393         record8(current);
 394         state = InExponentIndicator;
 395       } else
 396         setDone(Number);
 397       break;
 398     case InExponentIndicator:
 399       if (current == '+' || current == '-') {
 400         record8(current);
 401       } else if (isDecimalDigit(current)) {
 402         record8(current);
 403         state = InExponent;
 404       } else
 405         setDone(Bad);
 406       break;
 407     case InExponent:
 408       if (isDecimalDigit(current)) {
 409         record8(current);
 410       } else
 411         setDone(Number);
 412       break;
 413     case InIdentifierStartUnicodeEscapeStart:
 414       if (current == 'u')
 415         state = InIdentifierStartUnicodeEscape;
 416       else
 417         setDone(Bad);
 418       break;
 419     case InIdentifierPartUnicodeEscapeStart:
 420       if (current == 'u')
 421         state = InIdentifierPartUnicodeEscape;
 422       else
 423         setDone(Bad);
 424       break;
 425     case InIdentifierStartUnicodeEscape:
 426       if (!isHexDigit(current) || !isHexDigit(next1) || !isHexDigit(next2) || !isHexDigit(next3)) {
 427         setDone(Bad);
 428         break;
 429       }
 430       token = convertUnicode(current, next1, next2, next3).uc;
 431       shift(3);
 432       if (!isIdentStart(token)) {
 433         setDone(Bad);
 434         break;
 435       }
 436       record16(token);
 437       state = InIdentifier;
 438       break;
 439     case InIdentifierPartUnicodeEscape:
 440       if (!isHexDigit(current) || !isHexDigit(next1) || !isHexDigit(next2) || !isHexDigit(next3)) {
 441         setDone(Bad);
 442         break;
 443       }
 444       token = convertUnicode(current, next1, next2, next3).uc;
 445       shift(3);
 446       if (!isIdentPart(token)) {
 447         setDone(Bad);
 448         break;
 449       }
 450       record16(token);
 451       state = InIdentifier;
 452       break;
 453     default:
 454       ASSERT(!"Unhandled state in switch statement");
 455     }
 456
 457     // move on to the next character
 458     if (!done)
 459       shift(1);
 460     if (state != Start && state != InSingleLineComment)
 461       atLineStart = false;
 462   }
 463
 464   // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
 465   if ((state == Number || state == Octal || state == Hex) && isIdentStart(current))
 466     state = Bad;
 467
 468   // terminate string
 469   m_buffer8.append('\0');
 470
 471 #ifdef KJS_DEBUG_LEX
 472   fprintf(stderr, "line: %d ", lineNo());
 473   fprintf(stderr, "yytext (%x): ", m_buffer8[0]);
 474   fprintf(stderr, "%s ", buffer8.data());
 475 #endif
 476
 477   double dval = 0;
 478   if (state == Number) {
 479     dval = kjs_strtod(m_buffer8.data(), 0L);
 480   } else if (state == Hex) { // scan hex numbers
 481     const char* p = m_buffer8.data() + 2;
 482     while (char c = *p++) {
 483       dval *= 16;
 484       dval += convertHex(c);
 485     }
 486
 487     if (dval >= mantissaOverflowLowerBound)
 488       dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 16);
 489
 490     state = Number;
 491   } else if (state == Octal) {   // scan octal number
 492     const char* p = m_buffer8.data() + 1;
 493     while (char c = *p++) {
 494       dval *= 8;
 495       dval += c - '0';
 496     }
 497
 498     if (dval >= mantissaOverflowLowerBound)
 499       dval = parseIntOverflow(m_buffer8.data() + 1, p - (m_buffer8.data() + 2), 8);
 500
 501     state = Number;
 502   }
 503
 504 #ifdef KJS_DEBUG_LEX
 505   switch (state) {
 506   case Eof:
 507     printf("(EOF)\n");
 508     break;
 509   case Other:
 510     printf("(Other)\n");
 511     break;
 512   case Identifier:
 513     printf("(Identifier)/(Keyword)\n");
 514     break;
 515   case String:
 516     printf("(String)\n");
 517     break;
 518   case Number:
 519     printf("(Number)\n");
 520     break;
 521   default:
 522     printf("(unknown)");
 523   }
 524 #endif
 525
 526   if (state != Identifier && eatNextIdentifier)
 527     eatNextIdentifier = false;
 528
 529   restrKeyword = false;
 530   delimited = false;
 531   kjsyylloc.first_line = yylineno; // ???
 532   kjsyylloc.last_line = yylineno;
 533
 534   switch (state) {
 535   case Eof:
 536     token = 0;
 537     break;
 538   case Other:
 539     if(token == '}' || token == ';') {
 540       delimited = true;
 541     }
 542     break;
 543   case IdentifierOrKeyword:
 544     if ((token = Lookup::find(&mainTable, m_buffer16.data(), m_buffer16.size())) < 0) {
 545   case Identifier:
 546       // Lookup for keyword failed, means this is an identifier
 547       // Apply anonymous-function hack below (eat the identifier)
 548       if (eatNextIdentifier) {
 549         eatNextIdentifier = false;
 550         token = lex();
 551         break;
 552       }
 553       kjsyylval.ident = makeIdentifier(m_buffer16);
 554       token = IDENT;
 555       break;
 556     }
 557
 558     eatNextIdentifier = false;
 559     // Hack for "f = function somename() { ... }", too hard to get into the grammar
 560     if (token == FUNCTION && lastToken == '=' )
 561       eatNextIdentifier = true;
 562
 563     if (token == CONTINUE || token == BREAK ||
 564         token == RETURN || token == THROW)
 565       restrKeyword = true;
 566     break;
 567   case String:
 568     kjsyylval.string = makeUString(m_buffer16);
 569     token = STRING;
 570     break;
 571   case Number:
 572     kjsyylval.doubleValue = dval;
 573     token = NUMBER;
 574     break;
 575   case Bad:
 576 #ifdef KJS_DEBUG_LEX
 577     fprintf(stderr, "yylex: ERROR.\n");
 578 #endif
 579     error = true;
 580     return -1;
 581   default:
 582     ASSERT(!"unhandled numeration value in switch");
 583     error = true;
 584     return -1;
 585   }
 586   lastToken = token;
 587   return token;
 588 }
 589
 590 bool Lexer::isWhiteSpace() const
 591 {
 592   return current == '\t' || current == 0x0b || current == 0x0c || isSeparatorSpace(current);
 593 }
 594
 595 bool Lexer::isLineTerminator()
 596 {
 597   bool cr = (current == '\r');
 598   bool lf = (current == '\n');
 599   if (cr)
 600       skipLF = true;
 601   else if (lf)
 602       skipCR = true;
 603   return cr || lf || current == 0x2028 || current == 0x2029;
 604 }
 605
 606 bool Lexer::isIdentStart(int c)
 607 {
 608   return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other))
 609     || c == '$' || c == '_';
 610 }
 611
 612 bool Lexer::isIdentPart(int c)
 613 {
 614   return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
 615         | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector))
 616     || c == '$' || c == '_';
 617 }
 618
 619 static bool isDecimalDigit(int c)
 620 {
 621   return (c >= '0' && c <= '9');
 622 }
 623
 624 bool Lexer::isHexDigit(int c)
 625 {
 626   return (c >= '0' && c <= '9' ||
 627           c >= 'a' && c <= 'f' ||
 628           c >= 'A' && c <= 'F');
 629 }
 630
 631 bool Lexer::isOctalDigit(int c)
 632 {
 633   return (c >= '0' && c <= '7');
 634 }
 635
 636 int Lexer::matchPunctuator(int c1, int c2, int c3, int c4)
 637 {
 638   if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
 639     shift(4);
 640     return URSHIFTEQUAL;
 641   } else if (c1 == '=' && c2 == '=' && c3 == '=') {
 642     shift(3);
 643     return STREQ;
 644   } else if (c1 == '!' && c2 == '=' && c3 == '=') {
 645     shift(3);
 646     return STRNEQ;
 647    } else if (c1 == '>' && c2 == '>' && c3 == '>') {
 648     shift(3);
 649     return URSHIFT;
 650   } else if (c1 == '<' && c2 == '<' && c3 == '=') {
 651     shift(3);
 652     return LSHIFTEQUAL;
 653   } else if (c1 == '>' && c2 == '>' && c3 == '=') {
 654     shift(3);
 655     return RSHIFTEQUAL;
 656   } else if (c1 == '<' && c2 == '=') {
 657     shift(2);
 658     return LE;
 659   } else if (c1 == '>' && c2 == '=') {
 660     shift(2);
 661     return GE;
 662   } else if (c1 == '!' && c2 == '=') {
 663     shift(2);
 664     return NE;
 665   } else if (c1 == '+' && c2 == '+') {
 666     shift(2);
 667     if (terminator)
 668       return AUTOPLUSPLUS;
 669     else
 670       return PLUSPLUS;
 671   } else if (c1 == '-' && c2 == '-') {
 672     shift(2);
 673     if (terminator)
 674       return AUTOMINUSMINUS;
 675     else
 676       return MINUSMINUS;
 677   } else if (c1 == '=' && c2 == '=') {
 678     shift(2);
 679     return EQEQ;
 680   } else if (c1 == '+' && c2 == '=') {
 681     shift(2);
 682     return PLUSEQUAL;
 683   } else if (c1 == '-' && c2 == '=') {
 684     shift(2);
 685     return MINUSEQUAL;
 686   } else if (c1 == '*' && c2 == '=') {
 687     shift(2);
 688     return MULTEQUAL;
 689   } else if (c1 == '/' && c2 == '=') {
 690     shift(2);
 691     return DIVEQUAL;
 692   } else if (c1 == '&' && c2 == '=') {
 693     shift(2);
 694     return ANDEQUAL;
 695   } else if (c1 == '^' && c2 == '=') {
 696     shift(2);
 697     return XOREQUAL;
 698   } else if (c1 == '%' && c2 == '=') {
 699     shift(2);
 700     return MODEQUAL;
 701   } else if (c1 == '|' && c2 == '=') {
 702     shift(2);
 703     return OREQUAL;
 704   } else if (c1 == '<' && c2 == '<') {
 705     shift(2);
 706     return LSHIFT;
 707   } else if (c1 == '>' && c2 == '>') {
 708     shift(2);
 709     return RSHIFT;
 710   } else if (c1 == '&' && c2 == '&') {
 711     shift(2);
 712     return AND;
 713   } else if (c1 == '|' && c2 == '|') {
 714     shift(2);
 715     return OR;
 716   }
 717
 718   switch(c1) {
 719     case '=':
 720     case '>':
 721     case '<':
 722     case ',':
 723     case '!':
 724     case '~':
 725     case '?':
 726     case ':':
 727     case '.':
 728     case '+':
 729     case '-':
 730     case '*':
 731     case '/':
 732     case '&':
 733     case '|':
 734     case '^':
 735     case '%':
 736     case '(':
 737     case ')':
 738     case '{':
 739     case '}':
 740     case '[':
 741     case ']':
 742     case ';':
 743       shift(1);
 744       return static_cast<int>(c1);
 745     default:
 746       return -1;
 747   }
 748 }
 749
 750 unsigned short Lexer::singleEscape(unsigned short c)
 751 {
 752   switch(c) {
 753   case 'b':
 754     return 0x08;
 755   case 't':
 756     return 0x09;
 757   case 'n':
 758     return 0x0A;
 759   case 'v':
 760     return 0x0B;
 761   case 'f':
 762     return 0x0C;
 763   case 'r':
 764     return 0x0D;
 765   case '"':
 766     return 0x22;
 767   case '\'':
 768     return 0x27;
 769   case '\\':
 770     return 0x5C;
 771   default:
 772     return c;
 773   }
 774 }
 775
 776 unsigned short Lexer::convertOctal(int c1, int c2, int c3)
 777 {
 778   return static_cast<unsigned short>((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
 779 }
 780
 781 unsigned char Lexer::convertHex(int c)
 782 {
 783   if (c >= '0' && c <= '9')
 784     return static_cast<unsigned char>(c - '0');
 785   if (c >= 'a' && c <= 'f')
 786     return static_cast<unsigned char>(c - 'a' + 10);
 787   return static_cast<unsigned char>(c - 'A' + 10);
 788 }
 789
 790 unsigned char Lexer::convertHex(int c1, int c2)
 791 {
 792   return ((convertHex(c1) << 4) + convertHex(c2));
 793 }
 794
 795 KJS::UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
 796 {
 797   return KJS::UChar((convertHex(c1) << 4) + convertHex(c2),
 798                (convertHex(c3) << 4) + convertHex(c4));
 799 }
 800
 801 void Lexer::record8(int c)
 802 {
 803     ASSERT(c >= 0);
 804     ASSERT(c <= 0xff);
 805     m_buffer8.append(static_cast<char>(c));
 806 }
 807
 808 void Lexer::record16(int c)
 809 {
 810     ASSERT(c >= 0);
 811     ASSERT(c <= USHRT_MAX);
 812     record16(UChar(static_cast<unsigned short>(c)));
 813 }
 814
 815 void Lexer::record16(KJS::UChar c)
 816 {
 817     m_buffer16.append(c);
 818 }
 819
 820 bool Lexer::scanRegExp()
 821 {
 822   m_buffer16.clear();
 823   bool lastWasEscape = false;
 824   bool inBrackets = false;
 825
 826   while (1) {
 827     if (isLineTerminator() || current == -1)
 828       return false;
 829     else if (current != '/' || lastWasEscape == true || inBrackets == true)
 830     {
 831         // keep track of '[' and ']'
 832         if (!lastWasEscape) {
 833           if ( current == '[' && !inBrackets )
 834             inBrackets = true;
 835           if ( current == ']' && inBrackets )
 836             inBrackets = false;
 837         }
 838         record16(current);
 839         lastWasEscape =
 840             !lastWasEscape && (current == '\\');
 841     } else { // end of regexp
 842         m_pattern = UString(m_buffer16);
 843         m_buffer16.clear();
 844         shift(1);
 845         break;
 846     }
 847     shift(1);
 848   }
 849
 850   while (isIdentPart(current)) {
 851     record16(current);
 852     shift(1);
 853   }
 854   m_flags = UString(m_buffer16);
 855
 856   return true;
 857 }
 858
 859 void Lexer::clear()
 860 {
 861     deleteAllValues(m_strings);
 862     Vector<UString*> newStrings;
 863     newStrings.reserveCapacity(initialStringTableCapacity);
 864     m_strings.swap(newStrings);
 865
 866     deleteAllValues(m_identifiers);
 867     Vector<KJS::Identifier*> newIdentifiers;
 868     newIdentifiers.reserveCapacity(initialStringTableCapacity);
 869     m_identifiers.swap(newIdentifiers);
 870
 871     Vector<char> newBuffer8;
 872     newBuffer8.reserveCapacity(initialReadBufferCapacity);
 873     m_buffer8.swap(newBuffer8);
 874
 875     Vector<UChar> newBuffer16;
 876     newBuffer16.reserveCapacity(initialReadBufferCapacity);
 877     m_buffer16.swap(newBuffer16);
 878
 879     m_pattern = 0;
 880     m_flags = 0;
 881 }
 882
 883 Identifier* Lexer::makeIdentifier(const Vector<KJS::UChar>& buffer)
 884 {
 885     KJS::Identifier* identifier = new KJS::Identifier(buffer.data(), buffer.size());
 886     m_identifiers.append(identifier);
 887     return identifier;
 888 }
 889
 890 UString* Lexer::makeUString(const Vector<KJS::UChar>& buffer)
 891 {
 892     UString* string = new UString(buffer);
 893     m_strings.append(string);
 894     return string;
 895 }
 896
 897 } // namespace KJS