// [apple/javascriptcore.git] / kjs / lexer.cpp

// -*- c-basic-offset: 2 -*-
/*
 *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
 *  Copyright (C) 2006, 2007, 2008 Apple Inc. All Rights Reserved.
 *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this library; see the file COPYING.LIB.  If not, write to
 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 *  Boston, MA 02110-1301, USA.
 *
 */

#include "config.h"
#include "lexer.h"

#include "dtoa.h"
#include "function.h"
#include "nodes.h"
#include "NodeInfo.h"
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <wtf/Assertions.h>
#include <wtf/unicode/Unicode.h>

using namespace WTF;
using namespace Unicode;

// we can't specify the namespace in yacc's C output, so do it here
using namespace KJS;

#ifndef KDE_USE_FINAL
#include "grammar.h"
#endif

#include "lookup.h"
#include "lexer.lut.h"

extern YYLTYPE kjsyylloc; // global bison variable holding token info

// a bridge for yacc from the C world to C++
int kjsyylex()
{
  return lexer().lex();
}

namespace KJS {

// Forward declaration; the definition appears further down in this file,
// after its first use in Lexer::lex().
static bool isDecimalDigit(int);

// Capacity reserved up front for the m_buffer8 / m_buffer16 scratch buffers.
static const size_t initialReadBufferCapacity = 32;
// Capacity reserved up front for the m_strings / m_identifiers tables.
static const size_t initialStringTableCapacity = 64;

// Returns the process-wide Lexer. The instance is heap-allocated on the first
// call and is never deleted. Callers are asserted to hold the JSLock.
Lexer& lexer()
{
    ASSERT(JSLock::currentThreadIsHoldingLock());

    // FIXME: We'd like to avoid calling new here, but we don't currently
    // support tearing down the Lexer at app quit time, since that would involve
    // tearing down its UString data members without holding the JSLock.
    static Lexer* const sharedInstance = new Lexer;
    return *sharedInstance;
}

// Constructs a lexer with no source attached (code is null, length is 0).
// setCode() supplies the input and resets the per-script state.
Lexer::Lexer()
    : yylineno(1)
    , restrKeyword(false)
    , eatNextIdentifier(false)
    , stackToken(-1)
    , lastToken(-1)
    , pos(0)
    , code(0)
    , length(0)
    , atLineStart(true)
    , current(0)
    , next1(0)
    , next2(0)
    , next3(0)
{
    // setCode() resets these flags for every script; give them the same
    // values here so they are never read while still indeterminate.
    // Assigned in the body rather than the initializer list because the
    // member declaration order is not visible from this file.
    delimited = false;
    skipLF = false;
    skipCR = false;
    error = false;

    // Pre-size the scratch buffers and the string/identifier tables.
    m_buffer8.reserveCapacity(initialReadBufferCapacity);
    m_buffer16.reserveCapacity(initialReadBufferCapacity);
    m_strings.reserveCapacity(initialStringTableCapacity);
    m_identifiers.reserveCapacity(initialStringTableCapacity);
}

// Attaches a new source buffer and resets all per-script lexer state.
//   startingLineNumber - offset added to 1 to form the initial line number
//   c                  - first UTF-16 code unit of the source (not copied)
//   len                - number of code units available at c
void Lexer::setCode(int startingLineNumber, const KJS::UChar *c, unsigned int len)
{
    // Input window.
    code = c;
    length = len;
    pos = 0;

    // Line tracking.
    yylineno = startingLineNumber + 1;
    atLineStart = true;
    skipCR = false;
    skipLF = false;

    // Token history and parser feedback flags.
    lastToken = -1;
    stackToken = -1;
    eatNextIdentifier = false;
    restrKeyword = false;
    delimited = false;
    error = false;

    // Prime current/next1/next2/next3 with the first four characters.
    shift(4);
}

// Advances the four-character lookahead window (current, next1, next2, next3)
// by p positions. next3 becomes -1 once the input is exhausted.
void Lexer::shift(unsigned p)
{
    // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM,
    // see <https://bugs.webkit.org/show_bug.cgi?id=4931>.

    for (unsigned remaining = p; remaining > 0; --remaining) {
        current = next1;
        next1 = next2;
        next2 = next3;

        // Pull the next code unit that is not a BOM (U+FEFF).
        for (;;) {
            if (pos >= length) {
                next3 = -1;
                break;
            }
            next3 = code[pos++].uc;
            if (next3 != 0xFEFF)
                break;
        }
    }
}

// Bookkeeping for a consumed line terminator: bump the line counter and
// remember that the next token starts a line.
void Lexer::nextLine()
{
  atLineStart = true;
  yylineno += 1;
}

// Ends the scanning loop in lex(), leaving s as the final state.
void Lexer::setDone(State s)
{
  done = true;
  state = s;
}

// Scans the next token from the source installed by setCode() and returns its
// token code for the parser.
//
// Returns:
//   0        at end of input,
//   -1       for a malformed token (the `error` flag is set as well),
//   IDENT / STRING / NUMBER with the value stored in kjsyylval,
//   a keyword code from mainTable, or a punctuator code from matchPunctuator().
//
// Side effects: writes kjsyylloc.first_line / last_line, updates lastToken,
// restrKeyword, delimited and eatNextIdentifier, and may queue an EOF marker in
// stackToken after inserting an automatic semicolon.
int Lexer::lex()
{
  int token = 0;
  state = Start;
  unsigned short stringType = 0; // either single or double quotes
  m_buffer8.clear();
  m_buffer16.clear();
  done = false;
  terminator = false;
  skipLF = false;
  skipCR = false;

  // did we push a token on the stack previously ?
  // (after an automatic semicolon insertion)
  if (stackToken >= 0) {
    setDone(Other);
    token = stackToken;
    stackToken = 0;
  }

  while (!done) {
    // skipLF / skipCR are set by isLineTerminator() so that a two-character
    // line ending is counted as one line.
    if (skipLF && current != '\n') // found \r but not \n afterwards
        skipLF = false;
    if (skipCR && current != '\r') // found \n but not \r afterwards
        skipCR = false;
    if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one
    {
        skipLF = false;
        skipCR = false;
        shift(1);
    }
    switch (state) {
    case Start:
      if (isWhiteSpace()) {
        // do nothing
      } else if (current == '/' && next1 == '/') {
        shift(1);
        state = InSingleLineComment;
      } else if (current == '/' && next1 == '*') {
        shift(1);
        state = InMultiLineComment;
      } else if (current == -1) {
        if (!terminator && !delimited) {
          // automatic semicolon insertion if program incomplete
          token = ';';
          stackToken = 0;
          setDone(Other);
        } else
          setDone(Eof);
      } else if (isLineTerminator()) {
        nextLine();
        terminator = true;
        // A line break right after continue/break/return/throw ends the statement.
        if (restrKeyword) {
          token = ';';
          setDone(Other);
        }
      } else if (current == '"' || current == '\'') {
        state = InString;
        stringType = static_cast<unsigned short>(current);
      } else if (isIdentStart(current)) {
        record16(current);
        state = InIdentifierOrKeyword;
      } else if (current == '\\') {
        state = InIdentifierStartUnicodeEscapeStart;
      } else if (current == '0') {
        record8(current);
        state = InNum0;
      } else if (isDecimalDigit(current)) {
        record8(current);
        state = InNum;
      } else if (current == '.' && isDecimalDigit(next1)) {
        record8(current);
        state = InDecimal;
        // <!-- marks the beginning of a line comment (for www usage)
      } else if (current == '<' && next1 == '!' &&
                 next2 == '-' && next3 == '-') {
        shift(3);
        state = InSingleLineComment;
        // same for -->
      } else if (atLineStart && current == '-' && next1 == '-' &&  next2 == '>') {
        shift(2);
        state = InSingleLineComment;
      } else {
        // matchPunctuator() consumes the characters it matched.
        token = matchPunctuator(current, next1, next2, next3);
        if (token != -1) {
          setDone(Other);
        } else {
          //      cerr << "encountered unknown character" << endl;
          setDone(Bad);
        }
      }
      break;
    case InString:
      if (current == stringType) {
        shift(1);
        setDone(String);
      } else if (isLineTerminator() || current == -1) {
        setDone(Bad);
      } else if (current == '\\') {
        state = InEscapeSequence;
      } else {
        record16(current);
      }
      break;
    // Escape Sequences inside of strings
    case InEscapeSequence:
      if (isOctalDigit(current)) {
        // Longest match first: three digits (first one 0-3), then two, then one.
        if (current >= '0' && current <= '3' &&
            isOctalDigit(next1) && isOctalDigit(next2)) {
          record16(convertOctal(current, next1, next2));
          shift(2);
          state = InString;
        } else if (isOctalDigit(current) && isOctalDigit(next1)) {
          record16(convertOctal('0', current, next1));
          shift(1);
          state = InString;
        } else if (isOctalDigit(current)) {
          record16(convertOctal('0', '0', current));
          state = InString;
        } else {
          setDone(Bad);
        }
      } else if (current == 'x')
        state = InHexEscape;
      else if (current == 'u')
        state = InUnicodeEscape;
      else if (isLineTerminator()) {
        // Backslash followed by a line break: the break is dropped from the string.
        nextLine();
        state = InString;
      } else {
        record16(singleEscape(static_cast<unsigned short>(current)));
        state = InString;
      }
      break;
    case InHexEscape:
      if (isHexDigit(current) && isHexDigit(next1)) {
        state = InString;
        record16(convertHex(current, next1));
        shift(1);
      } else if (current == stringType) {
        // "\x" directly before the closing quote is kept as a literal 'x'.
        record16('x');
        shift(1);
        setDone(String);
      } else {
        // Not two hex digits: keep 'x' and the current character literally.
        record16('x');
        record16(current);
        state = InString;
      }
      break;
    case InUnicodeEscape:
      if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
        record16(convertUnicode(current, next1, next2, next3));
        shift(3);
        state = InString;
      } else if (current == stringType) {
        // "\u" directly before the closing quote is kept as a literal 'u'.
        record16('u');
        shift(1);
        setDone(String);
      } else {
        setDone(Bad);
      }
      break;
    case InSingleLineComment:
      if (isLineTerminator()) {
        nextLine();
        terminator = true;
        if (restrKeyword) {
          token = ';';
          setDone(Other);
        } else
          state = Start;
      } else if (current == -1) {
        setDone(Eof);
      }
      break;
    case InMultiLineComment:
      if (current == -1) {
        setDone(Bad);
      } else if (isLineTerminator()) {
        nextLine();
      } else if (current == '*' && next1 == '/') {
        state = Start;
        shift(1);
      }
      break;
    case InIdentifierOrKeyword:
    case InIdentifier:
      if (isIdentPart(current))
        record16(current);
      else if (current == '\\')
        state = InIdentifierPartUnicodeEscapeStart;
      else
        setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
      break;
    case InNum0:
      // "0x" starts a hex literal only when at least one hex digit follows.
      // Otherwise the token ends at "0" and the identifier check after the
      // loop rejects the trailing 'x' instead of silently yielding 0.
      if ((current == 'x' || current == 'X') && isHexDigit(next1)) {
        record8(current);
        state = InHex;
      } else if (current == '.') {
        record8(current);
        state = InDecimal;
      } else if (current == 'e' || current == 'E') {
        record8(current);
        state = InExponentIndicator;
      } else if (isOctalDigit(current)) {
        record8(current);
        state = InOctal;
      } else if (isDecimalDigit(current)) {
        record8(current);
        state = InDecimal;
      } else {
        setDone(Number);
      }
      break;
    case InHex:
      if (isHexDigit(current)) {
        record8(current);
      } else {
        setDone(Hex);
      }
      break;
    case InOctal:
      if (isOctalDigit(current)) {
        record8(current);
      }
      else if (isDecimalDigit(current)) {
        // An 8 or 9 turns the literal into a decimal one.
        record8(current);
        state = InDecimal;
      } else
        setDone(Octal);
      break;
    case InNum:
      if (isDecimalDigit(current)) {
        record8(current);
      } else if (current == '.') {
        record8(current);
        state = InDecimal;
      } else if (current == 'e' || current == 'E') {
        record8(current);
        state = InExponentIndicator;
      } else
        setDone(Number);
      break;
    case InDecimal:
      if (isDecimalDigit(current)) {
        record8(current);
      } else if (current == 'e' || current == 'E') {
        record8(current);
        state = InExponentIndicator;
      } else
        setDone(Number);
      break;
    case InExponentIndicator:
      // A sign is accepted only when a digit follows it, so the exponent has
      // at most one sign ("1e+-5" is rejected rather than lexed as a number).
      if ((current == '+' || current == '-') && isDecimalDigit(next1)) {
        record8(current);
      } else if (isDecimalDigit(current)) {
        record8(current);
        state = InExponent;
      } else
        setDone(Bad);
      break;
    case InExponent:
      if (isDecimalDigit(current)) {
        record8(current);
      } else
        setDone(Number);
      break;
    case InIdentifierStartUnicodeEscapeStart:
      if (current == 'u')
        state = InIdentifierStartUnicodeEscape;
      else
        setDone(Bad);
      break;
    case InIdentifierPartUnicodeEscapeStart:
      if (current == 'u')
        state = InIdentifierPartUnicodeEscape;
      else
        setDone(Bad);
      break;
    case InIdentifierStartUnicodeEscape:
      if (!isHexDigit(current) || !isHexDigit(next1) || !isHexDigit(next2) || !isHexDigit(next3)) {
        setDone(Bad);
        break;
      }
      // token is used here as scratch storage for the decoded code unit.
      token = convertUnicode(current, next1, next2, next3).uc;
      shift(3);
      if (!isIdentStart(token)) {
        setDone(Bad);
        break;
      }
      record16(token);
      state = InIdentifier;
      break;
    case InIdentifierPartUnicodeEscape:
      if (!isHexDigit(current) || !isHexDigit(next1) || !isHexDigit(next2) || !isHexDigit(next3)) {
        setDone(Bad);
        break;
      }
      // token is used here as scratch storage for the decoded code unit.
      token = convertUnicode(current, next1, next2, next3).uc;
      shift(3);
      if (!isIdentPart(token)) {
        setDone(Bad);
        break;
      }
      record16(token);
      state = InIdentifier;
      break;
    default:
      ASSERT(!"Unhandled state in switch statement");
    }

    // move on to the next character
    if (!done)
      shift(1);
    if (state != Start && state != InSingleLineComment)
      atLineStart = false;
  }

  // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
  if ((state == Number || state == Octal || state == Hex) && isIdentStart(current))
    state = Bad;

  // terminate string
  m_buffer8.append('\0');

#ifdef KJS_DEBUG_LEX
  fprintf(stderr, "line: %d ", lineNo());
  fprintf(stderr, "yytext (%x): ", m_buffer8[0]);
  fprintf(stderr, "%s ", m_buffer8.data());
#endif

  // Convert the collected digits to a double; Hex and Octal collapse to Number.
  double dval = 0;
  if (state == Number) {
    dval = kjs_strtod(m_buffer8.data(), 0L);
  } else if (state == Hex) { // scan hex numbers
    const char* p = m_buffer8.data() + 2; // skip the "0x" prefix
    while (char c = *p++) {
      dval *= 16;
      dval += convertHex(c);
    }

    // p is now one past the terminating NUL, hence the "+ 3" to get the digit count.
    if (dval >= mantissaOverflowLowerBound)
      dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 16);

    state = Number;
  } else if (state == Octal) {   // scan octal number
    const char* p = m_buffer8.data() + 1; // skip the leading '0'
    while (char c = *p++) {
      dval *= 8;
      dval += c - '0';
    }

    // p is now one past the terminating NUL, hence the "+ 2" to get the digit count.
    if (dval >= mantissaOverflowLowerBound)
      dval = parseIntOverflow(m_buffer8.data() + 1, p - (m_buffer8.data() + 2), 8);

    state = Number;
  }

#ifdef KJS_DEBUG_LEX
  switch (state) {
  case Eof:
    printf("(EOF)\n");
    break;
  case Other:
    printf("(Other)\n");
    break;
  case Identifier:
    printf("(Identifier)/(Keyword)\n");
    break;
  case String:
    printf("(String)\n");
    break;
  case Number:
    printf("(Number)\n");
    break;
  default:
    printf("(unknown)");
  }
#endif

  if (state != Identifier && eatNextIdentifier)
    eatNextIdentifier = false;

  restrKeyword = false;
  delimited = false;
  kjsyylloc.first_line = yylineno; // ???
  kjsyylloc.last_line = yylineno;

  switch (state) {
  case Eof:
    token = 0;
    break;
  case Other:
    // Remember a closing '}' or ';' so that no semicolon is inserted at EOF.
    if(token == '}' || token == ';') {
      delimited = true;
    }
    break;
  case IdentifierOrKeyword:
    // NOTE: the Identifier label below sits inside this if-body on purpose, so
    // a failed keyword lookup and a plain identifier share the same code.
    if ((token = Lookup::find(&mainTable, m_buffer16.data(), m_buffer16.size())) < 0) {
  case Identifier:
      // Lookup for keyword failed, means this is an identifier
      // Apply anonymous-function hack below (eat the identifier)
      if (eatNextIdentifier) {
        eatNextIdentifier = false;
        token = lex();
        break;
      }
      kjsyylval.ident = makeIdentifier(m_buffer16);
      token = IDENT;
      break;
    }

    eatNextIdentifier = false;
    // Hack for "f = function somename() { ... }", too hard to get into the grammar
    if (token == FUNCTION && lastToken == '=' )
      eatNextIdentifier = true;

    // A line terminator after these keywords ends the statement (see Start).
    if (token == CONTINUE || token == BREAK ||
        token == RETURN || token == THROW)
      restrKeyword = true;
    break;
  case String:
    kjsyylval.string = makeUString(m_buffer16);
    token = STRING;
    break;
  case Number:
    kjsyylval.doubleValue = dval;
    token = NUMBER;
    break;
  case Bad:
#ifdef KJS_DEBUG_LEX
    fprintf(stderr, "yylex: ERROR.\n");
#endif
    error = true;
    return -1;
  default:
    ASSERT(!"unhandled numeration value in switch");
    error = true;
    return -1;
  }
  lastToken = token;
  return token;
}

// True when the current character is a tab, vertical tab, form feed, or
// anything isSeparatorSpace() accepts.
bool Lexer::isWhiteSpace() const
{
  switch (current) {
  case '\t':
  case 0x0b:
  case 0x0c:
    return true;
  default:
    return isSeparatorSpace(current);
  }
}

// True when the current character is CR, LF, U+2028 or U+2029.
// Side effect: CR sets skipLF and LF sets skipCR, which lex() uses to swallow
// the second half of a two-character line ending.
bool Lexer::isLineTerminator()
{
  switch (current) {
  case '\r':
    skipLF = true;
    return true;
  case '\n':
    skipCR = true;
    return true;
  case 0x2028:
  case 0x2029:
    return true;
  default:
    return false;
  }
}

// True when c may begin an identifier: '$', '_', or a character whose Unicode
// category is one of the letter categories.
bool Lexer::isIdentStart(int c)
{
  if (c == '$' || c == '_')
    return true;

  const int letterCategories = Letter_Uppercase | Letter_Lowercase | Letter_Titlecase
    | Letter_Modifier | Letter_Other;
  return (category(c) & letterCategories) != 0;
}

// True when c may continue an identifier: '$', '_', or a character whose
// Unicode category is a letter, combining mark, decimal digit or connector
// punctuation.
bool Lexer::isIdentPart(int c)
{
  if (c == '$' || c == '_')
    return true;

  const int partCategories = Letter_Uppercase | Letter_Lowercase | Letter_Titlecase
    | Letter_Modifier | Letter_Other
    | Mark_NonSpacing | Mark_SpacingCombining
    | Number_DecimalDigit | Punctuation_Connector;
  return (category(c) & partCategories) != 0;
}

// True for the ASCII digits '0' through '9' only.
static bool isDecimalDigit(int c)
{
  if (c < '0' || c > '9')
    return false;
  return true;
}

// True for ASCII hexadecimal digits: 0-9, a-f, A-F.
bool Lexer::isHexDigit(int c)
{
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}

// True for the ASCII digits '0' through '7'.
bool Lexer::isOctalDigit(int c)
{
  return !(c < '0' || c > '7');
}

// Matches the longest punctuator that starts at c1 (c2..c4 are the following
// characters), consumes it with shift(), and returns its token code.
// Single-character punctuators are returned as their own character value.
// Returns -1, consuming nothing, when c1 does not start a punctuator.
// "++" and "--" yield the AUTO* variants when a line terminator preceded them.
int Lexer::matchPunctuator(int c1, int c2, int c3, int c4)
{
  int token = c1;     // default: the character stands for itself
  unsigned width = 1; // number of characters the punctuator spans

  switch (c1) {
  case '>':
    if (c2 == '>') {
      if (c3 == '>') {
        if (c4 == '=') {
          token = URSHIFTEQUAL;
          width = 4;
        } else {
          token = URSHIFT;
          width = 3;
        }
      } else if (c3 == '=') {
        token = RSHIFTEQUAL;
        width = 3;
      } else {
        token = RSHIFT;
        width = 2;
      }
    } else if (c2 == '=') {
      token = GE;
      width = 2;
    }
    break;
  case '<':
    if (c2 == '<') {
      if (c3 == '=') {
        token = LSHIFTEQUAL;
        width = 3;
      } else {
        token = LSHIFT;
        width = 2;
      }
    } else if (c2 == '=') {
      token = LE;
      width = 2;
    }
    break;
  case '=':
    if (c2 == '=') {
      if (c3 == '=') {
        token = STREQ;
        width = 3;
      } else {
        token = EQEQ;
        width = 2;
      }
    }
    break;
  case '!':
    if (c2 == '=') {
      if (c3 == '=') {
        token = STRNEQ;
        width = 3;
      } else {
        token = NE;
        width = 2;
      }
    }
    break;
  case '+':
    if (c2 == '+') {
      token = terminator ? AUTOPLUSPLUS : PLUSPLUS;
      width = 2;
    } else if (c2 == '=') {
      token = PLUSEQUAL;
      width = 2;
    }
    break;
  case '-':
    if (c2 == '-') {
      token = terminator ? AUTOMINUSMINUS : MINUSMINUS;
      width = 2;
    } else if (c2 == '=') {
      token = MINUSEQUAL;
      width = 2;
    }
    break;
  case '*':
    if (c2 == '=') {
      token = MULTEQUAL;
      width = 2;
    }
    break;
  case '/':
    if (c2 == '=') {
      token = DIVEQUAL;
      width = 2;
    }
    break;
  case '&':
    if (c2 == '=') {
      token = ANDEQUAL;
      width = 2;
    } else if (c2 == '&') {
      token = AND;
      width = 2;
    }
    break;
  case '|':
    if (c2 == '=') {
      token = OREQUAL;
      width = 2;
    } else if (c2 == '|') {
      token = OR;
      width = 2;
    }
    break;
  case '^':
    if (c2 == '=') {
      token = XOREQUAL;
      width = 2;
    }
    break;
  case '%':
    if (c2 == '=') {
      token = MODEQUAL;
      width = 2;
    }
    break;
  case ',':
  case '~':
  case '?':
  case ':':
  case '.':
  case '(':
  case ')':
  case '{':
  case '}':
  case '[':
  case ']':
  case ';':
    // Only ever single-character punctuators.
    break;
  default:
    // Not a punctuator: leave the input untouched.
    return -1;
  }

  shift(width);
  return token;
}

// Maps the character after a backslash in a string literal to the character it
// denotes. Characters without a special meaning are returned unchanged.
unsigned short Lexer::singleEscape(unsigned short c)
{
  static const struct {
    unsigned short escape;
    unsigned short value;
  } escapes[] = {
    { 'b', 0x08 },
    { 't', 0x09 },
    { 'n', 0x0A },
    { 'v', 0x0B },
    { 'f', 0x0C },
    { 'r', 0x0D },
    { '"', 0x22 },
    { '\'', 0x27 },
    { '\\', 0x5C },
  };

  for (size_t i = 0; i < sizeof(escapes) / sizeof(escapes[0]); ++i) {
    if (escapes[i].escape == c)
      return escapes[i].value;
  }
  return c;
}

// Combines three octal digit characters (most significant first) into their
// numeric value.
unsigned short Lexer::convertOctal(int c1, int c2, int c3)
{
  const int high = c1 - '0';
  const int middle = c2 - '0';
  const int low = c3 - '0';
  return static_cast<unsigned short>((high * 8 + middle) * 8 + low);
}

// Numeric value of one hexadecimal digit character. Anything that is neither
// 0-9 nor a-f is handled as if it were an upper-case A-F digit.
unsigned char Lexer::convertHex(int c)
{
  int value;
  if (c >= '0' && c <= '9')
    value = c - '0';
  else if (c >= 'a' && c <= 'f')
    value = c - 'a' + 10;
  else
    value = c - 'A' + 10;
  return static_cast<unsigned char>(value);
}

// Value of a two-digit hexadecimal number; c1 is the high nibble.
unsigned char Lexer::convertHex(int c1, int c2)
{
  const int highNibble = convertHex(c1) << 4;
  const int lowNibble = convertHex(c2);
  return static_cast<unsigned char>(highNibble + lowNibble);
}

// Builds a UChar from four hexadecimal digit characters: c1/c2 form the first
// constructor argument and c3/c4 the second.
KJS::UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
{
  const int firstPair = (convertHex(c1) << 4) + convertHex(c2);
  const int secondPair = (convertHex(c3) << 4) + convertHex(c4);
  return KJS::UChar(firstPair, secondPair);
}

// Appends c to the 8-bit buffer; c is asserted to fit in one byte.
void Lexer::record8(int c)
{
    ASSERT(c >= 0);
    ASSERT(c <= 0xff);
    const char narrowed = static_cast<char>(c);
    m_buffer8.append(narrowed);
}

// Appends c to the 16-bit buffer; c is asserted to fit in one UTF-16 code unit.
void Lexer::record16(int c)
{
    ASSERT(c >= 0);
    ASSERT(c <= USHRT_MAX);
    const unsigned short codeUnit = static_cast<unsigned short>(c);
    record16(UChar(codeUnit));
}

// Appends one code unit to the 16-bit buffer.
void Lexer::record16(KJS::UChar c)
{
    m_buffer16.append(c);
}

// Scans a regular expression literal starting at the current character.
// Stores the text up to the closing '/' in m_pattern and the identifier
// characters that follow it in m_flags.
// Returns false, with m_pattern and m_flags untouched, when a line terminator
// or the end of input is reached before the closing '/'.
bool Lexer::scanRegExp()
{
  m_buffer16.clear();
  bool escaped = false;     // previous character was an unescaped backslash
  bool inCharClass = false; // between an unescaped '[' and the matching ']'

  for (;;) {
    if (isLineTerminator() || current == -1)
      return false;

    // An unescaped '/' outside of brackets ends the pattern.
    if (current == '/' && !escaped && !inCharClass)
      break;

    // keep track of '[' and ']'
    if (!escaped) {
      if (current == '[')
        inCharClass = true;
      else if (current == ']')
        inCharClass = false;
    }
    record16(current);
    escaped = !escaped && current == '\\';
    shift(1);
  }

  // end of regexp
  m_pattern = UString(m_buffer16);
  m_buffer16.clear();
  shift(1);

  // Everything that can be part of an identifier counts as a flag.
  for (; isIdentPart(current); shift(1))
    record16(current);
  m_flags = UString(m_buffer16);

  return true;
}

void Lexer::clear()
{
    deleteAllValues(m_strings);
    Vector<UString*> newStrings;
    newStrings.reserveCapacity(initialStringTableCapacity);
    m_strings.swap(newStrings);

    deleteAllValues(m_identifiers);
    Vector<KJS::Identifier*> newIdentifiers;
    newIdentifiers.reserveCapacity(initialStringTableCapacity);
    m_identifiers.swap(newIdentifiers);

    Vector<char> newBuffer8;
    newBuffer8.reserveCapacity(initialReadBufferCapacity);
    m_buffer8.swap(newBuffer8);

    Vector<UChar> newBuffer16;
    newBuffer16.reserveCapacity(initialReadBufferCapacity);
    m_buffer16.swap(newBuffer16);

    m_pattern = 0;
    m_flags = 0;
}

// Allocates an Identifier from the buffer contents and records it in
// m_identifiers; clear() deletes it later.
Identifier* Lexer::makeIdentifier(const Vector<KJS::UChar>& buffer)
{
    KJS::Identifier* const created = new KJS::Identifier(buffer.data(), buffer.size());
    m_identifiers.append(created);
    return created;
}
 
// Allocates a UString from the buffer contents and records it in m_strings;
// clear() deletes it later.
UString* Lexer::makeUString(const Vector<KJS::UChar>& buffer)
{
    UString* const created = new UString(buffer);
    m_strings.append(created);
    return created;
}

} // namespace KJS