Goose  Artifact [8ba5623828]

Artifact 8ba5623828a2acacdf5f78a9f9c65db18d5a2d2309faf1ba0941cc4ecea5eac6:

  • File bs/lex/lexer.cpp — part of check-in [459ee84d6b] at 2019-08-18 00:54:51 on branch trunk —
    • Implemented the while statement.
    • Fixed a lexer issue that generated invalid locations at the very end of files.
    (user: achavasse size: 7686)

#include "lex.h"

using namespace empathy;
using namespace empathy::lex;

// Builds a lexer over the given input stream.
//   input:    stream the characters are pulled from; used to initialise m_input.
//   filename: name of the source being lexed; it is passed through
//             Location::GetCachedFilename and the result is kept in m_filename.
Lexer::Lexer( istream& input, const string& filename )
    : m_filename( Location::GetCachedFilename( filename ) )
    , m_input( input )
{
}

// Book-keeping for a line break that has just been consumed: advances the
// line counter and remembers the current position as the offset of the most
// recent line break.
void Lexer::newLine()
{
    m_lineNum += 1;
    m_lastLineBreakOffset = getCurrentPos();
}

// Consumes everything that separates tokens: line breaks (which are counted
// through newLine()), other whitespace, and comments introduced by "//" or
// "/*". Returns with the stream positioned on the first character that is
// none of those, or once the stream is no longer good.
void Lexer::skipSpacingIfAny()
{
    for( ;; )
    {
        if( !m_input.good() )
            return;

        const auto next = m_input.peek();

        if( next == '\n' )
        {
            // Line breaks are tested before isspace() so that they get counted.
            m_input.get();
            newLine();
        }
        else if( isspace( next ) )
        {
            m_input.get();
        }
        else if( next == '/' )
        {
            // A slash only starts a comment when followed by '/' or '*'.
            // Consume it to look one character further ahead.
            m_input.get();

            const auto following = m_input.peek();
            if( following != '/' && following != '*' )
            {
                // Not a comment: put the slash back and let the caller
                // lex it as part of a token.
                m_input.unget();
                return;
            }

            skipComment();
        }
        else
        {
            return;
        }
    }
}

// Reads a double-quoted string literal. The stream must be positioned on the
// opening quote.
//   ls: start of the token, used to build the location of the result.
// Returns the characters found between the quotes, paired with a location
// whose length covers the contents plus both quotes.
//
// A literal must be closed on the line it starts on. If a line break or the
// end of the input is reached first, a syntax error is emitted and the
// characters read so far are returned, with a location covering the opening
// quote and those characters.
TermLoc Lexer::readStringLiteral( const LocationStart& ls )
{
    // Consume the opening quote.
    m_input.get();

    string str;
    bool complete = false;

    // TODO this is very simplistic, at some point
    // we'll want to handle escape characters etc.
    while( m_input.good() )
    {
        auto c = m_input.get();

        // At the end of the input get() returns the eof marker rather than a
        // character. It must not be appended to the literal: doing so would
        // add a bogus character and make the reported location one too long.
        if( m_input.fail() )
            break;

        if( c == '"' )
        {
            complete = true;
            break;
        }

        if( c == '\n' )
        {
            // The line break has been consumed, so it has to be counted here.
            newLine();
            break;
        }

        str += static_cast< char >( c );
    }

    if( !complete )
    {
        DiagnosticsManager::GetInstance().emitSyntaxErrorMessage(
            ls.toLoc( str.size() + 1 ),
            "unterminated string literal.", 0 );

        return TermLoc( str, ls.toLoc( str.size() + 1 ) );
    }

    return TermLoc( str, ls.toLoc( str.size() + 2 ) );
}

// Entry point for integer literals: picks the reader matching the literal's
// base. A leading "0x" selects hexadecimal and a leading "0b" selects binary
// (both prefixes are consumed here); anything else is read as decimal.
// There is no octal form.
//   ls: start of the token, forwarded to the selected reader.
TermLoc Lexer::readIntegerLiteral( const LocationStart& ls )
{
    // Without a leading zero there can be no base marker.
    if( m_input.peek() != '0' )
        return readIntegerLiteralDec( ls );

    // Consume the zero, then look for the base marker behind it.
    m_input.get();

    switch( m_input.peek() )
    {
        case 'x':
            m_input.get();
            return readIntegerLiteralHex( ls );

        case 'b':
            m_input.get();
            return readIntegerLiteralBin( ls );

        default:
            break;
    }

    // A plain decimal literal that happened to start with a zero.
    return readIntegerLiteralDec( ls );
}

// Reads the digits of a binary integer literal. The "0b" prefix is expected
// to have been consumed by the caller already.
//   ls: start of the token (i.e. of the prefix), used to build the location.
// Returns the value built by BigInt::FromBinString from the digits that
// follow any leading zeros, paired with a location spanning the prefix,
// the leading zeros and the digits.
TermLoc Lexer::readIntegerLiteralBin( const LocationStart& ls )
{
    // Leading zeros are not handed to the conversion, but they are part of
    // the token and so must be counted in the length of its location.
    string::size_type leadingZeros = 0;
    while( m_input.good() && m_input.peek() == '0' )
    {
        m_input.get();
        ++leadingZeros;
    }

    string str;
    while( m_input.good() )
    {
        auto c = m_input.peek();
        if( c != '0' && c != '1' )
            break;

        m_input.get();

        str.push_back( static_cast< char >( c ) );
    }

    // + 2 accounts for the "0b" prefix.
    return TermLoc( BigInt::FromBinString( str ), ls.toLoc( str.size() + leadingZeros + 2 ) );
}

// Reads the digits of a decimal integer literal.
//   ls: start of the token, used to build the location.
// Returns the value built by BigInt::FromDecString from the digits that
// follow any leading zeros, paired with a location whose length is the
// number of those digits.
//
// NOTE(review): leading zeros (those skipped below, and the one that
// readIntegerLiteral may have consumed beforehand) are not counted in the
// location length, so a literal such as "0" is given a length of zero.
// Confirm whether this is intended.
TermLoc Lexer::readIntegerLiteralDec( const LocationStart& ls )
{
    // Drop leading zeros: they are not passed to the conversion.
    while( m_input.good() && m_input.peek() == '0' )
        m_input.get();

    string digits;
    while( m_input.good() )
    {
        const auto next = m_input.peek();
        if( isdigit( next ) )
        {
            m_input.get();
            digits.push_back( static_cast< char >( next ) );
            continue;
        }

        break;
    }

    const auto length = digits.size();
    return TermLoc( BigInt::FromDecString( digits ), ls.toLoc( length ) );
}

// Reads the digits of a hexadecimal integer literal. The "0x" prefix is
// expected to have been consumed by the caller already.
//   ls: start of the token (i.e. of the prefix), used to build the location.
// Returns the value built by BigInt::FromHexString from the digits that
// follow any leading zeros, paired with a location spanning the prefix,
// the leading zeros and the digits. Both upper and lower case digits are
// accepted.
TermLoc Lexer::readIntegerLiteralHex( const LocationStart& ls )
{
    // Leading zeros are not handed to the conversion, but they are part of
    // the token and so must be counted in the length of its location.
    string::size_type leadingZeros = 0;
    while( m_input.good() && m_input.peek() == '0' )
    {
        m_input.get();
        ++leadingZeros;
    }

    string str;
    while( m_input.good() )
    {
        auto c = m_input.peek();

        bool isHexDigit = isdigit( c );
        isHexDigit = isHexDigit || ( c >= 'a' && c <= 'f' );
        isHexDigit = isHexDigit || ( c >= 'A' && c <= 'F' );

        if( !isHexDigit )
            break;

        m_input.get();

        str.push_back( static_cast< char >( c ) );
    }

    // + 2 accounts for the "0x" prefix.
    return TermLoc( BigInt::FromHexString( str ), ls.toLoc( str.size() + leadingZeros + 2 ) );
}

// Reads an identifier made of letters, digits, '_' and '#'.
//   ls: start of the token, used to build the location.
// Returns the identifier as a StringId, paired with a location whose length
// is the number of characters read.
TermLoc Lexer::readAlphanumericIdentifier( const LocationStart& ls )
{
    // TODO: unicode identifiers are not handled; this will need to be
    // reworked to support them.

    string name;

    for( ;; )
    {
        if( !m_input.good() )
            break;

        const auto next = m_input.peek();

        const bool isIdentifierChar =
            isalpha( next ) || isdigit( next ) || next == '_' || next == '#';

        if( !isIdentifierChar )
            break;

        m_input.get();
        name += static_cast< char >( next );
    }

    const auto length = name.size();
    return TermLoc( StringId( name ), ls.toLoc( length ) );
}

// TODO this is inefficient, put that stuff into a lookup table.
// Also maybe instead of being hardcoded in the lexer it could be
// configurable, so that when a new operator is defined it can reserve
// its characters here (either as a first char only, or not)

// Currently, - @ $ ~ and ! can only be used as the first character of an operator identifier,
// so that they can be used as the first character of a prefix operator (we don't want to make it mandatory to insert
// a space between an infix and a prefix operator).
bool Lexer::isOpFirstChar( char c )
{
    static string charTable = "+*/%&|^<>=?:.,-@$!~";
    return charTable.find_first_of( c ) != string::npos;
}

bool Lexer::isOpChar( char c )
{
    static string charTable = "+*/%&|^<>=?:.,";
    return charTable.find_first_of( c ) != string::npos;
}

// Reads an operator identifier. The first character is taken as is (the
// caller is expected to have checked it); it is followed by as many
// characters accepted by isOpChar as can be found.
//   ls: start of the token, used to build the location.
// Returns the operator as a StringId, paired with a location whose length
// is the number of characters read.
TermLoc Lexer::readOperatorIdentifier( const LocationStart& ls )
{
    // TODO: unicode identifiers are not handled; this will need to be
    // reworked to support them.

    string op;
    op += static_cast< char >( m_input.get() );

    while( m_input.good() && isOpChar( m_input.peek() ) )
        op += static_cast< char >( m_input.get() );

    const auto length = op.size();
    return TermLoc( StringId( op ), ls.toLoc( length ) );
}

// Reads a semicolon. It is an ordinary identifier (";") with one special
// lexing rule: a run of semicolons, possibly separated by spacing or
// comments, is collapsed into a single token.
//   ls: start of the token, used to build the location.
// Returns the ";" identifier, paired with a location whose length is the
// distance between the position on entry and the position on exit.
//
// NOTE(review): spacing is skipped before each check for a further
// semicolon, so the position on exit looks like it lies past any spacing
// that follows the last semicolon, and line breaks in that spacing are
// counted here. Confirm that the resulting span is the intended one.
TermLoc Lexer::readSemicolon( const LocationStart& ls )
{
    const auto startPos = getCurrentPos();

    for( ;; )
    {
        if( !m_input.good() )
            break;

        skipSpacingIfAny();

        if( m_input.peek() != ';' )
            break;

        m_input.get();
    }

    const auto length = getCurrentPos() - startPos;
    return TermLoc( ";"_sid, ls.toLoc( length ) );
}

// Reads the next token from the input.
// Returns the token with its location, or nullopt when the stream is no
// longer good or when the next character cannot start any token (in which
// case a syntax error is emitted first).
//
// If skipping the spacing in front of the token went past at least one line
// break, a single Newline delimiter is returned instead and the token itself
// is left for the next call.
optional< TermLoc > Lexer::readToken()
{
    const auto lineBefore = m_lineNum;
    skipSpacingIfAny();

    auto ls = getCurrentLocationStart();

    if( m_lineNum != lineBefore )
        return TermLoc( Delimiter::Newline, ls.toLoc() );

    if( !m_input.good() )
        return nullopt;

    const auto c = m_input.peek();

    if( isdigit( c ) )
        return readIntegerLiteral( ls );

    if( isalpha( c ) || c == '_' || c == '#' )
        return readAlphanumericIdentifier( ls );

    // Single character delimiters all work the same way: consume the
    // character and return the matching delimiter.
    const auto readDelimiter = [&]( auto delim ) -> optional< TermLoc >
    {
        m_input.get();
        return TermLoc( delim, ls.toLoc() );
    };

    switch( c )
    {
        case '(': return readDelimiter( Delimiter::OpenParen );
        case ')': return readDelimiter( Delimiter::CloseParen );
        case '{': return readDelimiter( Delimiter::OpenBrace );
        case '}': return readDelimiter( Delimiter::CloseBrace );
        case '[': return readDelimiter( Delimiter::OpenBracket );
        case ']': return readDelimiter( Delimiter::CloseBracket );

        // These readers consume their first character themselves.
        case '"': return readStringLiteral( ls );
        case ';': return readSemicolon( ls );

        default:
            break;
    }

    if( isOpFirstChar( c ) )
        return readOperatorIdentifier( ls );

    DiagnosticsManager::GetInstance().emitSyntaxErrorMessage(
        ls.toLoc(), "unrecognized token.", 0 );
    return nullopt;
}