Goose  Artifact [bc529ac8a5]

Artifact bc529ac8a52478f13e1444c3297d390fe5744266dabef995b9762bfd64dba112:

  • File bs/lexer/lexer.cpp — part of check-in [8c4d9f9fe1] at 2019-01-13 15:18:28 on branch trunk — Lexer: operator identifiers. (user: achavasse size: 4796)

#include "lexer.h"

using namespace empathy;
using namespace empathy::util;
using namespace empathy::ir;

// Records that a line break has just been consumed: bumps the current line
// number and remembers the stream position at which the new line starts.
// readToken() subtracts that position from the current one to build the
// third argument of each token's Location.
void Lexer::newLine()
{
    m_lineNum += 1;

    // Must be called right after the line break character has been read,
    // since the current read position is what gets stored.
    const auto lineStart = m_input.tellg();
    m_lastLineBreakOffset = lineStart;
}

// Advances the input past anything that separates tokens: line breaks,
// other whitespace, and comments (handled by skipLineComment() and
// skipBlockComment()). Returns as soon as the next character is something
// else, or when the stream stops being good.
void Lexer::skipSpacingIfAny()
{
    while( m_input.good() )
    {
        auto c = m_input.peek();

        // This has to be tested before isspace(): '\n' is a whitespace
        // character too, so checking isspace() first would swallow every
        // line break without ever calling newLine(), and line/column
        // tracking would never advance.
        if( c == '\n' )
        {
            m_input.get();
            newLine();
            continue;
        }

        if( isspace( c ) )
        {
            m_input.get();
            continue;
        }

        if( c == '/' )
        {
            // Consume the slash, then look at what follows to tell a
            // comment apart from an operator starting with '/'.
            m_input.get();
            c = m_input.peek();
            if( c == '/' )
            {
                // Note: the second '/' is left in the stream here, whereas
                // the '*' below is consumed before delegating.
                skipLineComment();
            }
            else if( c == '*' )
            {
                m_input.get();
                skipBlockComment();
            }
            else
            {
                // Not a comment: put the slash back so that it gets lexed
                // as a regular token.
                m_input.unget();
                return;
            }

            continue;
        }

        return;
    }
}

// Reads the body of a string literal. readToken() consumes the opening
// double quote before calling this, so reading starts at the first
// character of the content and stops at the closing double quote.
//
// loc is the location built by the caller; it is moved into the result.
//
// Returns a Term built from loc and the characters read. If a line break or
// the end of the input is reached before the closing quote, a diagnostic is
// printed and the term is flagged with setFaulty().
Term Lexer::readStringLiteral( Location&& loc )
{
    string str;
    bool complete = false;

    // TODO this is very simplistic, at some point
    // we'll want to handle escape characters etc.
    while( m_input.good() )
    {
        auto c = m_input.get();

        // When the input is exhausted, get() fails and returns the EOF
        // marker instead of a character. Stop here so that it doesn't end
        // up appended to the literal.
        if( !m_input.good() )
            break;

        if( c == '"' )
        {
            complete = true;
            break;
        }

        if( c == '\n' )
        {
            // The line break has been consumed, so line tracking must be
            // updated even though the literal is being abandoned.
            newLine();
            break;
        }

        str += static_cast< char >( c );
    }

    Term result( move( loc ), str );

    if( !complete )
    {
        cout << result.location() << ": unterminated string literal.\n";
        result.setFaulty();
    }

    return result;
}

// Reads a run of decimal digits and builds an integer Term from it.
//
// loc is the location built by the caller; it is moved into the result.
//
// All consecutive digits are always consumed. If their value doesn't fit in
// a uint64_t, a diagnostic is printed and the returned term is flagged with
// setFaulty() instead of silently carrying a wrapped-around value.
Term Lexer::readIntegerLiteral( Location&& loc )
{
    constexpr uint64_t maxValue = ~uint64_t( 0 );

    uint64_t val = 0;
    bool overflow = false;

    while( m_input.good() )
    {
        auto c = m_input.peek();
        if( !isdigit( c ) )
            break;

        m_input.get();

        // Once overflow has been detected, keep consuming the remaining
        // digits (they are part of the same token) but stop accumulating.
        if( overflow )
            continue;

        const uint64_t digit = static_cast< uint64_t >( c - '0' );

        // Equivalent to "val * 10 + digit > maxValue", written so that the
        // test itself cannot overflow.
        if( val > ( maxValue - digit ) / 10 )
        {
            overflow = true;
            continue;
        }

        val = val * 10 + digit;
    }

    Term result( move( loc ), val );

    if( overflow )
    {
        cout << result.location() << ": integer literal too large.\n";
        result.setFaulty();
    }

    return result;
}

// Reads an identifier made of letters, digits and underscores, stopping at
// the first character that is none of those (or when the stream stops being
// good). The characters read are wrapped, together with loc, into a Term.
Term Lexer::readAlphanumericIdentifier( Location&& loc )
{
    // TODO this will need to be reworked some day to handle unicode
    // identifiers.

    string ident;

    while( m_input.good() )
    {
        const auto next = m_input.peek();

        const bool isIdentChar = isalpha( next ) || isdigit( next ) || next == '_';
        if( !isIdentChar )
            break;

        // Only consume the character once we know it belongs to the identifier.
        m_input.get();
        ident += next;
    }

    return Term( move( loc ), ident );
}

// TODO this is inefficient, put that stuff into a lookup table.
// Also maybe instead of being hardcoded in the lexer it could be
// configurable, so that when a new operator is defined it can reserve
// its characters here (either as a first char only, or not)

// Currently, - @ $ # ~ and ! can only be used as the first character of an operator identifier,
// so that they can be used as the first character of a prefix operator (we don't want to make it mandatory to insert
// a space between an infix and a prefix operator).
bool Lexer::isOpFirstChar( char c )
{
    static string charTable = "+*/%&|^<>=?:.,-@$#!~";
    return charTable.find_first_of( c ) != string::npos;
}

bool Lexer::isOpChar( char c )
{
    static string charTable = "+*/%&|^<>=?:.,";
    return charTable.find_first_of( c ) != string::npos;
}

// Reads an operator identifier: one character accepted by isOpFirstChar(),
// followed by any number of characters accepted by isOpChar().
//
// loc is the location built by the caller; it is moved into the result.
//
// readToken() dispatches here based on isOpFirstChar(), so the first
// character must be validated with that same predicate. Checking it with
// isOpChar() would reject the first-char-only characters (- @ $ # ! ~),
// consume nothing, and return an empty identifier without making any
// progress through the input.
Term Lexer::readOperatorIdentifier( Location&& loc )
{
    // TODO this will need to be reworked some day to handle unicode
    // identifiers.

    string str;

    while( m_input.good() )
    {
        auto c = m_input.peek();

        // The first character has a wider set of allowed values than the
        // following ones.
        const bool accepted = str.empty() ? isOpFirstChar( c ) : isOpChar( c );
        if( !accepted )
            break;

        m_input.get();
        str += c;
    }

    Term result( move( loc ), str );
    return result;
}

// Reads the next token from the input.
//
// Token separators are skipped first (see skipSpacingIfAny()). The kind of
// token is then decided from the next character alone: a digit starts an
// integer literal, a letter or underscore starts an alphanumeric identifier,
// a character accepted by isOpFirstChar() starts an operator identifier,
// a bracket/brace/parenthesis is a token of its own, and a double quote
// starts a string literal.
//
// Returns nullopt if the stream is not good once separators have been
// skipped, or if the next character cannot start any token. In the latter
// case a diagnostic is printed and the character is left unconsumed.
optional< Term > Lexer::readToken()
{
    skipSpacingIfAny();
    if( !m_input.good() )
        return nullopt;

    // The last argument is the current stream position relative to the
    // position recorded at the last line break.
    Location loc( m_filename, m_lineNum, m_input.tellg() - m_lastLineBreakOffset );

    const auto next = m_input.peek();

    if( isdigit( next ) )
        return readIntegerLiteral( move( loc ) );

    if( next == '_' || isalpha( next ) )
        return readAlphanumericIdentifier( move( loc ) );

    if( isOpFirstChar( next ) )
        return readOperatorIdentifier( move( loc ) );

    // Consumes the pending character and hands the location over, for the
    // tokens that are fully identified by their first character.
    auto take = [ & ]() -> Location&&
    {
        m_input.get();
        return move( loc );
    };

    switch( next )
    {
        case '{': return Term( take(), "{"_sid );
        case '}': return Term( take(), "}"_sid );
        case '(': return Term( take(), "("_sid );
        case ')': return Term( take(), ")"_sid );
        case '[': return Term( take(), "["_sid );
        case ']': return Term( take(), "]"_sid );

        // The opening quote is consumed here; readStringLiteral() starts
        // at the first character of the literal's content.
        case '"': return readStringLiteral( take() );
    }

    cout << loc << ": unrecognized token.\n";
    return nullopt;
}