#include "lex.h"
using namespace empathy;
using namespace empathy::lex;
// Builds a lexer reading characters from `input`.
// `filename` is stored unchanged and passed to every Location built by readToken().
// NOTE(review): m_input is presumably held as an istream reference, in which case
// `input` must outlive the lexer - confirm against the declaration in lex.h.
Lexer::Lexer( istream& input, const ptr< string >& filename ) :
m_filename( filename ),
m_input( input )
{}
// Records a line break: remembers the current stream position as the start of
// the new line (used by readToken() to compute columns) and bumps the line counter.
// Callers invoke this right after consuming a '\n'.
void Lexer::newLine()
{
    m_lastLineBreakOffset = m_input.tellg();
    ++m_lineNum;
}
// Consumes whitespace, line breaks and comments until the next significant
// character (or until the stream stops being good()).
// Line breaks go through newLine() so that line/column tracking stays accurate.
void Lexer::skipSpacingIfAny()
{
    for( ;; )
    {
        if( !m_input.good() )
            return;

        const auto next = m_input.peek();

        // Must be tested before isspace(), which would also accept '\n'.
        if( next == '\n' )
        {
            m_input.get();
            newLine();
        }
        else if( isspace( next ) )
        {
            m_input.get();
        }
        else if( next == '/' )
        {
            // A slash only starts a comment when followed by '/' or '*'.
            // Consume it to look one character further, and put it back
            // if it turns out to be an ordinary operator character.
            m_input.get();
            const auto following = m_input.peek();
            if( following != '/' && following != '*' )
            {
                m_input.unget();
                return;
            }

            // The leading '/' has already been consumed at this point.
            skipComment();
        }
        else
        {
            return;
        }
    }
}
// Reads the body of a string literal; the opening '"' has already been consumed
// by the caller. Reading stops at the closing '"', at a line break, or at the
// end of the input. In the latter two cases the literal is unterminated: a
// diagnostic is printed and the returned term is flagged as faulty.
//
// loc: location of the literal, moved into the returned term.
// Returns a Term holding the characters read (without the quotes).
Term Lexer::readStringLiteral( Location&& loc )
{
    string str;
    bool complete = false;

    // TODO this is very simplistic, at some point
    // we'll want to handle escape characters etc.
    while( m_input.good() )
    {
        const auto c = m_input.get();

        // get() returns eof() once the input is exhausted. That value is not
        // a character and must not end up in the literal.
        if( c == istream::traits_type::eof() )
            break;

        if( c == '"' )
        {
            complete = true;
            break;
        }

        if( c == '\n' )
        {
            // The line break has been consumed, so account for it.
            newLine();
            break;
        }

        str += static_cast< char >( c );
    }

    Term result( move( loc ), str );

    if( !complete )
    {
        cout << result.location() << ": unterminated string literal.\n";
        result.setFaulty();
    }

    return result;
}
// Reads a decimal integer literal. The caller (readToken) only calls this when
// the next character is a digit.
//
// loc: location of the literal, moved into the returned term.
// Returns a Term holding an APSInt built from the literal's digits, with
// leading zeros removed.
Term Lexer::readIntegerLiteral( Location&& loc )
{
    // skip leading zeros
    while( m_input.good() )
    {
        const auto c = m_input.peek();
        if( c != '0' )
            break;
        m_input.get();
    }

    string str;
    while( m_input.good() )
    {
        const auto c = m_input.peek();
        if( !isdigit( c ) )
            break;
        m_input.get();
        str.push_back( static_cast< char >( c ) );
    }

    // A literal made only of zeros ("0", "000", ...) has had all of its digits
    // stripped above. Put a single zero back so that the value is never built
    // from an empty digit string.
    if( str.empty() )
        str = "0";

    return Term( move( loc ), APSInt( str ) );
}
// Reads an identifier made of letters, digits, '_' and '#', stopping at the
// first character outside of that set (or when the stream is no longer good()).
//
// loc: location of the identifier, moved into the returned term.
// Returns a Term holding the StringId built from the characters read.
//
// TODO this will need to be reworked some day to handle unicode
// identifiers.
Term Lexer::readAlphanumericIdentifier( Location&& loc )
{
    string name;

    for( ;; )
    {
        if( !m_input.good() )
            break;

        const auto next = m_input.peek();
        const bool accepted = isalpha( next ) || isdigit( next ) || next == '_' || next == '#';
        if( !accepted )
            break;

        m_input.get();
        name += static_cast< char >( next );
    }

    return Term( move( loc ), StringId( name ) );
}
// TODO this is inefficient, put that stuff into a lookup table.
// Also maybe instead of being hardcoded in the lexer it could be
// configurable, so that when a new operator is defined it can reserve
// its characters here (either as a first char only, or not)
// Currently, - @ $ ~ and ! can only be used as the first character of an operator identifier,
// so that they can be used as the first character of a prefix operator (we don't want to make it mandatory to insert
// a space between an infix and a prefix operator).
bool Lexer::isOpFirstChar( char c )
{
static string charTable = "+*/%&|^<>=?:.,-@$!~";
return charTable.find_first_of( c ) != string::npos;
}
bool Lexer::isOpChar( char c )
{
static string charTable = "+*/%&|^<>=?:.,";
return charTable.find_first_of( c ) != string::npos;
}
// Reads an operator identifier. The first character is taken unconditionally
// (readToken() has already checked it with isOpFirstChar()); the following
// characters are taken for as long as isOpChar() accepts them.
//
// loc: location of the identifier, moved into the returned term.
// Returns a Term holding the StringId built from the characters read.
//
// TODO this will need to be reworked some day to handle unicode
// identifiers.
Term Lexer::readOperatorIdentifier( Location&& loc )
{
    string name;

    const auto first = m_input.get();
    name += static_cast< char >( first );

    for( ;; )
    {
        if( !m_input.good() )
            break;

        const auto next = m_input.peek();
        if( !isOpChar( next ) )
            break;

        m_input.get();
        name += static_cast< char >( next );
    }

    return Term( move( loc ), StringId( name ) );
}
// Reads a semicolon. The semicolon is just an identifier, but with a special
// lexing rule: consecutive semicolons are collapsed into one.
// Spacing is skipped before each check, so semicolons separated only by
// whitespace, line breaks or comments collapse as well, and any spacing that
// follows the last semicolon is consumed here too.
//
// loc: location of the first semicolon, moved into the returned term.
// Returns a Term holding the ";" identifier.
Term Lexer::readSemicolon( Location&& loc )
{
    for( ;; )
    {
        if( !m_input.good() )
            break;

        skipSpacingIfAny();

        if( m_input.peek() != ';' )
            break;

        m_input.get();
    }

    return Term( move( loc ), ";"_sid );
}
// Reads the next token from the input.
//
// Leading spacing and comments are skipped first. If at least one line break
// was crossed while doing so, a single Newline delimiter is returned and the
// actual token is left for the next call.
//
// Returns the token read, or nullopt when the stream is no longer good() or
// when the next character cannot start any token. In the latter case a
// diagnostic is printed and the character is not consumed, so the caller
// cannot tell that case apart from the end of the input by the return value.
optional< Term > Lexer::readToken()
{
    const auto lineBefore = m_lineNum;
    skipSpacingIfAny();

    // Columns are 1-based and relative to the last recorded line break.
    Location loc( m_filename, m_lineNum, m_input.tellg() - m_lastLineBreakOffset + 1 );

    if( m_lineNum != lineBefore )
        return Term( move( loc ), Delimiter::Newline );

    if( !m_input.good() )
        return nullopt;

    const auto c = m_input.peek();

    if( isdigit( c ) )
        return readIntegerLiteral( move( loc ) );

    if( c == '_' || c == '#' || isalpha( c ) )
        return readAlphanumericIdentifier( move( loc ) );

    // Consumes the current character and wraps the given delimiter into a term.
    auto delimiter = [&]( Delimiter kind ) -> optional< Term >
    {
        m_input.get();
        return Term( move( loc ), kind );
    };

    switch( c )
    {
        case '(': return delimiter( Delimiter::OpenParen );
        case ')': return delimiter( Delimiter::CloseParen );
        case '{': return delimiter( Delimiter::OpenBrace );
        case '}': return delimiter( Delimiter::CloseBrace );
        case '[': return delimiter( Delimiter::OpenBracket );
        case ']': return delimiter( Delimiter::CloseBracket );

        case '"':
            // The opening quote is consumed here, readStringLiteral() expects
            // to start on the first character of the literal's body.
            m_input.get();
            return readStringLiteral( move( loc ) );

        case ';':
            // Not consumed here: readSemicolon() takes care of it.
            return readSemicolon( move( loc ) );
    }

    if( isOpFirstChar( c ) )
        return readOperatorIdentifier( move( loc ) );

    cout << loc << ": unrecognized token.\n";
    return nullopt;
}