#include "lex.h"
using namespace empathy;
using namespace empathy::lex;
// Builds a lexer that reads its characters from `input`.
//
// `filename` is passed through Location::GetCachedFilename() and the result
// is kept in m_filename; m_input is initialised from the stream itself.
// NOTE(review): the stream is taken by reference, so the lexer presumably
// only borrows it and the caller must keep it alive - confirm against the
// class declaration.
Lexer::Lexer( istream& input, const string& filename ) :
m_filename( Location::GetCachedFilename( filename ) ),
m_input( input )
{}
// Registers that a line break has just been consumed: advances the line
// counter and remembers the position reported by getCurrentPos() as the
// offset of the most recent line break.
void Lexer::newLine()
{
    m_lineNum += 1;
    m_lastLineBreakOffset = getCurrentPos();
}
// Consumes everything that separates tokens: line breaks, other whitespace
// and comments. Stops (without consuming it) at the first character that is
// none of those, or when the stream stops being good.
//
// Every '\n' consumed is reported through newLine(), so callers can detect
// that a line boundary was crossed by comparing m_lineNum before and after.
void Lexer::skipSpacingIfAny()
{
    while( m_input.good() )
    {
        const auto next = m_input.peek();

        if( next == '\n' )
        {
            // Line breaks are whitespace too, but they must be counted.
            m_input.get();
            newLine();
        }
        else if( isspace( next ) )
        {
            m_input.get();
        }
        else if( next == '/' )
        {
            // A slash only starts a comment when followed by '/' or '*'.
            // Consume it tentatively to look at the character after it.
            m_input.get();
            const auto following = m_input.peek();

            if( following != '/' && following != '*' )
            {
                // Plain slash: put it back, it belongs to the next token.
                m_input.unget();
                return;
            }

            // skipComment() is entered with the leading '/' already consumed
            // and the second character ('/' or '*') still pending.
            skipComment();
        }
        else
        {
            return;
        }
    }
}
// Reads the longest run of identifier characters (letters, digits, '_' and
// '#') available at the current position and returns it as a StringId term.
// The location is built from `ls` with the identifier's length in characters.
//
// TODO this will need to be reworked some day to handle unicode
// identifiers.
TermLoc Lexer::readAlphanumericIdentifier( const LocationStart& ls )
{
    const auto isIdentChar = []( auto ch )
    {
        return isalpha( ch ) || isdigit( ch ) || ch == '_' || ch == '#';
    };

    string name;

    // The stream is only peeked while it is still good; the first
    // non-identifier character is left in the stream for the next token.
    while( m_input.good() && isIdentChar( m_input.peek() ) )
        name += static_cast< char >( m_input.get() );

    return TermLoc( StringId( name ), ls.toLoc( name.size() ) );
}
// TODO this is inefficient, put that stuff into a lookup table.
// Also maybe instead of being hardcoded in the lexer it could be
// configurable, so that when a new operator is defined it can reserve
// its characters here (either as a first char only, or not)
// Currently, - @ $ ~ and ! can only be used as the first character of an operator identifier,
// so that they can be used as the first character of a prefix operator (we don't want to make it mandatory to insert
// a space between an infix and a prefix operator).
bool Lexer::isOpFirstChar( char c )
{
static string charTable = "+*/%&|^<>=?:.,-@$!~";
return charTable.find_first_of( c ) != string::npos;
}
bool Lexer::isOpChar( char c )
{
static string charTable = "+*/%&|^<>=?:.,";
return charTable.find_first_of( c ) != string::npos;
}
// Reads an operator identifier: the character at the current position is
// taken unconditionally (the caller is expected to have checked it), then
// every following character accepted by isOpChar() is appended.
// Returns the text as a StringId term whose location is built from `ls`
// with the operator's length in characters.
//
// TODO this will need to be reworked some day to handle unicode
// identifiers.
TermLoc Lexer::readOperatorIdentifier( const LocationStart& ls )
{
    string op;
    op += static_cast< char >( m_input.get() );

    // The stream is only peeked while it is still good; the first character
    // that cannot continue an operator is left for the next token.
    while( m_input.good() && isOpChar( m_input.peek() ) )
        op += static_cast< char >( m_input.get() );

    return TermLoc( StringId( op ), ls.toLoc( op.size() ) );
}
// Reads a semicolon token. The semicolon is an ordinary ";" identifier with
// one special lexing rule: consecutive semicolons, even when separated by
// spacing or comments, are collapsed into a single token.
//
// NOTE(review): spacing is skipped before every look-ahead, so the spacing
// that follows the last semicolon is consumed here as well, and the length
// given to ls.toLoc() covers everything consumed (from the position on entry
// to the position on exit), not only the semicolons.
TermLoc Lexer::readSemicolon( const LocationStart& ls )
{
    const auto startPos = getCurrentPos();

    while( m_input.good() )
    {
        skipSpacingIfAny();

        if( m_input.peek() == ';' )
        {
            m_input.get();
            continue;
        }

        break;
    }

    return TermLoc( ";"_sid, ls.toLoc( getCurrentPos() - startPos ) );
}
// Reads the next token from the input.
//
// Leading spacing and comments are skipped first. If at least one line break
// was crossed while skipping, a single Delimiter::Newline token is returned
// and nothing else is consumed by this call.
//
// Otherwise the first pending character selects the kind of token:
//   digit                      -> readIntegerLiteral()
//   letter, '_' or '#'         -> readAlphanumericIdentifier()
//   ( ) { } [ ]                -> the matching Delimiter
//   '"'                        -> readStringLiteral()
//   ';'                        -> readSemicolon()
//   isOpFirstChar() character  -> readOperatorIdentifier()
//
// Returns nullopt when the stream is no longer good (the look-ahead cache is
// cleared in that case), and also when the pending character matches none of
// the above: a lexer error is emitted and the character is left unconsumed.
optional< TermLoc > Lexer::readToken()
{
    const auto lineBefore = m_lineNum;
    skipSpacingIfAny();
    auto ls = getCurrentLocationStart();

    if( m_lineNum != lineBefore )
        return TermLoc( Delimiter::Newline, ls.toLoc() );

    if( !m_input.good() )
    {
        m_lookAheadCache.clear();
        return nullopt;
    }

    const auto next = m_input.peek();

    if( isdigit( next ) )
        return readIntegerLiteral( ls );

    if( next == '_' || next == '#' || isalpha( next ) )
        return readAlphanumericIdentifier( ls );

    // Single-character delimiters: consume the character, then wrap the
    // delimiter with the location of that character.
    const auto consumeAs = [&]( auto delimiter ) -> optional< TermLoc >
    {
        m_input.get();
        return TermLoc( delimiter, ls.toLoc() );
    };

    switch( next )
    {
        case '(': return consumeAs( Delimiter::OpenParen );
        case ')': return consumeAs( Delimiter::CloseParen );
        case '{': return consumeAs( Delimiter::OpenBrace );
        case '}': return consumeAs( Delimiter::CloseBrace );
        case '[': return consumeAs( Delimiter::OpenBracket );
        case ']': return consumeAs( Delimiter::CloseBracket );
        case '"': return readStringLiteral( ls );
        case ';': return readSemicolon( ls );
        default:  break;
    }

    if( !isOpFirstChar( next ) )
    {
        DiagnosticsManager::GetInstance().emitLexerErrorMessage(
            ls.toLoc(), "unrecognized token." );
        return nullopt;
    }

    return readOperatorIdentifier( ls );
}