#include "lex.h"
using namespace goose;
using namespace goose::lex;
// Builds a lexer reading from `input`.
//
// input:    stream the tokens are read from; m_input is initialized from it
//           (presumably held by reference, so it would have to outlive the
//           lexer -- confirm against the declaration in lex.h).
// filename: name of the source being lexed. It is passed through
//           Location::GetCachedFilename() and the result is kept in m_filename.
Lexer::Lexer( istream& input, const string& filename )
    : m_filename( Location::GetCachedFilename( filename ) )
    , m_input( input )
{}
// Registers a line break: increments the line counter and stores the current
// stream position as the offset of the most recent line break.
// skipSpacingIfAny() calls this right after consuming a '\n'.
void Lexer::newLine()
{
    m_lineNum += 1;
    m_lastLineBreakOffset = getCurrentPos();
}
// Consumes everything that separates tokens: whitespace, line breaks (which
// are reported through newLine()) and comments. Returns as soon as the next
// character cannot start any of these, or when the stream stops being good.
void Lexer::skipSpacingIfAny()
{
    while( m_input.good() )
    {
        const auto next = m_input.peek();

        if( next == '/' )
        {
            // A '/' only opens a comment when followed by '/' or '*'. Consume it
            // to look one character further ahead.
            m_input.get();
            const auto following = m_input.peek();

            if( following != '/' && following != '*' )
            {
                // Lone '/': put it back so that it gets lexed as a token.
                m_input.unget();
                return;
            }

            // skipComment() is entered with the leading '/' already consumed and
            // the second character of the comment opener still unread.
            skipComment();
        }
        else if( next == '\n' )
        {
            m_input.get();
            newLine();
        }
        else if( isspace( next ) )
        {
            m_input.get();
        }
        else
        {
            return;
        }
    }
}
// Reads an identifier made of letters, digits, '_', '#' and '@', stopping at
// the first character outside that set or when the stream stops being good.
//
// ls: location point of the identifier's first character.
//
// Returns the identifier as a StringId, paired with a location built from `ls`
// and the number of characters read.
TermLoc Lexer::readAlphanumericIdentifier( const LocationPoint& ls )
{
    // TODO this will need to be reworked some day to handle unicode
    // identifiers.
    const auto isIdentifierChar = []( int ch )
    {
        return isalpha( ch ) || isdigit( ch ) || ch == '_' || ch == '#' || ch == '@';
    };

    string name;
    while( m_input.good() && isIdentifierChar( m_input.peek() ) )
        name += static_cast< char >( m_input.get() );

    return TermLoc( StringId( name ), ls.toLoc( name.size() ) );
}
// TODO this is inefficient, put that stuff into a lookup table.
// Also maybe instead of being hardcoded in the lexer it could be configurable, so that when a new
// operator is defined it can reserve its characters here (either as a first char only, or not)
// Currently, - $ ~ and ! can only be used as the first character of an operator identifier, so that
// they can be used as the first character of a prefix operator (we don't want to make it mandatory
// to insert a space between an infix and a prefix operator).
bool Lexer::isOpFirstChar( char c )
{
static string charTable = "+*/%&|^<>=?:.,-$!~";
return charTable.find_first_of( c ) != string::npos;
}
bool Lexer::isOpChar( char c, char firstChar )
{
// $ is accepted if the identifier started with a $, because we have a $$ operator. Bit hackish,
// bleh
if( firstChar == '$' && c == '$' )
return true;
static string charTable = "+*/%&|^<>=?:.,";
return charTable.find_first_of( c ) != string::npos;
}
// Reads an operator identifier. The first character is consumed
// unconditionally (readToken() only gets here after isOpFirstChar() accepted
// it); further characters are appended for as long as isOpChar() accepts them.
//
// ls: location point of the operator's first character.
//
// Returns the operator as a StringId, paired with a location built from `ls`
// and the number of characters read.
TermLoc Lexer::readOperatorIdentifier( const LocationPoint& ls )
{
    // TODO this will need to be reworked some day to handle unicode identifiers.
    const auto leading = m_input.get();

    string symbol;
    symbol += static_cast< char >( leading );

    while( m_input.good() && isOpChar( m_input.peek(), leading ) )
        symbol += static_cast< char >( m_input.get() );

    return TermLoc( StringId( symbol ), ls.toLoc( symbol.size() ) );
}
// Reads a semicolon token.
//
// The semicolon is just an identifier, but with a special lexing rule:
// consecutive semicolons are collapsed into one, even when they are separated
// by spacing or comments.
//
// ls: location point of the first semicolon.
//
// Returns the ";" identifier, paired with a location built from `ls` whose
// length runs from the first semicolon to just past the last one consumed.
TermLoc Lexer::readSemicolon( const LocationPoint& ls )
{
    const auto startPos = getCurrentPos();

    // End of the token: the position right after the last ';' consumed so far.
    // It is sampled immediately after each ';' rather than once after the loop,
    // because by then skipSpacingIfAny() has already consumed the spacing and
    // comments that follow the last ';' (and may have run into the end of the
    // input), none of which belongs to the token.
    auto endPos = startPos;

    while( m_input.good() )
    {
        skipSpacingIfAny();

        if( m_input.peek() != ';' )
            break;

        m_input.get();
        endPos = getCurrentPos();
    }

    return TermLoc( ";"_sid, ls.toLoc( endPos - startPos ) );
}
// Reads the next token from the input.
//
// Leading spacing and comments are skipped first. The first character of the
// token then selects the rule used to read it: integer literal, alphanumeric
// identifier, delimiter, character or string literal, semicolon, or operator
// identifier.
//
// Returns the token with its location, or nullopt when
//  - the stream is no longer good after skipping spacing (the look-ahead cache
//    is cleared in that case), or
//  - the next character cannot start any token (a lexer error is emitted; the
//    offending character is left unread).
optional< TermLoc > Lexer::readToken()
{
    skipSpacingIfAny();
    auto start = currentLocationPoint();

    if( !m_input.good() )
    {
        m_lookAheadCache.clear();
        return nullopt;
    }

    auto next = m_input.peek();

    if( isdigit( next ) )
        return readIntegerLiteral( start );

    const bool startsIdentifier = isalpha( next ) || next == '_' || next == '#' || next == '@';
    if( startsIdentifier )
        return readAlphanumericIdentifier( start );

    // Delimiters are single characters: consume the character and wrap the
    // matching delimiter value with the token's location.
    auto consumeDelimiter = [&]( auto delim ) -> optional< TermLoc >
    {
        m_input.get();
        return TermLoc( delim, start.toLoc() );
    };

    switch( next )
    {
        case '(': return consumeDelimiter( Delimiter::OpenParen );
        case ')': return consumeDelimiter( Delimiter::CloseParen );
        case '{': return consumeDelimiter( Delimiter::OpenBrace );
        case '}': return consumeDelimiter( Delimiter::CloseBrace );
        case '[': return consumeDelimiter( Delimiter::OpenBracket );
        case ']': return consumeDelimiter( Delimiter::CloseBracket );

        // The readers below consume the opening character themselves.
        case '\'': return readCharacterLiteral( start );
        case '"': return readStringLiteral( start );
        case ';': return readSemicolon( start );
    }

    if( isOpFirstChar( next ) )
        return readOperatorIdentifier( start );

    DiagnosticsManager::GetInstance().emitLexerErrorMessage( start.toLoc(), "unrecognized token." );
    return nullopt;
}