Files
Fig/src/lexer.cpp

525 lines
16 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <fig_string.hpp>
#include <error.hpp>
#include <token.hpp>
#include <lexer.hpp>
#include <fig_string.hpp>
#include <utils.hpp>
namespace Fig
{
// Lookup table mapping the spelling of every operator / punctuation symbol
// to its TokenType. scanSymbol() queries it with a two-character candidate
// first and falls back to the single-character spelling.
const std::unordered_map<FString, TokenType> Lexer::symbol_map{
// two-character symbols
{FString(u8"=="), TokenType::Equal},
{FString(u8"!="), TokenType::NotEqual},
{FString(u8"<="), TokenType::LessEqual},
{FString(u8">="), TokenType::GreaterEqual},
{FString(u8"<<"), TokenType::ShiftLeft},
{FString(u8">>"), TokenType::ShiftRight},
{FString(u8"+="), TokenType::PlusEqual},
{FString(u8"-="), TokenType::MinusEqual},
{FString(u8"*="), TokenType::AsteriskEqual},
{FString(u8"/="), TokenType::SlashEqual},
{FString(u8"%="), TokenType::PercentEqual},
{FString(u8"^="), TokenType::CaretEqual},
{FString(u8"++"), TokenType::DoublePlus},
{FString(u8"--"), TokenType::DoubleMinus},
{FString(u8"&&"), TokenType::DoubleAmpersand},
{FString(u8"||"), TokenType::DoublePipe},
{FString(u8":="), TokenType::Walrus},
{FString(u8"**"), TokenType::Power},
{FString(u8"->"), TokenType::RightArrow},
{FString(u8"=>"), TokenType::DoubleArrow},
// single-character symbols
{FString(u8"+"), TokenType::Plus},
{FString(u8"-"), TokenType::Minus},
{FString(u8"*"), TokenType::Asterisk},
{FString(u8"/"), TokenType::Slash},
{FString(u8"%"), TokenType::Percent},
{FString(u8"^"), TokenType::Caret},
{FString(u8"&"), TokenType::Ampersand},
{FString(u8"|"), TokenType::Pipe},
{FString(u8"~"), TokenType::Tilde},
{FString(u8"="), TokenType::Assign},
{FString(u8"<"), TokenType::Less},
{FString(u8">"), TokenType::Greater},
{FString(u8"."), TokenType::Dot},
{FString(u8","), TokenType::Comma},
{FString(u8":"), TokenType::Colon},
{FString(u8";"), TokenType::Semicolon},
{FString(u8"'"), TokenType::SingleQuote},
{FString(u8"\""), TokenType::DoubleQuote},
{FString(u8"("), TokenType::LeftParen},
{FString(u8")"), TokenType::RightParen},
{FString(u8"["), TokenType::LeftBracket},
{FString(u8"]"), TokenType::RightBracket},
{FString(u8"{"), TokenType::LeftBrace},
{FString(u8"}"), TokenType::RightBrace}};
// Lookup table mapping reserved words to their TokenType.
// scanIdentifier() uses it twice: an exact match turns the identifier into a
// keyword token, and a match on the lower-cased spelling only raises a
// "too similar to a keyword" warning.
const std::unordered_map<FString, TokenType> Lexer::keyword_map{
{FString(u8"and"), TokenType::And},
{FString(u8"or"), TokenType::Or},
{FString(u8"not"), TokenType::Not},
{FString(u8"import"), TokenType::Import},
{FString(u8"func"), TokenType::Function},
{FString(u8"var"), TokenType::Variable},
{FString(u8"const"), TokenType::Const},
{FString(u8"final"), TokenType::Final},
{FString(u8"while"), TokenType::While},
{FString(u8"for"), TokenType::For},
{FString(u8"if"), TokenType::If},
{FString(u8"else"), TokenType::Else},
{FString(u8"struct"), TokenType::Struct},
{FString(u8"interface"), TokenType::Interface},
{FString(u8"implement"), TokenType::Implement},
{FString(u8"public"), TokenType::Public},
{FString(u8"return"), TokenType::Return},
// Disabled entries: primitive type names are currently not reserved words.
// {FString(u8"Null"), TokenType::TypeNull},
// {FString(u8"Int"), TokenType::TypeInt},
// {FString(u8"String"), TokenType::TypeString},
// {FString(u8"Bool"), TokenType::TypeBool},
// {FString(u8"Double"), TokenType::TypeDouble},
};
// Advances the iterator past the remainder of the current line, including
// its terminating '\n' when one is present, and bumps the line counter.
void Lexer::skipLine()
{
    // Check for remaining input *before* dereferencing: the original order
    // (`*it != '\n' and hasNext()`) read the iterator at end of input.
    while (hasNext() and *it != U'\n')
    {
        next();
    }
    // Only step over the newline if there is one; at end of input there is
    // nothing left to consume.
    if (hasNext())
    {
        next(); // skip '\n'
    }
    // NOTE(review): kept unconditional to preserve existing behaviour. If
    // next() already updates `line` on '\n', this counts the line twice —
    // confirm against next()'s definition.
    ++line;
}
// Scans an identifier-shaped word starting at the current character and
// classifies it as a keyword, a bool/null literal, or a plain identifier.
// Plain identifiers may additionally trigger warnings 1 and 2.
Token Lexer::scanIdentifier()
{
    // Gather the longest run of alphanumeric characters and underscores.
    FString identifier;
    for (; hasNext(); next())
    {
        UTF8Char current = *it;
        if (!(current.isAlnum() || current == U'_'))
        {
            break;
        }
        identifier += current.getString();
    }

    // Exact keyword match: one lookup yields both the test and the type.
    if (auto keyword = this->keyword_map.find(identifier); keyword != this->keyword_map.end())
    {
        return Token(identifier, keyword->second);
    }

    // Literal words that are not part of the keyword table.
    if (identifier == u8"true" || identifier == u8"false")
    {
        return Token(identifier, TokenType::LiteralBool);
    }
    if (identifier == u8"null")
    {
        // null instance
        return Token(identifier, TokenType::LiteralNull);
    }

    // From here on the word is an ordinary identifier; emit style warnings.
    if (keyword_map.contains(Utils::toLower(identifier)))
    {
        // Identifier is too similar to a keyword or a primitive type
        pushWarning(1, identifier);
    }
    if (identifier.length() <= 1)
    {
        // The identifier is too abstract
        pushWarning(2, identifier);
    }
    return Token(identifier, TokenType::Identifier);
}
// Scans the body of a single-line string literal. On entry the iterator is
// positioned just after the opening '"'. Returns a LiteralString token with
// escape sequences decoded, or IllegalTok (with `error` set) on an
// unsupported escape or when input ends before the literal is closed.
//
// Supported escapes: \n \t \v \b \" \' \\
Token Lexer::scanString()
{
    FString str;
    bool unterminated = true;
    // Column of the opening quote, used when reporting an unterminated literal.
    size_t str_start_col = it.column() - 1;
    while (hasNext())
    {
        UTF8Char c = *it;
        if (c == U'"' || c == U'\n')
        {
            // NOTE(review): a raw newline closes the literal exactly like a
            // quote does (existing behaviour, preserved) — confirm intended.
            next();
            unterminated = false;
            break;
        }
        if (c != U'\\')
        {
            // Ordinary character: copy verbatim.
            str += c.getString();
            next();
            continue;
        }

        // Escape sequence: step over the backslash first, THEN make sure a
        // character follows. The original checked for end of input before
        // advancing, so a trailing backslash read past the end.
        next();
        if (!hasNext())
        {
            error = SyntaxError(u8"Unterminated FString", this->line, it.column());
            return IllegalTok;
        }
        UTF8Char ec = *it;
        if (ec == U'n')
        {
            next();
            str += u8"\n";
        }
        else if (ec == U't')
        {
            next();
            str += u8"\t";
        }
        else if (ec == U'v')
        {
            next();
            str += u8"\v";
        }
        else if (ec == U'b')
        {
            next();
            str += u8"\b";
        }
        else if (ec == U'"')
        {
            next();
            str += u8"\"";
        }
        else if (ec == U'\'')
        {
            next();
            str += u8"'";
        }
        else if (ec == U'\\')
        {
            // Added for parity with scanMultilineString(): without it a
            // single-line string had no way to contain a backslash.
            next();
            str += u8"\\";
        }
        else
        {
            error = SyntaxError(FStringView(
                                    std::format(
                                        "Unsupported escape character: {}",
                                        FString(ec.getString()).toBasicString())),
                                this->line,
                                it.column());
            return IllegalTok;
        }
    }
    if (unterminated)
    {
        error = SyntaxError(u8"Unterminated FString", this->line, str_start_col);
        return IllegalTok;
    }
    return Token(str, TokenType::LiteralString);
}
// Scans the body of a raw string literal (r"..."): characters are copied
// verbatim with no escape processing. The literal ends at the first '"' or
// '\n', which is consumed. Returns IllegalTok (with `error` set) when input
// runs out before either terminator is seen.
Token Lexer::scanRawString()
{
    FString raw;
    // Column reported if the literal turns out to be unterminated.
    size_t openingColumn = it.column() - 1;
    bool closed = false;
    while (hasNext() and not closed)
    {
        UTF8Char current = *it;
        if (current == U'"' || current == U'\n')
        {
            closed = true; // terminator is consumed below, then the loop ends
        }
        else
        {
            raw += current.getString();
        }
        next();
    }
    if (not closed)
    {
        error = SyntaxError(u8"Unterminated FString", this->line, openingColumn);
        return IllegalTok;
    }
    return Token(raw, TokenType::LiteralString);
}
// Scans the body of a multi-line string literal, which ends at three
// consecutive '"' characters. Escape sequences (\n \t \v \b \" \' \\) are
// decoded; runs of one or two quotes are kept as literal content.
// Returns IllegalTok (with `error` set) on an unsupported escape or when
// input ends before the closing delimiter.
// NOTE(review): assumes the caller has already consumed the opening
// delimiter — no caller is visible in this file; confirm.
Token Lexer::scanMultilineString()
{
    FString str;
    bool unterminated = true;
    // Number of consecutive unescaped quotes seen immediately before `it`.
    uint8_t quoteRun = 0;
    size_t str_start_col = it.column() - 1;
    while (hasNext())
    {
        UTF8Char c = *it;
        if (c == U'"')
        {
            next();
            // Count first, then test: the original tested `end == 3` before
            // incrementing and therefore needed FOUR quotes to terminate.
            if (++quoteRun == 3)
            {
                unterminated = false;
                break;
            }
            continue;
        }

        // The run was shorter than a delimiter, so those quotes are content.
        // (The original silently dropped them.)
        for (; quoteRun > 0; --quoteRun)
        {
            str += u8"\"";
        }

        if (c == U'\\') // c is '\'
        {
            // Advance past the backslash before checking for end of input;
            // checking first (as before) let a trailing backslash read past
            // the end.
            next();
            if (!hasNext())
            {
                error = SyntaxError(u8"Unterminated FString", this->line, it.column());
                return IllegalTok;
            }
            UTF8Char ec = *it;
            if (ec == U'n')
            {
                next();
                str += u8"\n";
            }
            else if (ec == U't')
            {
                next();
                str += u8"\t";
            }
            else if (ec == U'v')
            {
                next();
                str += u8"\v";
            }
            else if (ec == U'b')
            {
                next();
                str += u8"\b";
            }
            else if (ec == U'"')
            {
                next();
                str += u8"\"";
            }
            else if (ec == U'\'')
            {
                next();
                str += u8"'";
            }
            else if (ec == U'\\')
            {
                next();
                str += u8"\\";
            }
            else
            {
                error = SyntaxError(FStringView(
                                        std::format(
                                            "Unsupported escape character: {}",
                                            FString(ec.getString()).toBasicString())),
                                    this->line,
                                    it.column());
                return IllegalTok;
            }
        }
        else
        {
            str += c.getString();
            // The original forgot to advance here, so any ordinary character
            // made the loop spin forever on the same position.
            next();
        }
    }
    if (unterminated)
    {
        error = SyntaxError(u8"Unterminated FString", this->line, str_start_col);
        return IllegalTok;
    }
    return Token(str, TokenType::LiteralString);
}
// Scans a numeric literal starting at the current character and returns it
// as a LiteralNumber token carrying the literal's text.
//
// Accepted shapes:
//   114514
//   1145.14
//   1.14e3   -> 1140
//   1.14e-3  -> 0.00114
//
// A leading minus sign is not handled here: it is tokenised by scanSymbol()
// and combined with the number (or treated as an operator) by the parser.
// Returns IllegalTok (with `error` set) when the exponent has no digits.
Token Lexer::scanNumber()
{
    FString numStr;
    bool hasPoint = false;
    bool hasExponent = false;
    while (hasNext())
    {
        UTF8Char ch = *it;
        if (ch.isDigit())
        {
            numStr += ch.getString();
            next();
        }
        else if (ch == U'e' and not hasExponent)
        {
            // Only one exponent marker is allowed per literal.
            hasExponent = true;
            numStr += ch.getString();
            next();
        }
        else if (ch == U'-' and numStr.ends_with(U'e'))
        {
            // Exponent sign: valid only directly after 'e'. The original
            // tested ends_with('-'), which could never be true, so "1.14e-3"
            // was never recognised.
            numStr += ch.getString();
            next();
        }
        else if (ch == U'.' and not hasPoint and not hasExponent)
        {
            // A single decimal point, and never inside the exponent.
            hasPoint = true;
            numStr += ch.getString();
            next();
        }
        else
        {
            break;
        }
    }
    // Legality check: 'e' (optionally followed by '-') must be followed by
    // at least one digit. The original dereferenced numStr.end(), which is
    // past the last character.
    // NOTE(review): "Ellegal" is presumably a typo for "Illegal"; the message
    // is kept as-is in case tests or tooling match on it.
    if (numStr.ends_with(U'e') or numStr.ends_with(U'-'))
    {
        error = SyntaxError(FStringView(
                                std::format("Ellegal number literal: {}", numStr.toBasicString())),
                            this->line, it.column());
        return IllegalTok;
    }
    return Token(numStr, TokenType::LiteralNumber);
}
// Scans an operator / punctuation token at the current character, preferring
// a two-character symbol from symbol_map over a single-character one.
// Returns IllegalTok (with `error` set) when the symbol is not in symbol_map.
Token Lexer::scanSymbol()
{
    FString sym;
    UTF8Char ch = *it;
    sym += ch.getString();
    UTF8Char peek = UTF8Char(u8"");
    if (hasNext() and (peek = it.peek()).isPunct()) // look ahead for a two-character operator
    {
        FString symd = FString(sym + peek.getString());
        if (this->symbol_map.contains(symd))
        {
            // Operator length is 2: consume the first character here, the
            // second one is consumed below.
            next();
            sym = symd;
        }
    }

    // Validate in every case. The original only checked legality when the
    // following character was punctuation, and even then fell through to
    // symbol_map.at(sym), which throws std::out_of_range for unknown symbols.
    auto entry = this->symbol_map.find(sym);
    if (entry == this->symbol_map.end())
    {
        error = SyntaxError(FStringView(
                                std::format("No such a operator: {}", sym.toBasicString())),
                            this->line, it.column());
        next(); // step over the offending character so lexing can continue
        return IllegalTok;
    }
    next();
    return Token(sym, entry->second);
}
// Scans a comment and returns its text as a Comments token.
// Entry condition: the current character is '/' and the next one is '/'
// (line comment) or '*' (block comment).
// A line comment runs to the end of the line; the '\n' is consumed but not
// stored. A block comment runs to the matching "*/"; if input ends first,
// `error` is set and IllegalTok is returned.
Token Lexer::scanComments()
{
    FString comment;
    if (it.peek() == U'/')
    {
        next(); // skip first '/'
        next(); // skip second '/'
        while (hasNext())
        {
            // Re-read the character on every iteration; the original read it
            // once before the loop and then appended that same character
            // until end of input.
            UTF8Char c = *it;
            if (c == U'\n')
            {
                break;
            }
            comment += c.getString();
            next();
        }
        if (hasNext())
        {
            next(); // skip '\n'
        }
    }
    else
    {
        next(); // skip '/'
        next(); // skip '*'
        bool terminated = false;
        while (hasNext())
        {
            UTF8Char c = *it;
            if (c == U'*' and it.peek() == U'/')
            {
                next(); // skip '*'
                next(); // skip '/'
                // The iterator now rests on the first character after the
                // comment. The original advanced once more and swallowed it.
                terminated = true;
                break;
            }
            comment += c.getString();
            next();
        }
        if (!terminated)
        {
            // Input is exhausted here, so there is nothing further to skip.
            error = SyntaxError(FStringView(u8"Unterminated multiline comment"), this->line, it.column());
            return IllegalTok;
        }
    }
    return Token(comment, TokenType::Comments);
}
// Produces the next token from the input.
// Skips leading whitespace, records the token's start position in
// last_line / last_column, then dispatches on the first character to the
// matching scan* routine. Returns EOFTok when input is exhausted and
// IllegalTok (with `error` set) for a character no rule can handle.
Token Lexer::nextToken()
{
    if (!hasNext())
    {
        return EOFTok;
    }
    UTF8Char ch = *it;
    while (ch.isSpace())
    {
        next();
        // Test for end of input before reading the next character; the
        // original dereferenced the iterator first.
        if (!hasNext())
        {
            return EOFTok.setPos(getCurrentLine(), getCurrentColumn());
        }
        ch = *it;
    }
    last_line = getCurrentLine();
    last_column = getCurrentColumn();
    if (ch == U'r' and hasNext() and it.peek() == U'"')
    {
        // r"..." : raw string — skip the 'r' and the opening quote.
        next();
        next();
        return scanRawString().setPos(last_line, last_column);
    }
    if (ch.isAlpha() || ch == U'_')
    {
        return scanIdentifier().setPos(last_line, last_column);
    }
    else if (ch == U'"')
    {
        next(); // skip the opening quote
        return scanString().setPos(last_line, last_column);
    }
    else if (ch.isDigit())
    {
        return scanNumber().setPos(last_line, last_column);
    }
    else if (ch == U'/')
    {
        // '/' either starts a comment ("//" or "/*") or is the division
        // operator. The two identical operator returns of the original are
        // merged into one path.
        UTF8Char c{u8""};
        bool startsComment = false;
        if (hasNext())
        {
            c = it.peek();
            startsComment = (c == U'/' or c == U'*');
        }
        if (!startsComment)
        {
            next();
            return Token(u8"/", this->symbol_map.at(u8"/")).setPos(last_line, last_column);
        }
        return scanComments().setPos(last_line, last_column);
    }
    else if (ch.isPunct())
    {
        return scanSymbol().setPos(last_line, last_column);
    }
    else
    {
        error = SyntaxError(FStringView(
                                std::format("Cannot tokenize char: '{}'", FString(ch.getString()).toBasicString())),
                            this->line, it.column());
        if (hasNext())
        {
            next(); // step over the offending character
        }
        return IllegalTok.setPos(last_line, last_column);
    }
}
} // namespace Fig