648 lines
20 KiB
C++
648 lines
20 KiB
C++
#include <Error/error.hpp>
|
|
#include <Token/token.hpp>
|
|
#include <Lexer/lexer.hpp>
|
|
|
|
#include <Core/String.hpp>
|
|
#include <Core/CharUtils.hpp>
|
|
#include <Utils/utils.hpp>
|
|
|
|
#if 0
|
|
#include <iostream> // debug
|
|
#endif
|
|
|
|
#ifndef SourceInfo
|
|
#define SourceInfo(ptr) (ptr->sourcePath), (ptr->sourceLines)
|
|
#endif
|
|
|
|
namespace Fig
|
|
{
|
|
|
|
const std::unordered_map<String, TokenType> Lexer::symbol_map{
|
|
// 三字符
|
|
{String(U"..."), TokenType::TripleDot},
|
|
// 双字符
|
|
{String(U"=="), TokenType::Equal},
|
|
{String(U"!="), TokenType::NotEqual},
|
|
{String(U"<="), TokenType::LessEqual},
|
|
{String(U">="), TokenType::GreaterEqual},
|
|
{String(U"<<"), TokenType::ShiftLeft},
|
|
{String(U">>"), TokenType::ShiftRight},
|
|
{String(U"+="), TokenType::PlusEqual},
|
|
{String(U"-="), TokenType::MinusEqual},
|
|
{String(U"*="), TokenType::AsteriskEqual},
|
|
{String(U"/="), TokenType::SlashEqual},
|
|
{String(U"%="), TokenType::PercentEqual},
|
|
{String(U"^="), TokenType::CaretEqual},
|
|
{String(U"++"), TokenType::DoublePlus},
|
|
{String(U"--"), TokenType::DoubleMinus},
|
|
{String(U"&&"), TokenType::DoubleAmpersand},
|
|
{String(U"||"), TokenType::DoublePipe},
|
|
{String(U":="), TokenType::Walrus},
|
|
{String(U"**"), TokenType::Power},
|
|
{String(U"->"), TokenType::RightArrow},
|
|
{String(U"=>"), TokenType::DoubleArrow},
|
|
|
|
// 单字符
|
|
{String(U"+"), TokenType::Plus},
|
|
{String(U"-"), TokenType::Minus},
|
|
{String(U"*"), TokenType::Asterisk},
|
|
{String(U"/"), TokenType::Slash},
|
|
{String(U"%"), TokenType::Percent},
|
|
{String(U"^"), TokenType::Caret},
|
|
{String(U"&"), TokenType::Ampersand},
|
|
{String(U"|"), TokenType::Pipe},
|
|
{String(U"~"), TokenType::Tilde},
|
|
{String(U"="), TokenType::Assign},
|
|
{String(U"<"), TokenType::Less},
|
|
{String(U">"), TokenType::Greater},
|
|
{String(U"."), TokenType::Dot},
|
|
{String(U","), TokenType::Comma},
|
|
{String(U":"), TokenType::Colon},
|
|
{String(U";"), TokenType::Semicolon},
|
|
{String(U"'"), TokenType::SingleQuote},
|
|
{String(U"\""), TokenType::DoubleQuote},
|
|
{String(U"("), TokenType::LeftParen},
|
|
{String(U")"), TokenType::RightParen},
|
|
{String(U"["), TokenType::LeftBracket},
|
|
{String(U"]"), TokenType::RightBracket},
|
|
{String(U"{"), TokenType::LeftBrace},
|
|
{String(U"}"), TokenType::RightBrace},
|
|
{String(U"?"), TokenType::Question},
|
|
{String(U"!"), TokenType::Not},
|
|
};
|
|
|
|
const std::unordered_map<String, TokenType> Lexer::keyword_map{
|
|
{String(U"and"), TokenType::And},
|
|
{String(U"or"), TokenType::Or},
|
|
{String(U"not"), TokenType::Not},
|
|
{String(U"import"), TokenType::Import},
|
|
{String(U"func"), TokenType::Function},
|
|
{String(U"var"), TokenType::Variable},
|
|
{String(U"const"), TokenType::Const},
|
|
// {String(U"final"), TokenType::Final},
|
|
{String(U"while"), TokenType::While},
|
|
{String(U"for"), TokenType::For},
|
|
{String(U"if"), TokenType::If},
|
|
{String(U"else"), TokenType::Else},
|
|
{String(U"new"), TokenType::New},
|
|
{String(U"struct"), TokenType::Struct},
|
|
{String(U"interface"), TokenType::Interface},
|
|
{String(U"impl"), TokenType::Implement},
|
|
{String(U"is"), TokenType::Is},
|
|
{String(U"public"), TokenType::Public},
|
|
{String(U"return"), TokenType::Return},
|
|
{String(U"break"), TokenType::Break},
|
|
{String(U"continue"), TokenType::Continue},
|
|
{String(U"try"), TokenType::Try},
|
|
{String(U"catch"), TokenType::Catch},
|
|
{String(U"throw"), TokenType::Throw},
|
|
{String(U"Finally"), TokenType::Finally},
|
|
{String(U"as"), TokenType::As},
|
|
|
|
// {String(U"Null"), TokenType::TypeNull},
|
|
// {String(U"Int"), TokenType::TypeInt},
|
|
// {String(U"String"), TokenType::TypeString},
|
|
// {String(U"Bool"), TokenType::TypeBool},
|
|
// {String(U"Double"), TokenType::TypeDouble},
|
|
};
|
|
void Lexer::skipLine()
|
|
{
|
|
while (current() != U'\n' and hasNext()) { next(); }
|
|
next(); // skip '\n'
|
|
++line;
|
|
}
|
|
// Scans a run of alphanumeric / '_' characters starting at the current
// character and classifies it as, in priority order:
//   1. a keyword (exact match in keyword_map),
//   2. a boolean literal (`true` / `false`),
//   3. the null literal (`null`),
//   4. a plain identifier.
// A plain identifier of length <= 1 additionally raises warning 2.
Token Lexer::scanIdentifier()
{
    String identifier;

    while (hasNext())
    {
        const char32_t c = current();
        if (!(CharUtils::isAlnum(c) || c == U'_')) { break; }
        identifier += c;
        next();
    }

    // Single lookup instead of contains() followed by at().
    const auto keyword = this->keyword_map.find(identifier);
    if (keyword != this->keyword_map.end()) { return Token(identifier, keyword->second); }

    // The literals below had degraded to U"\1" (code point U+0001), which an
    // alphanumeric word can never equal, so both branches were unreachable.
    if (identifier == U"true" || identifier == U"false") { return Token(identifier, TokenType::LiteralBool); }
    if (identifier == U"null")
    {
        // null instance
        return Token(identifier, TokenType::LiteralNull);
    }

    // Disabled check, kept for reference:
    // const auto &toLower = [](const String &str) -> String
    // {
    //     String res;
    //     for (auto c : str)
    //     {
    //         res += CharUtils::toLower(c);
    //     }
    //     return res;
    // };
    //
    // if (keyword_map.contains(toLower(identifier)))
    // {
    //     pushWarning(1, identifier); // Identifier is too similar to a keyword or a primitive type
    // }

    if (identifier.length() <= 1)
    {
        pushWarning(2, identifier); // The identifier is too abstract
    }
    return Token(identifier, TokenType::Identifier);
}
|
|
// Scans the body of a double-quoted string literal. nextToken() calls this
// after stepping over the opening '"'.
//
// Supported escapes: \n \t \v \b \" \' \\ (the same set scanMultilineString
// accepts). Any other escape sets `error` and returns IllegalTok.
//
// Returns a LiteralString token holding the decoded text, or IllegalTok with
// `error` set when the literal is malformed or input ends first.
Token Lexer::scanString()
{
    String str;
    bool unterminated = true;
    size_t str_start_col = column - 1;
    while (hasNext())
    {
        const char32_t c = current();
        // NOTE(review): a raw newline closes the literal exactly like '"'
        // does (pre-existing behaviour, kept for compatibility) - confirm
        // this is intended rather than an "unterminated" error.
        if (c == U'"' || c == U'\n')
        {
            next();
            unterminated = false;
            break;
        }
        if (c != U'\\')
        {
            str += c;
            next();
            continue;
        }

        // c is '\': step over it first, then make sure an escape character
        // actually follows. The original tested hasNext() before advancing,
        // which the loop condition had already established, so the check
        // could never fire.
        next();
        if (!hasNext())
        {
            error = SyntaxError(U"Unterminated FString", this->line, column, SourceInfo(this));
            return IllegalTok;
        }

        const char32_t ec = current();
        char32_t decoded = 0;
        switch (ec)
        {
            case U'n': decoded = U'\n'; break;
            case U't': decoded = U'\t'; break;
            case U'v': decoded = U'\v'; break;
            case U'b': decoded = U'\b'; break;
            case U'"': decoded = U'"'; break;
            case U'\'': decoded = U'\''; break;
            case U'\\': decoded = U'\\'; break; // previously rejected; matches scanMultilineString
            default:
                error =
                    SyntaxError(String(std::format("Unsupported escape character: {}", String(ec).toBasicString())),
                                this->line,
                                column,
                                SourceInfo(this));
                return IllegalTok;
        }
        next();
        str += decoded;
    }
    if (unterminated)
    {
        error = SyntaxError(U"Unterminated FString", this->line, str_start_col, SourceInfo(this));
        return IllegalTok;
    }
    return Token(str, TokenType::LiteralString);
}
|
|
// Scans the body of a raw string literal (r"..."). nextToken() calls this
// after stepping over both the 'r' and the opening '"'. No escape processing
// is done: every character up to the closing '"' (or a newline, which closes
// the literal the same way) is copied verbatim.
//
// Returns a LiteralString token, or IllegalTok with `error` set when input
// ends before the literal is closed.
Token Lexer::scanRawString()
{
    const size_t openingColumn = column - 1;
    String raw;
    bool closed = false;

    while (!closed && hasNext())
    {
        const char32_t ch = current();
        if (ch == U'"' || ch == U'\n') { closed = true; }
        else { raw += ch; }
        next(); // every inspected character is consumed, terminator included
    }

    if (!closed)
    {
        error = SyntaxError(U"Unterminated FString", this->line, openingColumn, SourceInfo(this));
        return IllegalTok;
    }
    return Token(raw, TokenType::LiteralString);
}
|
|
// Scans the body of a triple-quoted string. Newlines are ordinary content;
// the literal ends at the first run of three consecutive '"'.
// NOTE(review): no caller is visible in this file - presumably entered with
// the opening delimiter already consumed; confirm against the call site.
//
// Supported escapes: \n \t \v \b \" \' \\. Any other escape sets `error` and
// returns IllegalTok.
//
// Fixes relative to the previous implementation:
//   * ordinary characters were appended without calling next(), so the loop
//     never advanced;
//   * the close test ran before the quote counter was incremented, so a
//     fourth '"' was needed to terminate;
//   * quotes in a run shorter than three were dropped instead of being kept
//     as content;
//   * the end-of-input test after '\' ran before advancing and could not fire.
Token Lexer::scanMultilineString()
{
    String str;
    bool unterminated = true;

    uint8_t pendingQuotes = 0; // consecutive '"' seen but not yet classified
    size_t str_start_col = column - 1;
    while (hasNext())
    {
        const char32_t c = current();
        if (c == U'"')
        {
            next();
            if (++pendingQuotes == 3)
            {
                unterminated = false;
                break;
            }
            continue;
        }

        // The quote run ended before reaching three: those quotes are content.
        for (; pendingQuotes > 0; --pendingQuotes) { str += U'"'; }

        if (c != U'\\')
        {
            str += c;
            next();
            continue;
        }

        // c is '\': step over it, then require an escape character.
        next();
        if (!hasNext())
        {
            error = SyntaxError(U"Unterminated FString", this->line, column, SourceInfo(this));
            return IllegalTok;
        }

        const char32_t ec = current();
        char32_t decoded = 0;
        switch (ec)
        {
            case U'n': decoded = U'\n'; break;
            case U't': decoded = U'\t'; break;
            case U'v': decoded = U'\v'; break;
            case U'b': decoded = U'\b'; break;
            case U'"': decoded = U'"'; break;
            case U'\'': decoded = U'\''; break;
            case U'\\': decoded = U'\\'; break;
            default:
                error = SyntaxError(
                    String(std::format("Unsupported escape character: {}", String(ec).toBasicString())),
                    this->line,
                    column,
                    SourceInfo(this));
                return IllegalTok;
        }
        next();
        str += decoded;
    }
    if (unterminated)
    {
        // Message restored to match the other string scanners; it had
        // degraded to the single control character U"\1".
        error = SyntaxError(U"Unterminated FString", this->line, str_start_col, SourceInfo(this));
        return IllegalTok;
    }
    return Token(str, TokenType::LiteralString);
}
|
|
// Scans a numeric literal starting at the current character.
//
// Consumption is greedy: digits and 'e' are always taken, '+' / '-' only
// directly after an exponent marker, and '.' at most once. The collected text
// is then validated; every validation failure reports the same
// "Illegal number literal" error and yields IllegalTok.
//
// Returns a LiteralNumber token carrying the literal's text. If nothing was
// consumed, IllegalTok is returned without setting `error`.
Token Lexer::scanNumber()
{
    String numStr;
    bool seenPoint = false;

    // Shared rejection path: records the diagnostic and produces IllegalTok.
    const auto reject = [&]() -> Token {
        error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
                            this->line,
                            column,
                            SourceInfo(this));
        return IllegalTok;
    };

    while (hasNext())
    {
        const char32_t ch = current();

        bool accepted = CharUtils::isDigit(ch) || ch == U'e';
        if (!accepted && (ch == U'-' || ch == U'+'))
        {
            // A sign is part of the literal only right after the exponent marker.
            accepted = !numStr.empty() && (numStr.ends_with(U'e') || numStr.ends_with(U'E'));
        }
        else if (!accepted && ch == U'.' && !seenPoint)
        {
            seenPoint = true;
            accepted = true;
        }

        if (!accepted) { break; }
        numStr += ch;
        next();
    }

    if (numStr.empty()) { return IllegalTok; }

    // A dangling exponent marker ("12e") is never valid.
    if (numStr.ends_with(U'e')) { return reject(); }

    // At least one digit is required somewhere in the literal.
    bool anyDigit = false;
    for (const auto ch : numStr)
    {
        if (CharUtils::isDigit(ch))
        {
            anyDigit = true;
            break;
        }
    }
    if (!anyDigit) { return reject(); }

    const size_t ePos = numStr.find(U'e');
    if (ePos == String::npos) { return Token(numStr, TokenType::LiteralNumber); }

    // The exponent marker needs text on both sides.
    if (ePos == 0 || ePos + 1 >= numStr.length()) { return reject(); }

    // Exponent part: an optional leading sign, then digits only.
    bool exponentHasDigit = false;
    for (size_t i = ePos + 1; i < numStr.length(); ++i)
    {
        const char32_t c = numStr[i];
        if (c == U'+' || c == U'-')
        {
            if (i != ePos + 1) { return reject(); }
            continue;
        }
        if (!CharUtils::isDigit(c)) { return reject(); }
        exponentHasDigit = true;
    }
    if (!exponentHasDigit) { return reject(); }

    return Token(numStr, TokenType::LiteralNumber);
}
|
|
// Scans an operator / punctuation token by greedy longest match.
//
// Starting from the current character, the candidate is extended with the
// peeked character for as long as the extension is still a prefix of some
// symbol_map key. The final candidate must itself be a key; otherwise a
// "No such operator" error is recorded and IllegalTok is returned. On both
// the success and the failure path the lexer advances once before returning.
Token Lexer::scanSymbol()
{
    // True when at least one known symbol begins with `text`.
    const auto isSymbolPrefix = [&](const String &text) -> bool {
        for (const auto &entry : symbol_map)
        {
            if (entry.first.starts_with(text)) { return true; }
        }
        return false;
    };

    // Shared failure path: record the diagnostic, advance, yield IllegalTok.
    const auto noSuchOperator = [&](const String &text) -> Token {
        error = SyntaxError(String(std::format("No such operator: {}", text.toBasicString())),
                            this->line,
                            column,
                            SourceInfo(this));
        next();
        return IllegalTok;
    };

    const char32_t first = current();
    String sym;
    sym += first;

    if (!isSymbolPrefix(sym)) { return noSuchOperator(sym); }

    while (hasNext())
    {
        const char32_t upcoming = peek();
        if (!CharUtils::isPunct(upcoming)) { break; }

        String extended = sym + upcoming;
        if (!isSymbolPrefix(extended)) { break; }

        next();
        sym = extended;
    }

    const auto match = symbol_map.find(sym);
    if (match == symbol_map.end()) { return noSuchOperator(sym); }

    // std::cerr << Token(sym, symbol_map.at(sym)).toString().toBasicString() << '\n;
    next();
    return Token(sym, match->second);
}
|
|
|
|
// Scans a comment. Entry condition: the current character is '/' and the
// peeked character is '/' (single-line) or '*' (multi-line).
//
// Returns a Comments token holding the comment text without its delimiters.
// A multi-line comment that is still open when input ends sets `error` and
// returns IllegalTok.
Token Lexer::scanComments()
{
    String comment;

    if (peek() == U'/') // single-line comment
    {
        next(); // skip first '/'
        next(); // skip second '/'

        // current() is read only after hasNext() confirms there is input;
        // the original read it once before checking.
        while (hasNext())
        {
            const char32_t c = current();
            if (c == U'\n') { break; }
            comment += c;
            next();
        }

        // If input remains here, the loop stopped on '\n': consume it.
        if (hasNext()) { next(); }
        return Token(comment, TokenType::Comments);
    }

    // multi-line comment
    next(); // skip '/'
    next(); // skip '*'

    bool terminated = false;
    while (hasNext())
    {
        const char32_t c = current();
        if (c == U'*' and peek() == U'/')
        {
            next(); // skip '*'
            next(); // skip '/'
            terminated = true;
            break;
        }
        comment += c;
        next();
    }

    if (!terminated)
    {
        // The message had degraded to the single control character U"\1".
        error = SyntaxError(String(U"Unterminated multiline comment"), this->line, column, SourceInfo(this));
        next();
        return IllegalTok;
    }

    return Token(comment, TokenType::Comments);
}
|
|
// Produces the next token from the input.
//
// Skips leading whitespace, records the token's start position in
// last_line / last_column, then dispatches on the first character:
//   '/'              -> Slash operator, or a comment (scanned and discarded)
//   'r' before '"'   -> raw string
//   letter or '_'    -> identifier / keyword / literal word
//   '"'              -> string literal
//   digit            -> number
//   punctuation      -> operator / symbol
// Anything else sets `error` and yields IllegalTok. Returns EOFTok when no
// input (or only whitespace) remains.
Token Lexer::nextToken()
{
    if (!hasNext()) { return EOFTok.setPos(getCurrentLine(), getCurrentColumn()); }

    char32_t ch = current();
    while (hasNext())
    {
        ch = current();
        if (!CharUtils::isSpace(ch)) { break; }
        next();
    }

    // If the loop ran out of input, `ch` still holds the last whitespace
    // character. Previously that fell through to the "Cannot tokenize char"
    // error; trailing whitespace is simply end of input.
    if (!hasNext()) { return EOFTok.setPos(getCurrentLine(), getCurrentColumn()); }

    last_line = getCurrentLine();
    last_column = getCurrentColumn();

    if (ch == U'/')
    {
        const char32_t following = peek();
        if (following != U'/' and following != U'*')
        {
            next();
            // The key had degraded to U"\1", which is not in symbol_map, so
            // at() threw std::out_of_range for every lone '/'.
            return Token(U"/", this->symbol_map.at(U"/")).setPos(last_line, last_column);
        }
        scanComments().setPos(last_line, last_column);
        return nextToken();
        // now we ignore comments to avoid some stupid bugs
    }

    if (ch == U'r' and hasNext() and peek() == U'"')
    {
        // r""
        // raw String
        next(); // skip 'r'
        next(); // skip opening '"'
        return scanRawString().setPos(last_line, last_column);
    }

    if (CharUtils::isAlpha(ch) || ch == U'_') { return scanIdentifier().setPos(last_line, last_column); }

    if (ch == U'"')
    {
        next(); // skip opening '"'
        return scanString().setPos(last_line, last_column);
    }

    if (CharUtils::isDigit(ch)) { return scanNumber().setPos(last_line, last_column); }

    if (CharUtils::isPunct(ch)) { return scanSymbol().setPos(last_line, last_column); }

    error = SyntaxError(String(std::format("Cannot tokenize char: '{}'", String(ch).toBasicString())),
                        this->line,
                        column,
                        SourceInfo(this));
    if (hasNext()) { next(); }
    return IllegalTok.setPos(last_line, last_column);
}
|
|
|
|
} // namespace Fig
|