尝试UTF32String

This commit is contained in:
2026-02-05 22:20:21 +08:00
parent d897f41c57
commit 9c68b2e77d
69 changed files with 3078 additions and 2605 deletions

View File

@@ -1,9 +1,9 @@
#include <Core/fig_string.hpp>
#include <Error/error.hpp>
#include <Token/token.hpp>
#include <Lexer/lexer.hpp>
#include <Core/fig_string.hpp>
#include <Core/String.hpp>
#include <Core/CharUtils.hpp>
#include <Utils/utils.hpp>
#if 0
@@ -17,113 +17,110 @@
namespace Fig
{
const std::unordered_map<FString, TokenType> Lexer::symbol_map{
const std::unordered_map<String, TokenType> Lexer::symbol_map{
// 三字符
{FString(u8"..."), TokenType::TripleDot},
{String(U"..."), TokenType::TripleDot},
// 双字符
{FString(u8"=="), TokenType::Equal},
{FString(u8"!="), TokenType::NotEqual},
{FString(u8"<="), TokenType::LessEqual},
{FString(u8">="), TokenType::GreaterEqual},
{FString(u8"<<"), TokenType::ShiftLeft},
{FString(u8">>"), TokenType::ShiftRight},
{FString(u8"+="), TokenType::PlusEqual},
{FString(u8"-="), TokenType::MinusEqual},
{FString(u8"*="), TokenType::AsteriskEqual},
{FString(u8"/="), TokenType::SlashEqual},
{FString(u8"%="), TokenType::PercentEqual},
{FString(u8"^="), TokenType::CaretEqual},
{FString(u8"++"), TokenType::DoublePlus},
{FString(u8"--"), TokenType::DoubleMinus},
{FString(u8"&&"), TokenType::DoubleAmpersand},
{FString(u8"||"), TokenType::DoublePipe},
{FString(u8":="), TokenType::Walrus},
{FString(u8"**"), TokenType::Power},
{FString(u8"->"), TokenType::RightArrow},
{FString(u8"=>"), TokenType::DoubleArrow},
{String(U"=="), TokenType::Equal},
{String(U"!="), TokenType::NotEqual},
{String(U"<="), TokenType::LessEqual},
{String(U">="), TokenType::GreaterEqual},
{String(U"<<"), TokenType::ShiftLeft},
{String(U">>"), TokenType::ShiftRight},
{String(U"+="), TokenType::PlusEqual},
{String(U"-="), TokenType::MinusEqual},
{String(U"*="), TokenType::AsteriskEqual},
{String(U"/="), TokenType::SlashEqual},
{String(U"%="), TokenType::PercentEqual},
{String(U"^="), TokenType::CaretEqual},
{String(U"++"), TokenType::DoublePlus},
{String(U"--"), TokenType::DoubleMinus},
{String(U"&&"), TokenType::DoubleAmpersand},
{String(U"||"), TokenType::DoublePipe},
{String(U":="), TokenType::Walrus},
{String(U"**"), TokenType::Power},
{String(U"->"), TokenType::RightArrow},
{String(U"=>"), TokenType::DoubleArrow},
// 单字符
{FString(u8"+"), TokenType::Plus},
{FString(u8"-"), TokenType::Minus},
{FString(u8"*"), TokenType::Asterisk},
{FString(u8"/"), TokenType::Slash},
{FString(u8"%"), TokenType::Percent},
{FString(u8"^"), TokenType::Caret},
{FString(u8"&"), TokenType::Ampersand},
{FString(u8"|"), TokenType::Pipe},
{FString(u8"~"), TokenType::Tilde},
{FString(u8"="), TokenType::Assign},
{FString(u8"<"), TokenType::Less},
{FString(u8">"), TokenType::Greater},
{FString(u8"."), TokenType::Dot},
{FString(u8","), TokenType::Comma},
{FString(u8":"), TokenType::Colon},
{FString(u8";"), TokenType::Semicolon},
{FString(u8"'"), TokenType::SingleQuote},
{FString(u8"\""), TokenType::DoubleQuote},
{FString(u8"("), TokenType::LeftParen},
{FString(u8")"), TokenType::RightParen},
{FString(u8"["), TokenType::LeftBracket},
{FString(u8"]"), TokenType::RightBracket},
{FString(u8"{"), TokenType::LeftBrace},
{FString(u8"}"), TokenType::RightBrace},
{FString(u8"?"), TokenType::Question},
{FString(u8"!"), TokenType::Not},
{String(U"+"), TokenType::Plus},
{String(U"-"), TokenType::Minus},
{String(U"*"), TokenType::Asterisk},
{String(U"/"), TokenType::Slash},
{String(U"%"), TokenType::Percent},
{String(U"^"), TokenType::Caret},
{String(U"&"), TokenType::Ampersand},
{String(U"|"), TokenType::Pipe},
{String(U"~"), TokenType::Tilde},
{String(U"="), TokenType::Assign},
{String(U"<"), TokenType::Less},
{String(U">"), TokenType::Greater},
{String(U"."), TokenType::Dot},
{String(U","), TokenType::Comma},
{String(U":"), TokenType::Colon},
{String(U";"), TokenType::Semicolon},
{String(U"'"), TokenType::SingleQuote},
{String(U"\""), TokenType::DoubleQuote},
{String(U"("), TokenType::LeftParen},
{String(U")"), TokenType::RightParen},
{String(U"["), TokenType::LeftBracket},
{String(U"]"), TokenType::RightBracket},
{String(U"{"), TokenType::LeftBrace},
{String(U"}"), TokenType::RightBrace},
{String(U"?"), TokenType::Question},
{String(U"!"), TokenType::Not},
};
const std::unordered_map<FString, TokenType> Lexer::keyword_map{
{FString(u8"and"), TokenType::And},
{FString(u8"or"), TokenType::Or},
{FString(u8"not"), TokenType::Not},
{FString(u8"import"), TokenType::Import},
{FString(u8"func"), TokenType::Function},
{FString(u8"var"), TokenType::Variable},
{FString(u8"const"), TokenType::Const},
// {FString(u8"final"), TokenType::Final},
{FString(u8"while"), TokenType::While},
{FString(u8"for"), TokenType::For},
{FString(u8"if"), TokenType::If},
{FString(u8"else"), TokenType::Else},
{FString(u8"new"), TokenType::New},
{FString(u8"struct"), TokenType::Struct},
{FString(u8"interface"), TokenType::Interface},
{FString(u8"impl"), TokenType::Implement},
{FString(u8"is"), TokenType::Is},
{FString(u8"public"), TokenType::Public},
{FString(u8"return"), TokenType::Return},
{FString(u8"break"), TokenType::Break},
{FString(u8"continue"), TokenType::Continue},
{FString(u8"try"), TokenType::Try},
{FString(u8"catch"), TokenType::Catch},
{FString(u8"throw"), TokenType::Throw},
{FString(u8"Finally"), TokenType::Finally},
{FString(u8"as"), TokenType::As},
const std::unordered_map<String, TokenType> Lexer::keyword_map{
{String(U"and"), TokenType::And},
{String(U"or"), TokenType::Or},
{String(U"not"), TokenType::Not},
{String(U"import"), TokenType::Import},
{String(U"func"), TokenType::Function},
{String(U"var"), TokenType::Variable},
{String(U"const"), TokenType::Const},
// {String(U"final"), TokenType::Final},
{String(U"while"), TokenType::While},
{String(U"for"), TokenType::For},
{String(U"if"), TokenType::If},
{String(U"else"), TokenType::Else},
{String(U"new"), TokenType::New},
{String(U"struct"), TokenType::Struct},
{String(U"interface"), TokenType::Interface},
{String(U"impl"), TokenType::Implement},
{String(U"is"), TokenType::Is},
{String(U"public"), TokenType::Public},
{String(U"return"), TokenType::Return},
{String(U"break"), TokenType::Break},
{String(U"continue"), TokenType::Continue},
{String(U"try"), TokenType::Try},
{String(U"catch"), TokenType::Catch},
{String(U"throw"), TokenType::Throw},
{String(U"Finally"), TokenType::Finally},
{String(U"as"), TokenType::As},
// {FString(u8"Null"), TokenType::TypeNull},
// {FString(u8"Int"), TokenType::TypeInt},
// {FString(u8"String"), TokenType::TypeString},
// {FString(u8"Bool"), TokenType::TypeBool},
// {FString(u8"Double"), TokenType::TypeDouble},
// {String(U"Null"), TokenType::TypeNull},
// {String(U"Int"), TokenType::TypeInt},
// {String(U"String"), TokenType::TypeString},
// {String(U"Bool"), TokenType::TypeBool},
// {String(U"Double"), TokenType::TypeDouble},
};
void Lexer::skipLine()
{
while (*it != U'\n' and hasNext())
{
next();
}
while (current() != U'\n' and hasNext()) { next(); }
next(); // skip '\n'
++line;
}
Token Lexer::scanIdentifier()
{
FString identifier;
String identifier;
while (hasNext())
{
UTF8Char c = *it;
if (c.isAlnum() || c == U'_')
char32_t c = current();
if (CharUtils::isAlnum(c) || c == U'_')
{
identifier += c.getString();
identifier += c;
next();
}
else
@@ -131,23 +128,27 @@ namespace Fig
break;
}
}
if (this->keyword_map.contains(identifier))
{
return Token(identifier, this->keyword_map.at(identifier));
}
else if (identifier == u8"true" || identifier == u8"false")
{
return Token(identifier, TokenType::LiteralBool);
}
else if (identifier == u8"null")
if (this->keyword_map.contains(identifier)) { return Token(identifier, this->keyword_map.at(identifier)); }
else if (identifier == U"true" || identifier == U"false") { return Token(identifier, TokenType::LiteralBool); }
else if (identifier == U"null")
{
// null instance
return Token(identifier, TokenType::LiteralNull);
}
if (keyword_map.contains(Utils::toLower(identifier)))
{
pushWarning(1, identifier); // Identifier is too similar to a keyword or a primitive type
}
// const auto &toLower = [](const String &str) -> String
// {
// String res;
// for (auto c : str)
// {
// res += CharUtils::toLower(c);
// }
// return res;
// };
// if (keyword_map.contains(toLower(identifier)))
// {
// pushWarning(1, identifier); // Identifier is too similar to a keyword or a primitive type
// }
if (identifier.length() <= 1)
{
pushWarning(2, identifier); // The identifier is too abstract
@@ -156,12 +157,12 @@ namespace Fig
}
Token Lexer::scanString()
{
FString str;
String str;
bool unterminated = true;
size_t str_start_col = it.column() - 1;
size_t str_start_col = column - 1;
while (hasNext())
{
UTF8Char c = *it;
char32_t c = current();
if (c == U'"' || c == U'\n')
{
next();
@@ -170,74 +171,74 @@ namespace Fig
}
else if (c == U'\\') // c is '\'
{
if (it.isEnd())
if (!hasNext())
{
error = SyntaxError(u8"Unterminated FString", this->line, it.column(), SourceInfo(this));
error = SyntaxError(U"Unterminated FString", this->line, column, SourceInfo(this));
return IllegalTok;
}
next();
UTF8Char ec = *it;
char32_t ec = current();
if (ec == U'n')
{
next();
str += u8"\n";
str += U"\n";
}
else if (ec == U't')
{
next();
str += u8"\t";
str += U"\t";
}
else if (ec == U'v')
{
next();
str += u8"\v";
str += U"\v";
}
else if (ec == U'b')
{
next();
str += u8"\b";
str += U"\b";
}
else if (ec == U'"')
{
next();
str += u8"\"";
str += U"\"";
}
else if (ec == U'\'')
{
next();
str += u8"'";
str += U"'";
}
else
{
error = SyntaxError(FString(std::format("Unsupported escape character: {}",
FString(ec.getString()).toBasicString())),
this->line,
it.column(),
SourceInfo(this));
error =
SyntaxError(String(std::format("Unsupported escape character: {}", String(ec).toBasicString())),
this->line,
column,
SourceInfo(this));
return IllegalTok;
}
}
else
{
str += c.getString();
str += c;
next();
}
}
if (unterminated)
{
error = SyntaxError(u8"Unterminated FString", this->line, str_start_col, SourceInfo(this));
error = SyntaxError(U"Unterminated FString", this->line, str_start_col, SourceInfo(this));
return IllegalTok;
}
return Token(str, TokenType::LiteralString);
}
Token Lexer::scanRawString()
{
FString str;
String str;
bool unterminated = true;
size_t str_start_col = it.column() - 1;
size_t str_start_col = column - 1;
while (hasNext())
{
UTF8Char c = *it;
char32_t c = current();
if (c == U'"' || c == U'\n')
{
next();
@@ -246,27 +247,27 @@ namespace Fig
}
else
{
str += c.getString();
str += c;
next();
}
}
if (unterminated)
{
error = SyntaxError(u8"Unterminated FString", this->line, str_start_col, SourceInfo(this));
error = SyntaxError(U"Unterminated FString", this->line, str_start_col, SourceInfo(this));
return IllegalTok;
}
return Token(str, TokenType::LiteralString);
}
Token Lexer::scanMultilineString()
{
FString str;
String str;
bool unterminated = true;
uint8_t end = 0;
size_t str_start_col = it.column() - 1;
size_t str_start_col = column - 1;
while (hasNext())
{
UTF8Char c = *it;
char32_t c = current();
if (c == U'"')
{
if (end == 3)
@@ -281,99 +282,99 @@ namespace Fig
}
else if (c == U'\\') // c is '\'
{
if (it.isEnd())
if (!hasNext())
{
error = SyntaxError(u8"Unterminated FString", this->line, it.column(), SourceInfo(this));
error = SyntaxError(U"Unterminated FString", this->line, column, SourceInfo(this));
return IllegalTok;
}
next();
UTF8Char ec = *it;
char32_t ec = current();
if (ec == U'n')
{
next();
str += u8"\n";
str += U"\n";
}
else if (ec == U't')
{
next();
str += u8"\t";
str += U"\t";
}
else if (ec == U'v')
{
next();
str += u8"\v";
str += U"\v";
}
else if (ec == U'b')
{
next();
str += u8"\b";
str += U"\b";
}
else if (ec == U'"')
{
next();
str += u8"\"";
str += U"\"";
}
else if (ec == U'\'')
{
next();
str += u8"'";
str += U"\'";
}
else if (ec == U'\\')
{
next();
str += u8"\\";
str += U"\\";
}
else
{
error = SyntaxError(FString(std::format("Unsupported escape character: {}",
FString(ec.getString()).toBasicString())),
this->line,
it.column(),
SourceInfo(this));
error = SyntaxError(
String(std::format("Unsupported escape character: {}", String(ec).toBasicString())),
this->line,
column,
SourceInfo(this));
return IllegalTok;
}
}
else
{
str += c.getString();
str += c;
}
end = 0;
}
if (unterminated)
{
error = SyntaxError(u8"Unterminated FString", this->line, str_start_col, SourceInfo(this));
error = SyntaxError(U"Unterminated FString", this->line, str_start_col, SourceInfo(this));
return IllegalTok;
}
return Token(str, TokenType::LiteralString);
}
Token Lexer::scanNumber()
{
FString numStr;
String numStr;
bool hasPoint = false;
while (hasNext())
{
UTF8Char ch = *it;
char32_t ch = current();
if (ch.isDigit() || ch == U'e')
if (CharUtils::isDigit(ch) || ch == U'e')
{
numStr += ch.getString();
numStr += ch;
next();
}
else if (ch == U'-' && !numStr.empty() && (numStr.ends_with(U'e') || numStr.ends_with(U'E')))
{
numStr += ch.getString();
numStr += ch;
next();
}
else if (ch == U'+' && !numStr.empty() && (numStr.ends_with(U'e') || numStr.ends_with(U'E')))
{
numStr += ch.getString();
numStr += ch;
next();
}
else if (ch == U'.' && !hasPoint)
{
hasPoint = true;
numStr += ch.getString();
numStr += ch;
next();
}
else
@@ -385,9 +386,9 @@ namespace Fig
if (numStr.ends_with(U'e'))
{
error = SyntaxError(FString(std::format("Illegal number literal: {}", numStr.toBasicString())),
error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
return IllegalTok;
}
@@ -395,7 +396,7 @@ namespace Fig
bool hasDigit = false;
for (auto it = numStr.begin(); it != numStr.end(); ++it)
{
if (isdigit(*it))
if (CharUtils::isDigit(*it))
{
hasDigit = true;
break;
@@ -404,55 +405,55 @@ namespace Fig
if (!hasDigit)
{
error = SyntaxError(FString(std::format("Illegal number literal: {}", numStr.toBasicString())),
error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
return IllegalTok;
}
size_t ePos = numStr.find(U'e');
if (ePos != FString::npos)
if (ePos != String::npos)
{
if (ePos == 0)
{
error = SyntaxError(FString(std::format("Illegal number literal: {}", numStr.toBasicString())),
error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
return IllegalTok;
}
if (ePos + 1 >= numStr.length())
{
error = SyntaxError(FString(std::format("Illegal number literal: {}", numStr.toBasicString())),
error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
return IllegalTok;
}
bool hasDigitAfterE = false;
for (size_t i = ePos + 1; i < numStr.length(); ++i)
{
UTF8Char c = std::u8string(1,numStr[i]);
char32_t c = numStr[i];
if (c == U'+' || c == U'-')
{
if (i != ePos + 1)
{
error = SyntaxError(FString(std::format("Illegal number literal: {}", numStr.toBasicString())),
error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
return IllegalTok;
}
continue;
}
if (c.isDigit()) { hasDigitAfterE = true; }
if (CharUtils::isDigit(c)) { hasDigitAfterE = true; }
else
{
error = SyntaxError(FString(std::format("Illegal number literal: {}", numStr.toBasicString())),
error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
return IllegalTok;
}
@@ -460,9 +461,9 @@ namespace Fig
if (!hasDigitAfterE)
{
error = SyntaxError(FString(std::format("Illegal number literal: {}", numStr.toBasicString())),
error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
return IllegalTok;
}
@@ -472,25 +473,24 @@ namespace Fig
}
Token Lexer::scanSymbol()
{
FString sym;
UTF8Char ch = *it;
sym += ch.getString();
String sym;
char32_t ch = current();
sym += ch;
auto startsWith = [&](const FString &prefix) -> bool {
auto startsWith = [&](const String &prefix) -> bool {
for (const auto &p : symbol_map)
{
const FString &op = p.first;
if (op.starts_with(prefix))
return true;
const String &op = p.first;
if (op.starts_with(prefix)) return true;
}
return false;
};
if (!startsWith(sym))
{
error = SyntaxError(FString(std::format("No such operator: {}", sym.toBasicString())),
error = SyntaxError(String(std::format("No such operator: {}", sym.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
next();
return IllegalTok;
@@ -498,11 +498,10 @@ namespace Fig
while (hasNext())
{
UTF8Char peek = it.peek();
if (!peek.isPunct())
break;
char32_t peek_ch = peek();
if (!CharUtils::isPunct(peek_ch)) break;
FString candidate = sym + FString(peek.getString());
String candidate = sym + peek_ch;
if (startsWith(candidate))
{
@@ -517,9 +516,9 @@ namespace Fig
if (!symbol_map.contains(sym))
{
error = SyntaxError(FString(std::format("No such operator: {}", sym.toBasicString())),
error = SyntaxError(String(std::format("No such operator: {}", sym.toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
next();
return IllegalTok;
@@ -533,37 +532,34 @@ namespace Fig
{
// entry: when iterator current char is '/' and peek is '/' or '*'
// current char is '/'
FString comment;
String comment;
if (it.peek() == U'/') // single-line comment
if (peek() == U'/') // single-line comment
{
next(); // skip first '/'
next(); // skip second '/'
UTF8Char c = *it;
char32_t c = current();
while (c != U'\n' and hasNext())
{
comment += c.getString();
comment += c;
next();
c = *it;
c = current();
}
if (hasNext() && c == U'\n')
{
next();
}
if (hasNext() && c == U'\n') { next(); }
}
else // multi-line comment
{
next(); // skip '/'
next(); // skip '*'
UTF8Char c = *it;
char32_t c = current();
bool terminated = false;
while (hasNext())
{
if (c == U'*' and hasNext() and it.peek() == U'/')
if (c == U'*' and hasNext() and peek() == U'/')
{
next(); // skip '*'
next(); // skip '/'
@@ -572,16 +568,15 @@ namespace Fig
}
else
{
comment += c.getString();
comment += c;
next();
c = *it;
c = current();
}
}
if (!terminated)
{
error =
SyntaxError(FString(u8"Unterminated multiline comment"), this->line, it.column(), SourceInfo(this));
error = SyntaxError(String(U"Unterminated multiline comment"), this->line, column, SourceInfo(this));
next();
return IllegalTok;
}
@@ -591,76 +586,61 @@ namespace Fig
}
Token Lexer::nextToken()
{
if (!hasNext())
if (!hasNext()) { return EOFTok.setPos(getCurrentLine(), getCurrentColumn()); }
char32_t ch = current();
while (hasNext())
{
return EOFTok;
}
UTF8Char ch = *it;
while (ch.isSpace())
{
next();
ch = *it;
if (!hasNext())
ch = current();
if (!CharUtils::isSpace(ch))
{
return EOFTok.setPos(getCurrentLine(), getCurrentColumn());
break;
}
next();
}
last_line = getCurrentLine();
last_column = getCurrentColumn();
if (ch == U'/')
{
UTF8Char c{u8""};
char32_t c;
if (!hasNext())
{
next();
return Token(u8"/", this->symbol_map.at(u8"/")).setPos(last_line, last_column);
return Token(U"/", this->symbol_map.at(U"/")).setPos(last_line, last_column);
}
c = it.peek();
c = peek();
if (c != U'/' and c != U'*')
{
next();
return Token(u8"/", this->symbol_map.at(u8"/")).setPos(last_line, last_column);
return Token(U"/", this->symbol_map.at(U"/")).setPos(last_line, last_column);
}
scanComments().setPos(last_line, last_column);
return nextToken();
// now we ignore comments to avoid some stupid bugs
}
if (ch == U'r' and hasNext() and it.peek() == U'"')
if (ch == U'r' and hasNext() and peek() == U'"')
{
// r""
// raw FString
// raw String
next();
next();
return scanRawString().setPos(last_line, last_column);
}
if (ch.isAlpha() || ch == U'_')
{
return scanIdentifier().setPos(last_line, last_column);
}
if (CharUtils::isAlpha(ch) || ch == U'_') { return scanIdentifier().setPos(last_line, last_column); }
else if (ch == U'"')
{
next();
return scanString().setPos(last_line, last_column);
}
else if (ch.isDigit())
{
return scanNumber().setPos(last_line, last_column);
}
else if (ch.isPunct())
{
return scanSymbol().setPos(last_line, last_column);
}
else if (CharUtils::isDigit(ch)) { return scanNumber().setPos(last_line, last_column); }
else if (CharUtils::isPunct(ch)) { return scanSymbol().setPos(last_line, last_column); }
else
{
error =
SyntaxError(FString(std::format("Cannot tokenize char: '{}'", FString(ch.getString()).toBasicString())),
SyntaxError(String(std::format("Cannot tokenize char: '{}'", String(ch).toBasicString())),
this->line,
it.column(),
column,
SourceInfo(this));
if (hasNext())
{
next();
}
if (hasNext()) { next(); }
return IllegalTok.setPos(last_line, last_column);
}
}