// NOTE(review): the header names of the #include directives below are missing
// in this copy of the file (only the bare directives survive). Restore the
// original header names before building.
#include
#include
#include
#include
#include
#include

#if 0
#include // debug
#endif

// Expands to the two trailing SyntaxError arguments (source path and source
// lines) taken from the lexer pointed to by `ptr`.
#ifndef SourceInfo
    #define SourceInfo(ptr) ((ptr)->sourcePath), ((ptr)->sourceLines)
#endif

namespace Fig
{
    namespace
    {
        // Translates the character that follows a backslash inside a string
        // literal into the character it stands for.
        // Returns true and stores the result in `decoded` when `escape` is a
        // supported escape character; returns false otherwise.
        bool decodeEscape(char32_t escape, char32_t &decoded)
        {
            switch (escape)
            {
                case U'n': decoded = U'\n'; return true;
                case U't': decoded = U'\t'; return true;
                case U'v': decoded = U'\v'; return true;
                case U'b': decoded = U'\b'; return true;
                case U'"': decoded = U'"'; return true;
                case U'\'': decoded = U'\''; return true;
                case U'\\': decoded = U'\\'; return true;
                default: return false;
            }
        }
    } // namespace

    // Operator / punctuation spellings and the token type each one produces.
    // scanSymbol() performs longest-match lookups against this table.
    // NOTE(review): the template arguments are inferred from the initializer
    // entries; confirm they match the declaration in the Lexer header.
    const std::unordered_map<String, TokenType> Lexer::symbol_map{
        // three characters
        {String(U"..."), TokenType::TripleDot},
        // two characters
        {String(U"=="), TokenType::Equal},
        {String(U"!="), TokenType::NotEqual},
        {String(U"<="), TokenType::LessEqual},
        {String(U">="), TokenType::GreaterEqual},
        {String(U"<<"), TokenType::ShiftLeft},
        {String(U">>"), TokenType::ShiftRight},
        {String(U"+="), TokenType::PlusEqual},
        {String(U"-="), TokenType::MinusEqual},
        {String(U"*="), TokenType::AsteriskEqual},
        {String(U"/="), TokenType::SlashEqual},
        {String(U"%="), TokenType::PercentEqual},
        {String(U"^="), TokenType::CaretEqual},
        {String(U"++"), TokenType::DoublePlus},
        {String(U"--"), TokenType::DoubleMinus},
        {String(U"&&"), TokenType::DoubleAmpersand},
        {String(U"||"), TokenType::DoublePipe},
        {String(U":="), TokenType::Walrus},
        {String(U"**"), TokenType::Power},
        {String(U"->"), TokenType::RightArrow},
        {String(U"=>"), TokenType::DoubleArrow},

        // single character
        {String(U"+"), TokenType::Plus},
        {String(U"-"), TokenType::Minus},
        {String(U"*"), TokenType::Asterisk},
        {String(U"/"), TokenType::Slash},
        {String(U"%"), TokenType::Percent},
        {String(U"^"), TokenType::Caret},
        {String(U"&"), TokenType::Ampersand},
        {String(U"|"), TokenType::Pipe},
        {String(U"~"), TokenType::Tilde},
        {String(U"="), TokenType::Assign},
        {String(U"<"), TokenType::Less},
        {String(U">"), TokenType::Greater},
        {String(U"."), TokenType::Dot},
        {String(U","), TokenType::Comma},
        {String(U":"), TokenType::Colon},
        {String(U";"), TokenType::Semicolon},
        {String(U"'"), TokenType::SingleQuote},
        {String(U"\""), TokenType::DoubleQuote},
        {String(U"("), TokenType::LeftParen},
        {String(U")"), TokenType::RightParen},
        {String(U"["), TokenType::LeftBracket},
        {String(U"]"), TokenType::RightBracket},
        {String(U"{"), TokenType::LeftBrace},
        {String(U"}"), TokenType::RightBrace},
        {String(U"?"), TokenType::Question},
        {String(U"!"), TokenType::Not},
    };

    // Reserved words and the token type each one produces.
    // NOTE(review): template arguments inferred from the initializer entries;
    // confirm they match the declaration in the Lexer header.
    const std::unordered_map<String, TokenType> Lexer::keyword_map{
        {String(U"and"), TokenType::And},
        {String(U"or"), TokenType::Or},
        {String(U"not"), TokenType::Not},
        {String(U"import"), TokenType::Import},
        {String(U"func"), TokenType::Function},
        {String(U"var"), TokenType::Variable},
        {String(U"const"), TokenType::Const},
        // {String(U"final"), TokenType::Final},
        {String(U"while"), TokenType::While},
        {String(U"for"), TokenType::For},
        {String(U"if"), TokenType::If},
        {String(U"else"), TokenType::Else},
        {String(U"new"), TokenType::New},
        {String(U"struct"), TokenType::Struct},
        {String(U"interface"), TokenType::Interface},
        {String(U"impl"), TokenType::Implement},
        {String(U"is"), TokenType::Is},
        {String(U"public"), TokenType::Public},
        {String(U"return"), TokenType::Return},
        {String(U"break"), TokenType::Break},
        {String(U"continue"), TokenType::Continue},
        {String(U"try"), TokenType::Try},
        {String(U"catch"), TokenType::Catch},
        {String(U"throw"), TokenType::Throw},
        // NOTE(review): this is the only keyword spelled with a capital
        // letter; confirm whether "finally" was intended.
        {String(U"Finally"), TokenType::Finally},
        {String(U"as"), TokenType::As},
        // {String(U"Null"), TokenType::TypeNull},
        // {String(U"Int"), TokenType::TypeInt},
        // {String(U"String"), TokenType::TypeString},
        // {String(U"Bool"), TokenType::TypeBool},
        // {String(U"Double"), TokenType::TypeDouble},
    };

    // Advances past the remainder of the current line, including its '\n',
    // and bumps the line counter. Does nothing more once the input is
    // exhausted, so the cursor is never moved past the end.
    void Lexer::skipLine()
    {
        // hasNext() is tested first so current() is never read at end of input.
        while (hasNext() and current() != U'\n')
        {
            next();
        }
        if (hasNext())
        {
            next(); // skip '\n'
            ++line;
        }
    }

    // Scans a run of alphanumeric characters / underscores starting at the
    // current character and classifies it as a keyword, a bool literal
    // ("true" / "false"), the null literal ("null") or a plain identifier.
    // Emits warning 2 for identifiers of length <= 1.
    Token Lexer::scanIdentifier()
    {
        String identifier;
        while (hasNext())
        {
            const char32_t c = current();
            if (!CharUtils::isAlnum(c) && c != U'_')
            {
                break;
            }
            identifier += c;
            next();
        }

        if (const auto keyword = keyword_map.find(identifier); keyword != keyword_map.end())
        {
            return Token(identifier, keyword->second);
        }
        if (identifier == U"true" || identifier == U"false")
        {
            return Token(identifier, TokenType::LiteralBool);
        }
        if (identifier == U"null")
        {
            // null instance
            return Token(identifier, TokenType::LiteralNull);
        }

        // Disabled check kept for reference:
        // const auto &toLower = [](const String &str) -> String
        // {
        //     String res;
        //     for (auto c : str)
        //     {
        //         res += CharUtils::toLower(c);
        //     }
        //     return res;
        // };
        // if (keyword_map.contains(toLower(identifier)))
        // {
        //     pushWarning(1, identifier); // Identifier is too similar to a keyword or a primitive type
        // }

        if (identifier.length() <= 1)
        {
            pushWarning(2, identifier); // The identifier is too abstract
        }
        return Token(identifier, TokenType::Identifier);
    }

    // Scans the body of a regular string literal; the caller has already
    // consumed the opening '"'. Backslash escapes (\n \t \v \b \" \' \\) are
    // decoded. Returns a LiteralString token, or sets `error` and returns
    // IllegalTok on an unsupported escape or when the input ends first.
    Token Lexer::scanString()
    {
        String str;
        bool unterminated = true;
        const size_t str_start_col = column - 1;

        while (hasNext())
        {
            const char32_t c = current();
            // NOTE(review): a newline ends the literal exactly like a closing
            // quote and is not reported as an error; confirm this is intended.
            if (c == U'"' || c == U'\n')
            {
                next();
                unterminated = false;
                break;
            }
            if (c != U'\\')
            {
                str += c;
                next();
                continue;
            }

            next(); // consume the backslash
            if (!hasNext())
            {
                // Input ended right after the backslash.
                error = SyntaxError(U"Unterminated FString", this->line, column, SourceInfo(this));
                return IllegalTok;
            }
            const char32_t ec = current();
            char32_t decoded = U'\0';
            if (!decodeEscape(ec, decoded))
            {
                error = SyntaxError(String(std::format("Unsupported escape character: {}", String(ec).toBasicString())),
                                    this->line,
                                    column,
                                    SourceInfo(this));
                return IllegalTok;
            }
            next(); // consume the escape character
            str += decoded;
        }

        if (unterminated)
        {
            error = SyntaxError(U"Unterminated FString", this->line, str_start_col, SourceInfo(this));
            return IllegalTok;
        }
        return Token(str, TokenType::LiteralString);
    }

    // Scans the body of a raw string literal (r"..."); the caller has already
    // consumed the `r"` prefix. Characters are copied verbatim, backslashes
    // included. Returns a LiteralString token, or sets `error` and returns
    // IllegalTok when the input ends before the literal does.
    Token Lexer::scanRawString()
    {
        String str;
        bool unterminated = true;
        const size_t str_start_col = column - 1;

        while (hasNext())
        {
            const char32_t c = current();
            // NOTE(review): as in scanString(), a newline terminates the
            // literal without an error; confirm this is intended.
            if (c == U'"' || c == U'\n')
            {
                next();
                unterminated = false;
                break;
            }
            str += c;
            next();
        }

        if (unterminated)
        {
            error = SyntaxError(U"Unterminated FString", this->line, str_start_col, SourceInfo(this));
            return IllegalTok;
        }
        return Token(str, TokenType::LiteralString);
    }

    // Scans the body of a multi-line string, which ends at three consecutive
    // '"' characters. Runs of one or two quotes are kept as content, and
    // backslash escapes are decoded as in scanString().
    // NOTE(review): nothing in this file calls this function; it presumably
    // expects the whole opening delimiter to be consumed already — confirm
    // when wiring it into nextToken().
    Token Lexer::scanMultilineString()
    {
        String str;
        bool unterminated = true;
        uint8_t pendingQuotes = 0; // consecutive '"' seen but not yet emitted
        const size_t str_start_col = column - 1;

        while (hasNext())
        {
            const char32_t c = current();
            if (c == U'"')
            {
                next();
                if (++pendingQuotes == 3)
                {
                    unterminated = false;
                    break;
                }
                continue;
            }

            // The quote run was shorter than the terminator: it is content.
            for (; pendingQuotes > 0; --pendingQuotes)
            {
                str += U'"';
            }

            if (c != U'\\')
            {
                str += c;
                next(); // advance, otherwise the loop never makes progress
                continue;
            }

            next(); // consume the backslash
            if (!hasNext())
            {
                error = SyntaxError(U"Unterminated FString", this->line, column, SourceInfo(this));
                return IllegalTok;
            }
            const char32_t ec = current();
            char32_t decoded = U'\0';
            if (!decodeEscape(ec, decoded))
            {
                error = SyntaxError(String(std::format("Unsupported escape character: {}", String(ec).toBasicString())),
                                    this->line,
                                    column,
                                    SourceInfo(this));
                return IllegalTok;
            }
            next(); // consume the escape character
            str += decoded;
        }

        if (unterminated)
        {
            error = SyntaxError(U"Unterminated FString", this->line, str_start_col, SourceInfo(this));
            return IllegalTok;
        }
        return Token(str, TokenType::LiteralString);
    }

    // Scans a numeric literal: digits, at most one '.', and an optional
    // lowercase 'e' exponent with an optional sign directly after it.
    // Returns a LiteralNumber token holding the literal text. Returns
    // IllegalTok (without setting `error`) when nothing was consumed, and
    // sets `error` and returns IllegalTok when the literal is malformed.
    Token Lexer::scanNumber()
    {
        String numStr;
        bool hasPoint = false;

        // Every malformed-literal path reports the same error.
        const auto illegalNumber = [&]() -> Token
        {
            error = SyntaxError(String(std::format("Illegal number literal: {}", numStr.toBasicString())),
                                this->line,
                                column,
                                SourceInfo(this));
            return IllegalTok;
        };

        while (hasNext())
        {
            const char32_t ch = current();
            const bool afterExponent = !numStr.empty() && (numStr.ends_with(U'e') || numStr.ends_with(U'E'));

            if (CharUtils::isDigit(ch) || ch == U'e')
            {
                // accepted; the exponent shape is validated below
            }
            else if ((ch == U'-' || ch == U'+') && afterExponent)
            {
                // a sign is only part of the literal right after the exponent marker
            }
            else if (ch == U'.' && !hasPoint)
            {
                hasPoint = true;
            }
            else
            {
                break;
            }
            numStr += ch;
            next();
        }

        if (numStr.empty())
        {
            return IllegalTok;
        }
        if (numStr.ends_with(U'e'))
        {
            return illegalNumber();
        }

        bool hasDigit = false;
        for (const auto c : numStr)
        {
            if (CharUtils::isDigit(c))
            {
                hasDigit = true;
                break;
            }
        }
        if (!hasDigit)
        {
            return illegalNumber();
        }

        const size_t ePos = numStr.find(U'e');
        if (ePos != String::npos)
        {
            // The exponent marker needs a mantissa before it and something after it.
            if (ePos == 0 || ePos + 1 >= numStr.length())
            {
                return illegalNumber();
            }

            bool hasDigitAfterE = false;
            for (size_t i = ePos + 1; i < numStr.length(); ++i)
            {
                const char32_t c = numStr[i];
                if (c == U'+' || c == U'-')
                {
                    // a sign is only legal immediately after the 'e'
                    if (i != ePos + 1)
                    {
                        return illegalNumber();
                    }
                    continue;
                }
                if (!CharUtils::isDigit(c))
                {
                    // rejects '.', a second 'e', etc. inside the exponent
                    return illegalNumber();
                }
                hasDigitAfterE = true;
            }
            if (!hasDigitAfterE)
            {
                return illegalNumber();
            }
        }

        return Token(numStr, TokenType::LiteralNumber);
    }

    // Scans an operator / punctuation token using longest match against
    // symbol_map: the symbol is extended one punctuation character at a time
    // for as long as it is still a prefix of some known symbol.
    // Sets `error` and returns IllegalTok when no known symbol matches.
    Token Lexer::scanSymbol()
    {
        String sym;
        sym += current();

        // True when at least one known symbol begins with `prefix`.
        const auto isSymbolPrefix = [&](const String &prefix) -> bool
        {
            for (const auto &entry : symbol_map)
            {
                if (entry.first.starts_with(prefix))
                {
                    return true;
                }
            }
            return false;
        };

        if (!isSymbolPrefix(sym))
        {
            error = SyntaxError(String(std::format("No such operator: {}", sym.toBasicString())),
                                this->line,
                                column,
                                SourceInfo(this));
            next();
            return IllegalTok;
        }

        while (hasNext())
        {
            const char32_t peek_ch = peek();
            if (!CharUtils::isPunct(peek_ch))
            {
                break;
            }
            const String candidate = sym + peek_ch;
            if (!isSymbolPrefix(candidate))
            {
                break;
            }
            next();
            sym = candidate;
        }

        const auto found = symbol_map.find(sym);
        if (found == symbol_map.end())
        {
            // `sym` is a prefix of a known symbol but not a symbol itself.
            error = SyntaxError(String(std::format("No such operator: {}", sym.toBasicString())),
                                this->line,
                                column,
                                SourceInfo(this));
            next();
            return IllegalTok;
        }

        next(); // step past the last character of the symbol
        return Token(sym, found->second);
    }

    // Scans a comment. Entry condition: the current character is '/' and the
    // following one is '/' (single-line comment) or '*' (multi-line comment).
    // Returns a Comments token holding the comment text without delimiters;
    // sets `error` and returns IllegalTok for an unterminated multi-line
    // comment.
    Token Lexer::scanComments()
    {
        String comment;
        if (peek() == U'/') // single-line comment
        {
            next(); // skip first '/'
            next(); // skip second '/'
            while (hasNext() and current() != U'\n')
            {
                comment += current();
                next();
            }
            if (hasNext())
            {
                next(); // skip the terminating '\n'
            }
        }
        else // multi-line comment
        {
            next(); // skip '/'
            next(); // skip '*'
            bool terminated = false;
            while (hasNext())
            {
                const char32_t c = current();
                if (c == U'*' and peek() == U'/')
                {
                    next(); // skip '*'
                    next(); // skip '/'
                    terminated = true;
                    break;
                }
                comment += c;
                next();
            }
            if (!terminated)
            {
                error = SyntaxError(String(U"Unterminated multiline comment"), this->line, column, SourceInfo(this));
                return IllegalTok;
            }
        }
        return Token(comment, TokenType::Comments);
    }

    // Produces the next token from the input: skips leading whitespace and
    // comments, then dispatches on the first significant character.
    // Returns EOFTok once only whitespace (or nothing) remains. On a
    // character that cannot start any token, sets `error` and returns
    // IllegalTok. Every returned token carries the line/column at which it
    // started.
    Token Lexer::nextToken()
    {
        if (!hasNext())
        {
            return EOFTok.setPos(getCurrentLine(), getCurrentColumn());
        }

        char32_t ch = current();
        while (hasNext())
        {
            ch = current();
            if (!CharUtils::isSpace(ch))
            {
                break;
            }
            next();
        }
        last_line = getCurrentLine();
        last_column = getCurrentColumn();

        if (!hasNext())
        {
            // Only trailing whitespace was left: that is end of input, not an
            // untokenizable character.
            return EOFTok.setPos(getCurrentLine(), getCurrentColumn());
        }

        if (ch == U'/')
        {
            const char32_t c = peek();
            if (c != U'/' and c != U'*')
            {
                // plain division operator
                next();
                return Token(U"/", this->symbol_map.at(U"/")).setPos(last_line, last_column);
            }
            scanComments().setPos(last_line, last_column);
            return nextToken(); // now we ignore comments to avoid some stupid bugs
        }

        if (ch == U'r' and peek() == U'"')
        {
            // r"" -> raw string
            next(); // skip 'r'
            next(); // skip opening '"'
            return scanRawString().setPos(last_line, last_column);
        }

        if (CharUtils::isAlpha(ch) || ch == U'_')
        {
            return scanIdentifier().setPos(last_line, last_column);
        }
        if (ch == U'"')
        {
            next(); // skip opening '"'
            return scanString().setPos(last_line, last_column);
        }
        if (CharUtils::isDigit(ch))
        {
            return scanNumber().setPos(last_line, last_column);
        }
        if (CharUtils::isPunct(ch))
        {
            return scanSymbol().setPos(last_line, last_column);
        }

        error = SyntaxError(String(std::format("Cannot tokenize char: '{}'", String(ch).toBasicString())),
                            this->line,
                            column,
                            SourceInfo(this));
        if (hasNext())
        {
            next();
        }
        return IllegalTok.setPos(last_line, last_column);
    }

} // namespace Fig