v0.3.1

2025-12-19 20:38:40 +08:00
commit 73c828d99b
83 changed files with 13068 additions and 0 deletions
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -0,0 +1,524 @@
+#include <fig_string.hpp>
+#include <error.hpp>
+#include <token.hpp>
+#include <lexer.hpp>
+
+#include <fig_string.hpp>
+#include <utils.hpp>
+
+namespace Fig
+{
+
+    // Lookup table from the spelling of an operator / punctuation symbol to its
+    // token type. scanSymbol() tries the two-character spellings first and falls
+    // back to the single-character ones.
+    const std::unordered_map<FString, TokenType> Lexer::symbol_map{
+        // Two-character symbols
+        {FString(u8"=="), TokenType::Equal},
+        {FString(u8"!="), TokenType::NotEqual},
+        {FString(u8"<="), TokenType::LessEqual},
+        {FString(u8">="), TokenType::GreaterEqual},
+        {FString(u8"<<"), TokenType::ShiftLeft},
+        {FString(u8">>"), TokenType::ShiftRight},
+        {FString(u8"+="), TokenType::PlusEqual},
+        {FString(u8"-="), TokenType::MinusEqual},
+        {FString(u8"*="), TokenType::AsteriskEqual},
+        {FString(u8"/="), TokenType::SlashEqual},
+        {FString(u8"%="), TokenType::PercentEqual},
+        {FString(u8"^="), TokenType::CaretEqual},
+        {FString(u8"++"), TokenType::DoublePlus},
+        {FString(u8"--"), TokenType::DoubleMinus},
+        {FString(u8"&&"), TokenType::DoubleAmpersand},
+        {FString(u8"||"), TokenType::DoublePipe},
+        {FString(u8":="), TokenType::Walrus},
+        {FString(u8"**"), TokenType::Power},
+        {FString(u8"->"), TokenType::RightArrow},
+
+        // Single-character symbols
+        // NOTE(review): there is no entry for a lone "!" (only "!="); confirm
+        // that this is intentional given the `not` keyword.
+        {FString(u8"+"), TokenType::Plus},
+        {FString(u8"-"), TokenType::Minus},
+        {FString(u8"*"), TokenType::Asterisk},
+        {FString(u8"/"), TokenType::Slash},
+        {FString(u8"%"), TokenType::Percent},
+        {FString(u8"^"), TokenType::Caret},
+        {FString(u8"&"), TokenType::Ampersand},
+        {FString(u8"|"), TokenType::Pipe},
+        {FString(u8"~"), TokenType::Tilde},
+        {FString(u8"="), TokenType::Assign},
+        {FString(u8"<"), TokenType::Less},
+        {FString(u8">"), TokenType::Greater},
+        {FString(u8"."), TokenType::Dot},
+        {FString(u8","), TokenType::Comma},
+        {FString(u8":"), TokenType::Colon},
+        {FString(u8";"), TokenType::Semicolon},
+        {FString(u8"'"), TokenType::SingleQuote},
+        {FString(u8"\""), TokenType::DoubleQuote},
+        {FString(u8"("), TokenType::LeftParen},
+        {FString(u8")"), TokenType::RightParen},
+        {FString(u8"["), TokenType::LeftBracket},
+        {FString(u8"]"), TokenType::RightBracket},
+        {FString(u8"{"), TokenType::LeftBrace},
+        {FString(u8"}"), TokenType::RightBrace}};
+
+    // Reserved words of the language, mapped to their token types.
+    // scanIdentifier() consults this table for every scanned word. `true`,
+    // `false` and `null` are not listed here: scanIdentifier() turns them into
+    // literal tokens itself.
+    const std::unordered_map<FString, TokenType> Lexer::keyword_map{
+        {FString(u8"and"), TokenType::And},
+        {FString(u8"or"), TokenType::Or},
+        {FString(u8"not"), TokenType::Not},
+        {FString(u8"import"), TokenType::Import},
+        {FString(u8"fun"), TokenType::Function},
+        {FString(u8"var"), TokenType::Variable},
+        {FString(u8"const"), TokenType::Const},
+        {FString(u8"final"), TokenType::Final},
+        {FString(u8"while"), TokenType::While},
+        {FString(u8"for"), TokenType::For},
+        {FString(u8"if"), TokenType::If},
+        {FString(u8"else"), TokenType::Else},
+        {FString(u8"struct"), TokenType::Struct},
+        {FString(u8"interface"), TokenType::Interface},
+        {FString(u8"implement"), TokenType::Implement},
+        {FString(u8"public"), TokenType::Public},
+        {FString(u8"return"), TokenType::Return},
+
+
+        // Disabled entries: primitive type names are currently not keywords.
+        // {FString(u8"Null"), TokenType::TypeNull},
+        // {FString(u8"Int"), TokenType::TypeInt},
+        // {FString(u8"String"), TokenType::TypeString},
+        // {FString(u8"Bool"), TokenType::TypeBool},
+        // {FString(u8"Double"), TokenType::TypeDouble},
+    };
+    // Advances the iterator past the remainder of the current line, steps over
+    // the terminating '\n' and bumps the `line` counter.
+    void Lexer::skipLine()
+    {
+        // hasNext() is tested first so that `*it` is only read while the lexer
+        // reports remaining input (the original dereferenced before checking).
+        while (hasNext() and *it != U'\n')
+        {
+            next();
+        }
+        // NOTE(review): these two statements also run when the input ended
+        // without a trailing newline; kept as in the original because the
+        // behaviour of next() at end of input is not visible from this file.
+        next(); // skip '\n'
+        ++line;
+    }
+    // Scans a word made of alphanumeric characters and underscores, starting at
+    // the current iterator position, and classifies it:
+    //   - a keyword_map entry        -> the mapped keyword token
+    //   - "true" / "false"           -> LiteralBool
+    //   - "null"                     -> LiteralNull
+    //   - anything else              -> Identifier
+    // For plain identifiers two warnings may be pushed: code 1 when the
+    // lower-cased word equals a keyword, code 2 when the word is at most one
+    // character long.
+    Token Lexer::scanIdentifier()
+    {
+        FString word;
+
+        // Collect characters until the first one that cannot be part of a word.
+        for (; hasNext(); next())
+        {
+            UTF8Char cur = *it;
+            if (!(cur.isAlnum() || cur == U'_'))
+            {
+                break;
+            }
+            word += cur.getString();
+        }
+
+        const auto keyword = this->keyword_map.find(word);
+        if (keyword != this->keyword_map.end())
+        {
+            return Token(word, keyword->second);
+        }
+        if (word == u8"true" || word == u8"false")
+        {
+            return Token(word, TokenType::LiteralBool);
+        }
+        if (word == u8"null")
+        {
+            // the null instance
+            return Token(word, TokenType::LiteralNull);
+        }
+
+        // Plain identifier from here on; emit style warnings where applicable.
+        if (keyword_map.contains(Utils::toLower(word)))
+        {
+            pushWarning(1, word); // Identifier is too similar to a keyword or a primitive type
+        }
+        if (word.length() <= 1)
+        {
+            pushWarning(2, word); // The identifier is too abstract
+        }
+        return Token(word, TokenType::Identifier);
+    }
+    // Scans the body of a double-quoted string literal and returns it as a
+    // LiteralString token with escape sequences decoded.
+    //
+    // nextToken() consumes the opening '"' before calling this function.
+    // Supported escapes: \n \t \v \b \" \' \\ .
+    // On failure `error` is set to a SyntaxError and IllegalTok is returned.
+    Token Lexer::scanString()
+    {
+        FString str;
+        bool unterminated = true;
+        size_t str_start_col = it.column() - 1; // presumably the column of the opening quote
+        while (hasNext())
+        {
+            UTF8Char c = *it;
+            if (c == U'"' || c == U'\n')
+            {
+                // NOTE(review): a raw newline closes the literal exactly like a
+                // closing quote does; confirm that this is intended rather than
+                // an "unterminated string" error.
+                next();
+                unterminated = false;
+                break;
+            }
+            else if (c == U'\\') // c is '\'
+            {
+                if (it.isEnd())
+                {
+                    error = SyntaxError(u8"Unterminated FString", this->line, it.column());
+                    return IllegalTok;
+                }
+                next(); // step over the backslash
+                if (!hasNext())
+                {
+                    // The backslash was the last character of the input: there
+                    // is no escape character to read, so do not dereference.
+                    error = SyntaxError(u8"Unterminated FString", this->line, it.column());
+                    return IllegalTok;
+                }
+                UTF8Char ec = *it;
+                if (ec == U'n')
+                {
+                    next();
+                    str += u8"\n";
+                }
+                else if (ec == U't')
+                {
+                    next();
+                    str += u8"\t";
+                }
+                else if (ec == U'v')
+                {
+                    next();
+                    str += u8"\v";
+                }
+                else if (ec == U'b')
+                {
+                    next();
+                    str += u8"\b";
+                }
+                else if (ec == U'"')
+                {
+                    next();
+                    str += u8"\"";
+                }
+                else if (ec == U'\'')
+                {
+                    next();
+                    str += u8"'";
+                }
+                else if (ec == U'\\')
+                {
+                    // Escaped backslash, as already accepted by scanMultilineString().
+                    next();
+                    str += u8"\\";
+                }
+                else
+                {
+                    error = SyntaxError(FStringView(
+                                            std::format(
+                                                "Unsupported escape character: {}",
+                                                FString(ec.getString()).toBasicString())),
+                                        this->line,
+                                        it.column());
+                    return IllegalTok;
+                }
+            }
+            else
+            {
+                str += c.getString();
+                next();
+            }
+        }
+        if (unterminated)
+        {
+            // Input ran out before a closing quote (or newline) was seen.
+            error = SyntaxError(u8"Unterminated FString", this->line, str_start_col);
+            return IllegalTok;
+        }
+        return Token(str, TokenType::LiteralString);
+    }
+    // Scans the body of a raw string literal (r"..."): characters are copied
+    // verbatim, no escape processing takes place. nextToken() consumes the
+    // leading `r"` before calling this function.
+    // Returns a LiteralString token, or sets `error` and returns IllegalTok when
+    // the input ends before a terminator is found.
+    Token Lexer::scanRawString()
+    {
+        FString raw;
+        bool closed = false;
+        size_t opening_col = it.column() - 1;
+
+        while (!closed && hasNext())
+        {
+            UTF8Char cur = *it;
+            // NOTE(review): a newline ends the literal just like '"' does;
+            // confirm that this is the intended behaviour.
+            closed = (cur == U'"' || cur == U'\n');
+            if (!closed)
+            {
+                raw += cur.getString();
+            }
+            next(); // consume either the content character or the terminator
+        }
+
+        if (!closed)
+        {
+            error = SyntaxError(u8"Unterminated FString", this->line, opening_col);
+            return IllegalTok;
+        }
+        return Token(raw, TokenType::LiteralString);
+    }
+    // Scans the body of a multi-line string literal that is closed by three
+    // consecutive double quotes, decoding escape sequences on the way.
+    //
+    // Assumes the caller has already consumed the complete opening delimiter
+    // -- TODO confirm; no caller is visible in this file.
+    // Supported escapes: \n \t \v \b \" \' \\ .
+    // Returns a LiteralString token, or sets `error` and returns IllegalTok.
+    //
+    // Fixes relative to the previous version:
+    //   - ordinary characters are now consumed (the loop used to spin forever),
+    //   - the literal closes on the third quote instead of requiring a fourth,
+    //   - runs of one or two quotes are kept as content instead of being lost,
+    //   - a backslash at the very end of the input is reported instead of
+    //     reading past the end.
+    Token Lexer::scanMultilineString()
+    {
+        FString str;
+        bool unterminated = true;
+
+        uint8_t end = 0; // number of consecutive '"' seen and not yet accounted for
+        size_t str_start_col = it.column() - 1;
+        while (hasNext())
+        {
+            UTF8Char c = *it;
+            if (c == U'"')
+            {
+                next();
+                if (++end == 3)
+                {
+                    // Third quote in a row: this is the closing delimiter.
+                    unterminated = false;
+                    break;
+                }
+                continue;
+            }
+
+            // The quote run (if any) was shorter than three, so those quotes
+            // are ordinary content.
+            for (; end > 0; --end)
+            {
+                str += u8"\"";
+            }
+
+            if (c == U'\\') // c is '\'
+            {
+                if (it.isEnd())
+                {
+                    error = SyntaxError(u8"Unterminated FString", this->line, it.column());
+                    return IllegalTok;
+                }
+                next(); // step over the backslash
+                if (!hasNext())
+                {
+                    // Nothing follows the backslash; do not dereference.
+                    error = SyntaxError(u8"Unterminated FString", this->line, it.column());
+                    return IllegalTok;
+                }
+                UTF8Char ec = *it;
+                if (ec == U'n')
+                {
+                    next();
+                    str += u8"\n";
+                }
+                else if (ec == U't')
+                {
+                    next();
+                    str += u8"\t";
+                }
+                else if (ec == U'v')
+                {
+                    next();
+                    str += u8"\v";
+                }
+                else if (ec == U'b')
+                {
+                    next();
+                    str += u8"\b";
+                }
+                else if (ec == U'"')
+                {
+                    // An escaped quote never counts towards the closing delimiter.
+                    next();
+                    str += u8"\"";
+                }
+                else if (ec == U'\'')
+                {
+                    next();
+                    str += u8"'";
+                }
+                else if (ec == U'\\')
+                {
+                    next();
+                    str += u8"\\";
+                }
+                else
+                {
+                    error = SyntaxError(FStringView(
+                                            std::format(
+                                                "Unsupported escape character: {}",
+                                                FString(ec.getString()).toBasicString())),
+                                        this->line,
+                                        it.column());
+                    return IllegalTok;
+                }
+            }
+            else
+            {
+                str += c.getString();
+                next();
+            }
+        }
+        if (unterminated)
+        {
+            // Input ended before three consecutive quotes were seen.
+            error = SyntaxError(u8"Unterminated FString", this->line, str_start_col);
+            return IllegalTok;
+        }
+        return Token(str, TokenType::LiteralString);
+    }
+    // Scans a numeric literal starting at the current position and returns it
+    // as a LiteralNumber token holding the literal's spelling.
+    //
+    // Accepted shapes:
+    //     114514
+    //     1145.14
+    //     1.14e3  -> 1140
+    //     1.14e-3 -> 0.00114
+    // (nextToken() only dispatches here when the first character is a digit,
+    // so a spelling such as ".3" does not reach this function from there.)
+    //
+    // A leading minus sign is not handled here: it is left to scanSymbol(), and
+    // the parser decides whether it belongs to the number or is an operator.
+    //
+    // Sets `error` and returns IllegalTok when the exponent has no digits.
+    Token Lexer::scanNumber()
+    {
+        FString numStr;
+        bool hasPoint = false;
+        bool hasExponent = false;
+        while (hasNext())
+        {
+            UTF8Char ch = *it;
+            if (ch.isDigit())
+            {
+                numStr += ch.getString();
+                next();
+            }
+            else if (ch == U'e' and not hasExponent)
+            {
+                // At most one exponent marker per literal.
+                hasExponent = true;
+                numStr += ch.getString();
+                next();
+            }
+            else if (ch == U'-' and numStr.ends_with(U'e'))
+            {
+                // A minus sign is part of the literal only directly after 'e'.
+                numStr += ch.getString();
+                next();
+            }
+            else if (ch == U'.' and not hasPoint and not hasExponent)
+            {
+                // One decimal point, and only in the mantissa: the exponent
+                // must be an integer.
+                hasPoint = true;
+                numStr += ch.getString();
+                next();
+            }
+            else
+            {
+                break;
+            }
+        }
+        // Legality check: the exponent marker (with or without its sign) must
+        // be followed by at least one digit. ends_with() is used instead of
+        // dereferencing end(), which is past the last character.
+        if (numStr.ends_with(U'e') or numStr.ends_with(U'-'))
+        {
+            // Message text is intentionally left byte-identical to the
+            // previous version.
+            error = SyntaxError(FStringView(
+                                    std::format("Ellegal number literal: {}", numStr.toBasicString())),
+                                this->line, it.column());
+            return IllegalTok;
+        }
+        return Token(numStr, TokenType::LiteralNumber);
+    }
+    // Scans an operator / punctuation token at the current position.
+    //
+    // The longest match wins: if the current character together with the next
+    // one forms a two-character entry of symbol_map, both are consumed;
+    // otherwise only the current character is used.
+    //
+    // Returns the matching token. If the symbol is not in symbol_map, `error`
+    // is set to a SyntaxError, the character is consumed and IllegalTok is
+    // returned (previously symbol_map.at() threw std::out_of_range here).
+    Token Lexer::scanSymbol()
+    {
+        FString sym;
+        UTF8Char ch = *it;
+
+        sym += ch.getString();
+        UTF8Char peek = UTF8Char(u8"");
+        if (hasNext() and (peek = it.peek()).isPunct()) // look at the following symbol character
+        {
+            FString symd = FString(sym + peek.getString());
+            if (this->symbol_map.contains(symd))
+            {
+                // Operator length is 2
+                next();
+                sym = symd;
+            }
+        }
+
+        // Single lookup; also covers the case where no punctuation follows.
+        const auto found = this->symbol_map.find(sym);
+        if (found == this->symbol_map.end())
+        {
+            error = SyntaxError(FStringView(
+                                    std::format("No such a operator: {}", sym.toBasicString())),
+                                this->line, it.column());
+            next(); // skip the offending character so lexing can continue
+            return IllegalTok;
+        }
+        next();
+        return Token(sym, found->second);
+    }
+    // Scans a comment and returns it as a Comments token whose value is the
+    // comment text without its delimiters.
+    //
+    // Entry condition: the current character is '/' and the following one is
+    // either '/' (line comment) or '*' (block comment); nextToken() checks this.
+    // An unterminated block comment sets `error` and returns IllegalTok.
+    //
+    // Fixes relative to the previous version: the current character is re-read
+    // on every iteration (it used to be read once, so the loops never saw the
+    // terminator), the character following "*/" is no longer swallowed, and the
+    // iterator is not advanced once the input is exhausted.
+    Token Lexer::scanComments()
+    {
+        // current char is '/'
+        FString comment;
+        if (it.peek() == U'/')
+        {
+            next(); // skip first '/'
+            next(); // skip second '/'
+            while (hasNext())
+            {
+                UTF8Char c = *it;
+                if (c == U'\n')
+                {
+                    break;
+                }
+                comment += c.getString();
+                next();
+            }
+            if (hasNext())
+            {
+                next(); // consume the terminating '\n'
+            }
+        }
+        else
+        {
+            next(); // skip '/'
+            next(); // skip '*'
+            bool terminated = false;
+            while (hasNext())
+            {
+                UTF8Char c = *it;
+                if (c == U'*' and hasNext() and it.peek() == U'/')
+                {
+                    next(); // skip '*'
+                    next(); // skip '/'; the iterator now sits on the first char after the comment
+                    terminated = true;
+                    break;
+                }
+                comment += c.getString();
+                next();
+            }
+            if (!terminated)
+            {
+                // The loop only ends without a terminator once hasNext() is
+                // false, so there is nothing left to skip here.
+                error = SyntaxError(FStringView(u8"Unterminated multiline comment"), this->line, it.column());
+                return IllegalTok;
+            }
+        }
+        return Token(comment, TokenType::Comments);
+    }
+    // Produces the next token from the input.
+    //
+    // Leading whitespace is skipped, the start position is remembered in
+    // last_line / last_column, and the token is then scanned by the helper that
+    // matches its first character:
+    //   r"            -> scanRawString()
+    //   letter or '_' -> scanIdentifier()
+    //   '"'           -> scanString()
+    //   digit         -> scanNumber()
+    //   '/'           -> scanComments() for "//" and "/*", otherwise the Slash token
+    //   punctuation   -> scanSymbol()
+    // Any other character sets `error` and yields IllegalTok. EOFTok is returned
+    // once the input is exhausted.
+    Token Lexer::nextToken()
+    {
+        if (!hasNext())
+        {
+            return EOFTok;
+        }
+        UTF8Char ch = *it;
+        while (ch.isSpace())
+        {
+            next();
+            // Check for end of input before reading the next character; the
+            // previous version dereferenced first.
+            if (!hasNext())
+            {
+                return EOFTok.setPos(getCurrentLine(), getCurrentColumn());
+            }
+            ch = *it;
+        }
+        last_line = getCurrentLine();
+        last_column = getCurrentColumn();
+        if (ch == U'r' and hasNext() and it.peek() == U'"')
+        {
+            // r""
+            // raw FString: skip both the 'r' and the opening quote
+            next();
+            next();
+            return scanRawString().setPos(last_line, last_column);
+        }
+        if (ch.isAlpha() || ch == U'_')
+        {
+            return scanIdentifier().setPos(last_line, last_column);
+        }
+        else if (ch == U'"')
+        {
+            next(); // skip the opening quote
+            return scanString().setPos(last_line, last_column);
+        }
+        else if (ch.isDigit())
+        {
+            return scanNumber().setPos(last_line, last_column);
+        }
+        else if (ch == U'/')
+        {
+            UTF8Char c{u8""};
+            if (!hasNext())
+            {
+                next();
+                return Token(u8"/", this->symbol_map.at(u8"/")).setPos(last_line, last_column);
+            }
+            c = it.peek();
+            if (c != U'/' and c != U'*')
+            {
+                // A plain division operator, not the start of a comment.
+                next();
+                return Token(u8"/", this->symbol_map.at(u8"/")).setPos(last_line, last_column);
+            }
+            return scanComments().setPos(last_line, last_column);
+        }
+        else if (ch.isPunct())
+        {
+            return scanSymbol().setPos(last_line, last_column);
+        }
+        else
+        {
+            error = SyntaxError(FStringView(
+                                    std::format("Cannot tokenize char: '{}'", FString(ch.getString()).toBasicString())),
+                                this->line, it.column());
+            if (hasNext())
+            {
+                next(); // skip the offending character
+            }
+            return IllegalTok.setPos(last_line, last_column);
+        }
+    }
+
+} // namespace Fig