diff --git a/.clang-format b/.clang-format index 0a15f56..09e9f78 100644 --- a/.clang-format +++ b/.clang-format @@ -33,13 +33,13 @@ AllowShortBlocksOnASingleLine: true AllowShortCaseLabelsOnASingleLine: true # 允许短的函数放在同一行: None, InlineOnly(定义在类中), Empty(空函数), Inline(定义在类中,空函数), All -AllowShortFunctionsOnASingleLine: Inline +AllowShortFunctionsOnASingleLine: Empty # 允许短的if语句保持在同一行 -AllowShortIfStatementsOnASingleLine: true +AllowShortIfStatementsOnASingleLine: false # 允许短的循环保持在同一行 -AllowShortLoopsOnASingleLine: true +AllowShortLoopsOnASingleLine: false # 总是在返回类型后换行: None, All, TopLevel(顶级函数,不包括在类中的函数), # AllDefinitions(所有的定义,不包括声明), TopLevelDefinitions(所有的顶级函数的定义) diff --git a/.gitignore b/.gitignore index 0239152..55f9b92 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ build/ .DS_Store .vscode -.VSCodeCounter \ No newline at end of file +.VSCodeCounter + +/test.fig \ No newline at end of file diff --git a/src/Core/SourceLocations.hpp b/src/Core/SourceLocations.hpp index 2722cfd..b72b51e 100644 --- a/src/Core/SourceLocations.hpp +++ b/src/Core/SourceLocations.hpp @@ -1,7 +1,6 @@ #pragma once #include -#include namespace Fig { diff --git a/src/Deps/Deps.hpp b/src/Deps/Deps.hpp index 241b69b..25bd1fc 100644 --- a/src/Deps/Deps.hpp +++ b/src/Deps/Deps.hpp @@ -3,11 +3,18 @@ #include #include #include +#include + +#include namespace Fig { #ifdef __FCORE_LINK_DEPS using Deps::String; using Deps::HashMap; + using Deps::CharUtils; + + template + using Result = std::expected<_Tp, _Err>; #endif }; \ No newline at end of file diff --git a/src/Deps/String/String.hpp b/src/Deps/String/String.hpp index ab308cc..c5cf6c1 100644 --- a/src/Deps/String/String.hpp +++ b/src/Deps/String/String.hpp @@ -17,7 +17,8 @@ namespace Fig::Deps { for (size_t i = 0; i < n; ++i) { - if (static_cast(data[i]) >= 128) return false; + if (static_cast(data[i]) >= 128) + return false; } return true; } @@ -26,7 +27,8 @@ namespace Fig::Deps { for (size_t i = 0; i < n; ++i) { - if (data[i] >= 128) return false; + if (data[i] >= 128) + return false; } return true; } @@ -100,8 +102,14 @@ namespace Fig::Deps _length = other._length; mode = other.mode; - if (mode == Mode::ASCII_SSO) { memcpy(sso, other.sso, sizeof(unsigned char) * _length); } - else if (mode == Mode::ASCII_HEP) { new (&ascii) std::vector(other.ascii); } + if (mode == Mode::ASCII_SSO) + { + memcpy(sso, other.sso, sizeof(unsigned char) * _length); + } + else if (mode == Mode::ASCII_HEP) + { + new (&ascii) std::vector(other.ascii); + } else { new (&utf32) std::vector(other.utf32); @@ -134,24 +142,33 @@ namespace Fig::Deps { // pass } - if (mode == Mode::ASCII_HEP) { ascii.~vector(); } - if (mode == Mode::UTF32_HEP) { utf32.~vector(); } + if (mode == Mode::ASCII_HEP) + { + ascii.~vector(); + } + if (mode == Mode::UTF32_HEP) + { + utf32.~vector(); + } } void ensure_utf32() { - if (mode == Mode::UTF32_HEP) return; + if (mode == Mode::UTF32_HEP) + return; std::vector tmp; tmp.reserve(_length); if (mode == Mode::ASCII_SSO) { - for (size_t i = 0; i < _length; ++i) tmp.push_back(static_cast(sso[i])); + for (size_t i = 0; i < _length; ++i) + tmp.push_back(static_cast(sso[i])); } else // ASCII_HEP { - for (unsigned char c : ascii) tmp.push_back(static_cast(c)); + for (unsigned char c : ascii) + tmp.push_back(static_cast(c)); } destroy(); @@ -166,7 +183,8 @@ namespace Fig::Deps std::vector tmp; tmp.reserve(_length); - for (size_t i = 0; i < _length; ++i) tmp.push_back(sso[i]); + for (size_t i = 0; i < _length; ++i) + tmp.push_back(sso[i]); mode = Mode::ASCII_HEP; new (&ascii) std::vector(std::move(tmp)); @@ -223,7 +241,8 @@ namespace Fig::Deps { assert(data); size_t n = 0; - while (data[n] != 0) ++n; + while (data[n] != 0) + ++n; init(data, n); } @@ -236,7 +255,8 @@ namespace Fig::Deps if (n <= SSO_SIZE && StringUtils::is_pure_ascii(data, n)) { mode = Mode::ASCII_SSO; - for (size_t i = 0; i < n; ++i) sso[i] = static_cast(data[i]); + for (size_t i = 0; i < n; ++i) + sso[i] = static_cast(data[i]); return; } @@ -245,7 +265,8 @@ namespace Fig::Deps mode = Mode::ASCII_HEP; new (&ascii) std::vector(); ascii.reserve(n); - for (size_t i = 0; i < n; ++i) ascii.push_back(static_cast(data[i])); + for (size_t i = 0; i < n; ++i) + ascii.push_back(static_cast(data[i])); return; } @@ -255,10 +276,19 @@ namespace Fig::Deps } public: - size_t length() const noexcept { return _length; } - size_t size() const noexcept { return _length; } + size_t length() const noexcept + { + return _length; + } + size_t size() const noexcept + { + return _length; + } - bool empty() const noexcept { return _length == 0; } + bool empty() const noexcept + { + return _length == 0; + } void reserve(size_t n) { if (mode == Mode::ASCII_HEP) @@ -274,7 +304,10 @@ namespace Fig::Deps { // pass } - if (mode == Mode::ASCII_HEP) { ascii.clear(); } + if (mode == Mode::ASCII_HEP) + { + ascii.clear(); + } else { utf32.clear(); @@ -283,34 +316,71 @@ namespace Fig::Deps void shrink_to_fit() noexcept { - if (mode == Mode::ASCII_HEP) { ascii.shrink_to_fit(); } + if (mode == Mode::ASCII_HEP) + { + ascii.shrink_to_fit(); + } else { utf32.shrink_to_fit(); } } - ~String() noexcept { destroy(); } + ~String() noexcept + { + destroy(); + } String() noexcept { mode = Mode::ASCII_SSO; _length = 0; } - String(const String &other) noexcept { copyfrom(other); } - String(String &&other) noexcept { movefrom(std::move(other)); } - String(const char *str) { init(str); } - String(const char32_t *str) { init(str); } - String(const std::string &s) { init(s.data(), s.size()); } + String(const String &other) noexcept + { + copyfrom(other); + } + String(String &&other) noexcept + { + movefrom(std::move(other)); + } + String(const char *str) + { + init(str); + } + String(const char32_t *str) + { + init(str); + } + String(char32_t c) + { + init(""); + push_back(c); + } + String(char c) + { + init(""); + push_back(static_cast(c)); + } + String(const std::string &s) + { + init(s.data(), s.size()); + } static String fromPureAscii(const char *str) { String string; string._length = std::strlen(str); - if (string._length <= SSO_SIZE) { memcpy(string.sso, str, string._length); } + if (string._length <= SSO_SIZE) + { + memcpy(string.sso, str, string._length); + } else { string.ascii.reserve(string._length); - for (size_t i = 0; i < string._length; ++i) { string.ascii.push_back(str[i]); } + for (size_t i = 0; i < string._length; ++i) + { + string.ascii.push_back(str[i]); + } } return string; @@ -328,13 +398,15 @@ namespace Fig::Deps String &operator=(String &&other) noexcept { - if (this != &other) movefrom(std::move(other)); + if (this != &other) + movefrom(std::move(other)); return *this; } String &operator+=(const String &rhs) { - if (rhs._length == 0) return *this; + if (rhs._length == 0) + return *this; // 两边都是 ASCII bool this_ascii = (mode == Mode::ASCII_SSO || mode == Mode::ASCII_HEP); @@ -356,7 +428,8 @@ namespace Fig::Deps return *this; } - if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); + if (mode == Mode::ASCII_SSO) + promote_sso_ascii_to_heap(); // 追加 if (rhs.mode == Mode::ASCII_SSO) @@ -377,11 +450,13 @@ namespace Fig::Deps if (mode == Mode::ASCII_SSO) { - for (size_t i = 0; i < _length; ++i) tmp.push_back(static_cast(sso[i])); + for (size_t i = 0; i < _length; ++i) + tmp.push_back(static_cast(sso[i])); } else // ASCII_HEP { - for (unsigned char c : ascii) tmp.push_back(static_cast(c)); + for (unsigned char c : ascii) + tmp.push_back(static_cast(c)); } destroy(); @@ -389,14 +464,19 @@ namespace Fig::Deps new (&utf32) std::vector(std::move(tmp)); } - if (rhs.mode == Mode::UTF32_HEP) { utf32.insert(utf32.end(), rhs.utf32.begin(), rhs.utf32.end()); } + if (rhs.mode == Mode::UTF32_HEP) + { + utf32.insert(utf32.end(), rhs.utf32.begin(), rhs.utf32.end()); + } else if (rhs.mode == Mode::ASCII_SSO) { - for (size_t i = 0; i < rhs._length; ++i) utf32.push_back(static_cast(rhs.sso[i])); + for (size_t i = 0; i < rhs._length; ++i) + utf32.push_back(static_cast(rhs.sso[i])); } else // ASCII_HEP { - for (unsigned char c : rhs.ascii) utf32.push_back(static_cast(c)); + for (unsigned char c : rhs.ascii) + utf32.push_back(static_cast(c)); } _length = utf32.size(); @@ -425,7 +505,8 @@ namespace Fig::Deps return; } - if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); + if (mode == Mode::ASCII_SSO) + promote_sso_ascii_to_heap(); if (mode == Mode::ASCII_HEP) { @@ -477,7 +558,8 @@ namespace Fig::Deps String &append(size_t count, u32 cp) { - for (size_t i = 0; i < count; ++i) push_back(cp); + for (size_t i = 0; i < count; ++i) + push_back(cp); return *this; } @@ -523,7 +605,10 @@ namespace Fig::Deps // UTF32_HEP -> UTF-8 encode for (u32 cp : utf32) { - if (cp <= 0x7F) { out.push_back(static_cast(cp)); } + if (cp <= 0x7F) + { + out.push_back(static_cast(cp)); + } else if (cp <= 0x7FF) { out.push_back(static_cast(0xC0 | (cp >> 6))); @@ -547,18 +632,24 @@ namespace Fig::Deps return out; } - friend std::ostream &operator<<(std::ostream &os, const String &s) { return os << s.toStdString(); } + friend std::ostream &operator<<(std::ostream &os, const String &s) + { + return os << s.toStdString(); + } friend bool operator==(const String &a, const String &b) noexcept { - if (a._length != b._length) return false; + if (a._length != b._length) + return false; // 同模式 if (a.mode == b.mode) { - if (a.mode == Mode::ASCII_SSO) return std::memcmp(a.sso, b.sso, a._length) == 0; + if (a.mode == Mode::ASCII_SSO) + return std::memcmp(a.sso, b.sso, a._length) == 0; - if (a.mode == Mode::ASCII_HEP) return a.ascii == b.ascii; + if (a.mode == Mode::ASCII_HEP) + return a.ascii == b.ascii; return a.utf32 == b.utf32; } @@ -582,18 +673,23 @@ namespace Fig::Deps if (ascii_str.mode == Mode::ASCII_SSO) { for (size_t i = 0; i < ascii_str._length; ++i) - if (static_cast(ascii_str.sso[i]) != utf32_str.utf32[i]) return false; + if (static_cast(ascii_str.sso[i]) != utf32_str.utf32[i]) + return false; } else { for (size_t i = 0; i < ascii_str._length; ++i) - if (static_cast(ascii_str.ascii[i]) != utf32_str.utf32[i]) return false; + if (static_cast(ascii_str.ascii[i]) != utf32_str.utf32[i]) + return false; } return true; } - friend bool operator!=(const String &a, const String &b) noexcept { return !(a == b); } + friend bool operator!=(const String &a, const String &b) noexcept + { + return !(a == b); + } // std::hash friend struct std::hash; @@ -602,34 +698,41 @@ namespace Fig::Deps { assert(i < _length); - if (mode == Mode::ASCII_SSO) return static_cast(sso[i]); - if (mode == Mode::ASCII_HEP) return static_cast(ascii[i]); + if (mode == Mode::ASCII_SSO) + return static_cast(sso[i]); + if (mode == Mode::ASCII_HEP) + return static_cast(ascii[i]); return utf32[i]; } u32 at(size_t i) const { - if (i >= _length) throw std::out_of_range("String::at"); + if (i >= _length) + throw std::out_of_range("String::at"); return (*this)[i]; } bool starts_with(const String &prefix) const { - if (prefix._length > _length) return false; + if (prefix._length > _length) + return false; for (size_t i = 0; i < prefix._length; ++i) - if ((*this)[i] != prefix[i]) return false; + if ((*this)[i] != prefix[i]) + return false; return true; } bool ends_with(const String &suffix) const { - if (suffix._length > _length) return false; + if (suffix._length > _length) + return false; size_t offset = _length - suffix._length; for (size_t i = 0; i < suffix._length; ++i) - if ((*this)[offset + i] != suffix[i]) return false; + if ((*this)[offset + i] != suffix[i]) + return false; return true; } @@ -639,27 +742,32 @@ namespace Fig::Deps if (mode == Mode::ASCII_SSO) { for (size_t i = 0; i < _length; ++i) - if (sso[i] == cp) return true; + if (sso[i] == cp) + return true; return false; } if (mode == Mode::ASCII_HEP) { - if (cp >= 128) return false; + if (cp >= 128) + return false; for (unsigned char c : ascii) - if (c == cp) return true; + if (c == cp) + return true; return false; } for (u32 c : utf32) - if (c == cp) return true; + if (c == cp) + return true; return false; } String substr(size_t pos, size_t count = size_t(-1)) const { - if (pos >= _length) return String(); + if (pos >= _length) + return String(); size_t len = (_length - pos < count) ? (_length - pos) : count; @@ -710,7 +818,8 @@ namespace Fig::Deps String &erase(size_t pos, size_t count = size_t(-1)) { - if (pos >= _length) return *this; + if (pos >= _length) + return *this; size_t len = (_length - pos < count) ? (_length - pos) : count; @@ -735,8 +844,10 @@ namespace Fig::Deps String &insert(size_t pos, const String &other) { - if (pos > _length) pos = _length; - if (other._length == 0) return *this; + if (pos > _length) + pos = _length; + if (other._length == 0) + return *this; bool this_ascii = (mode != Mode::UTF32_HEP); bool other_ascii = (other.mode != Mode::UTF32_HEP); @@ -759,7 +870,8 @@ namespace Fig::Deps return *this; } - if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); + if (mode == Mode::ASCII_SSO) + promote_sso_ascii_to_heap(); if (other.mode == Mode::ASCII_SSO) ascii.insert(ascii.begin() + pos, other.sso, other.sso + other._length); @@ -794,17 +906,21 @@ namespace Fig::Deps { u32 a = (*this)[i]; u32 b = other[i]; - if (a != b) return (a < b) ? -1 : 1; + if (a != b) + return (a < b) ? -1 : 1; } - if (_length == other._length) return 0; + if (_length == other._length) + return 0; return (_length < other._length) ? -1 : 1; } size_t find(const String &needle, size_t pos = 0) const { - if (needle._length == 0) return pos <= _length ? pos : size_t(-1); - if (needle._length > _length || pos >= _length) return size_t(-1); + if (needle._length == 0) + return pos <= _length ? pos : size_t(-1); + if (needle._length > _length || pos >= _length) + return size_t(-1); size_t limit = _length - needle._length; @@ -812,9 +928,11 @@ namespace Fig::Deps { size_t j = 0; for (; j < needle._length; ++j) - if ((*this)[i + j] != needle[j]) break; + if ((*this)[i + j] != needle[j]) + break; - if (j == needle._length) return i; + if (j == needle._length) + return i; } return size_t(-1); @@ -822,16 +940,20 @@ namespace Fig::Deps size_t rfind(const String &needle) const { - if (needle._length == 0) return _length; - if (needle._length > _length) return size_t(-1); + if (needle._length == 0) + return _length; + if (needle._length > _length) + return size_t(-1); for (size_t i = _length - needle._length + 1; i-- > 0;) { size_t j = 0; for (; j < needle._length; ++j) - if ((*this)[i + j] != needle[j]) break; + if ((*this)[i + j] != needle[j]) + break; - if (j == needle._length) return i; + if (j == needle._length) + return i; } return size_t(-1); @@ -839,7 +961,8 @@ namespace Fig::Deps String &replace(size_t pos, size_t len, const String &repl) { - if (pos >= _length) return *this; + if (pos >= _length) + return *this; size_t erase_len = (_length - pos < len) ? (_length - pos) : len; @@ -865,7 +988,8 @@ namespace Fig::Deps return *this; } - if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); + if (mode == Mode::ASCII_SSO) + promote_sso_ascii_to_heap(); ascii.erase(ascii.begin() + pos, ascii.begin() + pos + erase_len); @@ -948,7 +1072,10 @@ namespace std struct std::formatter { // 不支持自定义格式说明符 - constexpr auto parse(std::format_parse_context &ctx) { return ctx.begin(); } + constexpr auto parse(std::format_parse_context &ctx) + { + return ctx.begin(); + } template auto format(const Fig::Deps::String &s, FormatContext &ctx) const diff --git a/src/Error/Error.cpp b/src/Error/Error.cpp index 52742bd..551f592 100644 --- a/src/Error/Error.cpp +++ b/src/Error/Error.cpp @@ -36,8 +36,14 @@ namespace Fig case ErrorType::MayBeNull: return "MaybeNull"; case ErrorType::UnterminatedString: return "UnterminatedString"; + case ErrorType::UnterminatedComments: return "UnterminatedComments"; + case ErrorType::InvalidNumberLiteral: return "InvalidNumberLiteral"; + case ErrorType::InvalidCharacter: return "InvalidCharacter"; + case Fig::ErrorType::InvalidSymbol: return "InvalidSymbol"; + case ErrorType::SyntaxError: return "SyntaxError"; - default: return "Some one forgot to add case to `ErrorTypeToString`"; + + // default: return "Some one forgot to add case to `ErrorTypeToString`"; } } @@ -76,8 +82,9 @@ namespace Fig const SourceLocation &location = error.location; - err << TC::DarkGray << " ┌─> " << TC::Cyan << location.fileName << " " << TC::DarkGray << location.sp.line - << ":" << location.sp.column << '\n'; + err << TC::DarkGray << " ┌─> Fn " << TC::Cyan << '\'' << location.packageName << '.' << location.functionName + << '\'' << " " << location.fileName << " (" << TC::DarkGray << location.sp.line << ":" << location.sp.column + << TC::Cyan << ')' << TC::Reset << '\n'; err << TC::DarkGray << " │" << '\n' << " │" << TC::Reset << '\n'; // 尝试打印上3行 下2行 diff --git a/src/Error/Error.hpp b/src/Error/Error.hpp index 97fa942..75748ca 100644 --- a/src/Error/Error.hpp +++ b/src/Error/Error.hpp @@ -20,6 +20,11 @@ namespace Fig MayBeNull = 1001, UnterminatedString = 2001, + UnterminatedComments, + InvalidNumberLiteral, + InvalidCharacter, + InvalidSymbol, + SyntaxError, }; diff --git a/src/Lexer/Lexer.cpp b/src/Lexer/Lexer.cpp index 1f26d28..290a7c9 100644 --- a/src/Lexer/Lexer.cpp +++ b/src/Lexer/Lexer.cpp @@ -2,5 +2,280 @@ namespace Fig { - -}; \ No newline at end of file + /* + 总则: + Lexer不涉及语义部分,语义为Parser及之后的部分确定! + 确定边界 --> 分词 + 无法确定 --> 错误的源,报错 + + */ + + Result Lexer::scanComments() + { + Token tok(rd.currentIndex(), 2, TokenType::Comments); + rd.skip(2); // 跳过 // + do + { + tok.length++; + if (rd.current() == U'\n') + { + rd.next(); // skip '\n' + break; + } + rd.next(); + } while (rd.hasNext()); + return tok; + } + + Result Lexer::scanMultilineComments() + { + Token tok(rd.currentIndex(), 2, TokenType::Comments); + SourcePosition startPos = rd.currentPosition(); + rd.skip(2); // 跳过 / * + while (true) + { + if (rd.isAtEnd()) + { + return std::unexpected(Error(ErrorType::UnterminatedComments, + "unterminated multiline comments", + "insert '*/'", + makeSourceLocation(startPos))); + } + if (rd.current() == U'*' && rd.peekIf() == U'/') + { + rd.skip(2); + break; + } + tok.length++; + rd.next(); + } + return tok; + } + + Result Lexer::scanIdentifierOrKeyword() + { + Token tok(rd.currentIndex(), 1, TokenType::Identifier); + String value; // 用于判断是标识符还是关键字 + value.push_back(rd.produce()); // 加入第一个 + + while (CharUtils::isIdentifierContinue(rd.current())) // continue: _ / 0-9 / aA - zZ + { + tok.length++; + value.push_back(rd.produce()); + if (rd.isAtEnd()) + { + break; + } + } + + if (Token::keywordMap.contains(value)) + { + tok.type = Token::keywordMap.at(value); + } + return tok; + } + + Result Lexer::scanNumberLiteral() + { + Token tok(rd.currentIndex(), 0, TokenType::LiteralNumber); + state = State::ScanDec; + + if (rd.current() == U'0') + { + char32_t _peek = std::tolower(rd.peekIf()); + if (_peek == U'b') + { + state = State::ScanBin; + rd.skip(2); // 跳过 0b + tok.length += 2; + } + else if (_peek == U'x') + { + state = State::ScanHex; + rd.skip(2); // 跳过 0x + tok.length += 2; + } + // else + // { + // return std::unexpected(Error(ErrorType::InvalidNumberLiteral, + // std::format("bad number postfix 0{}", String(_peek)), + // "correct it", + // makeSourceLocation(rd.currentPosition()))); + + // } + } + + do + { + char32_t current = rd.current(); + if (state == State::ScanDec && !CharUtils::isDigit(current)) + { + break; + } + if (state == State::ScanHex && !CharUtils::isHexDigit(current)) + { + break; + } + if (state == State::ScanBin && current != U'0' && current != U'1') + { + // return std::unexpected( + // Error(ErrorType::InvalidNumberLiteral, + // std::format("invalid binary number literal, scanning '{}'", String(¤t)), + // "correct it", + // makeSourceLocation(rd.currentPosition()))); + break; + } + tok.length++; + rd.next(); + } while (!rd.isAtEnd()); + + // 科学计数法 + while (!rd.isAtEnd() && state == State::ScanDec + && (rd.current() == U'e' || rd.current() == U'E' || rd.current() == U'_' || rd.current() == U'+' + || rd.current() == U'-' || CharUtils::isDigit(rd.current()))) + { + tok.length++; + rd.next(); + } + + return tok; + } + Result Lexer::scanStringLiteral() + { + state = (rd.current() == U'"' ? State::ScanStringDQ : Lexer::State::ScanStringSQ); + + SourcePosition startPos = rd.currentPosition(); + + rd.next(); // skip " / ' + Token tok(rd.currentIndex(), 0, TokenType::LiteralString); + + while (true) + { + if (state == State::ScanStringDQ && rd.current() == U'"') + { + rd.next(); // skip '"' + break; + } + else if (state == State::ScanStringSQ && rd.current() == U'\'') + { + rd.next(); // skip `'` + break; + } + else if (rd.isAtEnd()) + { + return std::unexpected( + Error(ErrorType::UnterminatedString, + "unterminated string literal", + std::format("insert '{}'", String((state == State::ScanStringDQ ? "\"" : "'"))), + makeSourceLocation(startPos))); + } + else + { + tok.length++; + rd.next(); + } + } + return tok; + } + Result Lexer::scanPunct() + { + Token tok(rd.currentIndex(), 0, TokenType::Illegal); + + auto startsWith = [&](const String &prefix) -> bool { + for (const auto &p : Token::punctMap) + { + const String &op = p.first; + if (op.starts_with(prefix)) + return true; + } + return false; + }; + + String sym; + + do + { + String candidate = sym + rd.current(); + if (startsWith(candidate)) + { + rd.next(); + tok.length++; + sym = candidate; + } + else + { + break; + } + } while (!rd.isAtEnd() && CharUtils::isPunct(rd.current())); + + if (!Token::punctMap.contains(sym)) + { + return std::unexpected(Error(ErrorType::InvalidSymbol, + std::format("invalid symbol `{}`", sym), + "correct it", + makeSourceLocation(rd.currentPosition()))); + } + tok.type = Token::punctMap.at(sym); + return tok; + } + + void Lexer::skipWhitespaces() + { + while (!rd.isAtEnd()) + { + char32_t current = rd.current(); + if (current == EOF || !CharUtils::isAsciiSpace(current)) // 检查 EOF + break; + rd.next(); + } + } + + Result Lexer::NextToken() + { + if (rd.isAtEnd()) + { + return Token(rd.currentIndex(), 0, TokenType::EndOfFile); + } + if (rd.current() == U'\0') + { + return Token(rd.currentIndex(), 1, TokenType::EndOfFile); + } + if (rd.current() == U'/' && rd.peekIf() == U'/') + { + return scanComments(); + } + else if (rd.current() == U'/' && rd.peekIf() == U'*') + { + return scanMultilineComments(); + } + else if (CharUtils::isIdentifierStart(rd.current())) + { + return scanIdentifierOrKeyword(); + } + else if (CharUtils::isDigit(rd.current())) + { + return scanNumberLiteral(); + } + else if (rd.current() == U'"' || rd.current() == U'\'') + { + return scanStringLiteral(); + } + else if (CharUtils::isPunct(rd.current())) + { + return scanPunct(); + } + else if (CharUtils::isSpace(rd.current())) + { + skipWhitespaces(); + return NextToken(); + } + else + { + return std::unexpected(Error( + ErrorType::InvalidCharacter, + std::format("invalid character '{}' (U+{})", String(rd.current()), static_cast(rd.current())), + "correct it", + makeSourceLocation(rd.currentPosition()))); + } + } +}; // namespace Fig \ No newline at end of file diff --git a/src/Lexer/Lexer.hpp b/src/Lexer/Lexer.hpp index 842f958..b29b8bc 100644 --- a/src/Lexer/Lexer.hpp +++ b/src/Lexer/Lexer.hpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace Fig { @@ -25,13 +26,13 @@ namespace Fig SourceReader() { index = 0; - pos.line = pos.column = 0; + pos.line = pos.column = 1; } SourceReader(const String &_source) // copy { source = _source; index = 0; - pos.line = pos.column = 0; + pos.line = pos.column = 1; } SourcePosition ¤tPosition() { return pos; } @@ -42,7 +43,16 @@ namespace Fig return source[index]; } - inline bool hasNext() const { return index < source.length(); } + inline char32_t currentIf() const + { + if (index >= source.length()) + { + return U'\0'; + } + return source[index]; + } + + inline bool hasNext() const { return index < source.length() - 1; } inline char32_t peek() const { @@ -66,10 +76,10 @@ namespace Fig inline void next() { - assert(hasNext() && "SrcReader: next failed, need more runes"); - ++index; + char32_t consumed = currentIf(); - if (current() == U'\n') + ++index; + if (consumed == U'\n') { ++pos.line; pos.column = 1; @@ -80,18 +90,38 @@ namespace Fig } } + inline void skip(size_t n) + { + for (size_t i = 0; i < n; ++i) { next(); } + } + inline size_t currentIndex() const { return index; } - inline bool isAtEnd() const { return index == source.length() - 1; } + inline bool isAtEnd() const { return index >= source.length(); } }; class Lexer { public: - enum State : uint8_t + enum class State : uint8_t { - Normal, - Error + Error, + Standby, + End, + + ScanComments, // 单行注释 + ScanMultilineComments, // 多行注释 + ScanIdentifier, // 关键字也算 + + ScanDec, // 十进制数字, 如 1.2 31, 3.14e+3, 1_000_0000 + ScanBin, // 二进制数字, 如 0b0001 / 0B0001 + ScanHex, // 十六进制数字, 如 0xABCD / 0XabCd + ScanStringDQ, // 双引号字符串, 如 "hello, world!" + ScanStringSQ, // 单引号字符串, 如 'hello' + ScanBool, // 布尔字面量, true / false + ScanNull, // 空值字面量, null + + ScanPunct, // 符号 }; private: @@ -99,17 +129,22 @@ namespace Fig SourceReader rd; protected: - Token scanComments(); - Token scanIdentifierOrKeyword(); + Result scanComments(); + Result scanMultilineComments(); - Token scanNumberLiteral(); - Token scanStringLiteral(); - Token scanBoolLiteral(); - Token scanLiteralNull(); + Result scanIdentifierOrKeyword(); + + Result scanNumberLiteral(); + Result scanStringLiteral(); // 支持多行 + // Result scanBoolLiteral(); 由 scanIdentifier...扫描 + // Result scanLiteralNull(); 由 scanIdentifier...扫描 + + Result scanPunct(); + + void skipWhitespaces(); - Token scanPunct(); public: - State state = Normal; + State state = State::Standby; Lexer() {} Lexer(const String &source, String _fileName) @@ -118,6 +153,12 @@ namespace Fig fileName = std::move(_fileName); } - Token NextToken(); + SourceLocation makeSourceLocation(const SourcePosition ¤t_pos) + { + return SourceLocation( + current_pos, fileName, "[internal lexer]", String(magic_enum::enum_name(state).data())); + } + + Result NextToken(); }; }; // namespace Fig \ No newline at end of file diff --git a/src/Lexer/LexerTest.cpp b/src/Lexer/LexerTest.cpp new file mode 100644 index 0000000..35d68ca --- /dev/null +++ b/src/Lexer/LexerTest.cpp @@ -0,0 +1,43 @@ +#include +#include +#include + +#include + +int main() +{ + using namespace Fig; + + String fileName = "test.fig"; + String filePath = "T:/Files/Maker/Code/MyCodingLanguage/The Fig Project/Fig/test.fig"; + + SourceManager manager(filePath); + manager.Read(); + + if (!manager.read) + { + std::cerr << "Couldn't read file"; + return 1; + } + + Lexer lexer(manager.GetSource(), fileName); + + while (true) + { + const auto &result = lexer.NextToken(); + if (!result.has_value()) + { + ReportError(result.error(), manager); + break; + } + const Token &token = *result; + const String &lexeme = manager.GetSub(token.index, token.length); + const auto &type = magic_enum::enum_name(token.type); + if (token.type == TokenType::EndOfFile) + { + std::cout << "EOF: " << type << '\n'; + break; + } + std::cout << lexeme << " --> " << type << '\n'; + } +} \ No newline at end of file diff --git a/src/SourceManager/SourceManager.hpp b/src/SourceManager/SourceManager.hpp index 655b164..fed79a3 100644 --- a/src/SourceManager/SourceManager.hpp +++ b/src/SourceManager/SourceManager.hpp @@ -52,5 +52,10 @@ namespace Fig { return source.substr(_index_start, _length); } + + const String &GetSource() const + { + return source; + } }; }; \ No newline at end of file diff --git a/src/Token/Token.cpp b/src/Token/Token.cpp index 210179c..7524959 100644 --- a/src/Token/Token.cpp +++ b/src/Token/Token.cpp @@ -2,7 +2,7 @@ namespace Fig { - const HashMap Token::symbolMap = { + const HashMap Token::punctMap = { // 三字符 {String("..."), TokenType::TripleDot}, // 双字符 @@ -83,5 +83,8 @@ namespace Fig {String("throw"), TokenType::Throw}, {String("Finally"), TokenType::Finally}, {String("as"), TokenType::As}, + {String("true"), TokenType::LiteralTrue}, + {String("false"), TokenType::LiteralFalse}, + {String("null"), TokenType::LiteralNull}, }; }; \ No newline at end of file diff --git a/src/Token/Token.hpp b/src/Token/Token.hpp index 6f94011..a62f25c 100644 --- a/src/Token/Token.hpp +++ b/src/Token/Token.hpp @@ -54,11 +54,13 @@ namespace Fig // TypeBool, // Bool // TypeDouble, // Double - /* Literal Types (not keyword)*/ + /* Literal Types */ LiteralNumber, // number (int,float...) LiteralString, // string - LiteralBool, // bool (true/false) - LiteralNull, // null (Null unique instance) + + LiteralTrue, // true <-- keyword + LiteralFalse, // false <-- keyword + LiteralNull, // null (Null unique instance) <-- keyword /* Punct */ Plus, // + @@ -121,12 +123,12 @@ namespace Fig class Token final { public: - static const HashMap symbolMap; + static const HashMap punctMap; static const HashMap keywordMap; - const size_t index, length; + size_t index, length; // 源文件中的下标 Token长度 - const TokenType type; + TokenType type; Token() : index(0), length(0), type(TokenType::Illegal) {}; Token(size_t _index, size_t _length, TokenType _type) : index(_index), length(_length), type(_type) {} @@ -138,8 +140,8 @@ namespace Fig bool isIdentifier() const { return type == TokenType::Identifier; } bool isLiteral() const { - return type == TokenType::LiteralNull || type == TokenType::LiteralBool || type == TokenType::LiteralNumber - || type == TokenType::LiteralString; + return type == TokenType::LiteralNull || type == TokenType::LiteralTrue || type == TokenType::LiteralFalse + || type == TokenType::LiteralNumber || type == TokenType::LiteralString; } }; } // namespace Fig \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 55158e0..e69de29 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,15 +0,0 @@ -#include -#include - -int main() -{ - using namespace Fig; - Error error{ErrorType::MayBeNull, - "unterminated string literal", - "terminated it", - SourceLocation{2, 4, 5, "main.cpp", "main", "main"}}; - SourceManager manager = SourceManager("T:/Files/Maker/Code/MyCodingLanguage/The Fig Project/Fig/src/main.cpp"); - manager.Read(); - - ReportError(error, manager); -} \ No newline at end of file diff --git a/xmake.lua b/xmake.lua index fe965a2..8b475d2 100644 --- a/xmake.lua +++ b/xmake.lua @@ -28,6 +28,13 @@ add_defines("__FCORE_COMPILE_TIME=\"" .. os.date("%Y-%m-%d %H:%M:%S") .. "\"") target("StringTest") add_files("src/Deps/String/StringTest.cpp") +target("LexerTest") + add_files("src/Core/*.cpp") + add_files("src/Token/Token.cpp") + add_files("src/Error/Error.cpp") + add_files("src/Lexer/Lexer.cpp") + + add_files("src/Lexer/LexerTest.cpp") target("Fig") add_files("src/Core/*.cpp") add_files("src/Token/Token.cpp")