完成Lexer实现,100%可靠

This commit is contained in:
2026-02-14 14:54:44 +08:00
parent 877253cbbc
commit 35b98c4d7f
15 changed files with 634 additions and 126 deletions

View File

@@ -33,13 +33,13 @@ AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true AllowShortCaseLabelsOnASingleLine: true
# 允许短的函数放在同一行: None, InlineOnly(定义在类中), Empty(空函数), Inline(定义在类中,空函数), All # 允许短的函数放在同一行: None, InlineOnly(定义在类中), Empty(空函数), Inline(定义在类中,空函数), All
AllowShortFunctionsOnASingleLine: Inline AllowShortFunctionsOnASingleLine: Empty
# 允许短的if语句保持在同一行 # 允许短的if语句保持在同一行
AllowShortIfStatementsOnASingleLine: true AllowShortIfStatementsOnASingleLine: false
# 允许短的循环保持在同一行 # 允许短的循环保持在同一行
AllowShortLoopsOnASingleLine: true AllowShortLoopsOnASingleLine: false
# 总是在返回类型后换行: None, All, TopLevel(顶级函数,不包括在类中的函数), # 总是在返回类型后换行: None, All, TopLevel(顶级函数,不包括在类中的函数),
# AllDefinitions(所有的定义,不包括声明), TopLevelDefinitions(所有的顶级函数的定义) # AllDefinitions(所有的定义,不包括声明), TopLevelDefinitions(所有的顶级函数的定义)

4
.gitignore vendored
View File

@@ -6,4 +6,6 @@ build/
.DS_Store .DS_Store
.vscode .vscode
.VSCodeCounter .VSCodeCounter
/test.fig

View File

@@ -1,7 +1,6 @@
#pragma once #pragma once
#include <Deps/Deps.hpp> #include <Deps/Deps.hpp>
#include <format>
namespace Fig namespace Fig
{ {

View File

@@ -3,11 +3,18 @@
#include <Core/CoreInfos.hpp> #include <Core/CoreInfos.hpp>
#include <Deps/HashMap/HashMap.hpp> #include <Deps/HashMap/HashMap.hpp>
#include <Deps/String/String.hpp> #include <Deps/String/String.hpp>
#include <Deps/String/CharUtils.hpp>
#include <expected>
namespace Fig namespace Fig
{ {
#ifdef __FCORE_LINK_DEPS #ifdef __FCORE_LINK_DEPS
using Deps::String; using Deps::String;
using Deps::HashMap; using Deps::HashMap;
using Deps::CharUtils;
template<class _Tp, class _Err>
using Result = std::expected<_Tp, _Err>;
#endif #endif
}; };

View File

@@ -17,7 +17,8 @@ namespace Fig::Deps
{ {
for (size_t i = 0; i < n; ++i) for (size_t i = 0; i < n; ++i)
{ {
if (static_cast<unsigned char>(data[i]) >= 128) return false; if (static_cast<unsigned char>(data[i]) >= 128)
return false;
} }
return true; return true;
} }
@@ -26,7 +27,8 @@ namespace Fig::Deps
{ {
for (size_t i = 0; i < n; ++i) for (size_t i = 0; i < n; ++i)
{ {
if (data[i] >= 128) return false; if (data[i] >= 128)
return false;
} }
return true; return true;
} }
@@ -100,8 +102,14 @@ namespace Fig::Deps
_length = other._length; _length = other._length;
mode = other.mode; mode = other.mode;
if (mode == Mode::ASCII_SSO) { memcpy(sso, other.sso, sizeof(unsigned char) * _length); } if (mode == Mode::ASCII_SSO)
else if (mode == Mode::ASCII_HEP) { new (&ascii) std::vector<unsigned char>(other.ascii); } {
memcpy(sso, other.sso, sizeof(unsigned char) * _length);
}
else if (mode == Mode::ASCII_HEP)
{
new (&ascii) std::vector<unsigned char>(other.ascii);
}
else else
{ {
new (&utf32) std::vector<u32>(other.utf32); new (&utf32) std::vector<u32>(other.utf32);
@@ -134,24 +142,33 @@ namespace Fig::Deps
{ {
// pass // pass
} }
if (mode == Mode::ASCII_HEP) { ascii.~vector(); } if (mode == Mode::ASCII_HEP)
if (mode == Mode::UTF32_HEP) { utf32.~vector(); } {
ascii.~vector();
}
if (mode == Mode::UTF32_HEP)
{
utf32.~vector();
}
} }
void ensure_utf32() void ensure_utf32()
{ {
if (mode == Mode::UTF32_HEP) return; if (mode == Mode::UTF32_HEP)
return;
std::vector<u32> tmp; std::vector<u32> tmp;
tmp.reserve(_length); tmp.reserve(_length);
if (mode == Mode::ASCII_SSO) if (mode == Mode::ASCII_SSO)
{ {
for (size_t i = 0; i < _length; ++i) tmp.push_back(static_cast<u32>(sso[i])); for (size_t i = 0; i < _length; ++i)
tmp.push_back(static_cast<u32>(sso[i]));
} }
else // ASCII_HEP else // ASCII_HEP
{ {
for (unsigned char c : ascii) tmp.push_back(static_cast<u32>(c)); for (unsigned char c : ascii)
tmp.push_back(static_cast<u32>(c));
} }
destroy(); destroy();
@@ -166,7 +183,8 @@ namespace Fig::Deps
std::vector<unsigned char> tmp; std::vector<unsigned char> tmp;
tmp.reserve(_length); tmp.reserve(_length);
for (size_t i = 0; i < _length; ++i) tmp.push_back(sso[i]); for (size_t i = 0; i < _length; ++i)
tmp.push_back(sso[i]);
mode = Mode::ASCII_HEP; mode = Mode::ASCII_HEP;
new (&ascii) std::vector<unsigned char>(std::move(tmp)); new (&ascii) std::vector<unsigned char>(std::move(tmp));
@@ -223,7 +241,8 @@ namespace Fig::Deps
{ {
assert(data); assert(data);
size_t n = 0; size_t n = 0;
while (data[n] != 0) ++n; while (data[n] != 0)
++n;
init(data, n); init(data, n);
} }
@@ -236,7 +255,8 @@ namespace Fig::Deps
if (n <= SSO_SIZE && StringUtils::is_pure_ascii(data, n)) if (n <= SSO_SIZE && StringUtils::is_pure_ascii(data, n))
{ {
mode = Mode::ASCII_SSO; mode = Mode::ASCII_SSO;
for (size_t i = 0; i < n; ++i) sso[i] = static_cast<unsigned char>(data[i]); for (size_t i = 0; i < n; ++i)
sso[i] = static_cast<unsigned char>(data[i]);
return; return;
} }
@@ -245,7 +265,8 @@ namespace Fig::Deps
mode = Mode::ASCII_HEP; mode = Mode::ASCII_HEP;
new (&ascii) std::vector<unsigned char>(); new (&ascii) std::vector<unsigned char>();
ascii.reserve(n); ascii.reserve(n);
for (size_t i = 0; i < n; ++i) ascii.push_back(static_cast<unsigned char>(data[i])); for (size_t i = 0; i < n; ++i)
ascii.push_back(static_cast<unsigned char>(data[i]));
return; return;
} }
@@ -255,10 +276,19 @@ namespace Fig::Deps
} }
public: public:
size_t length() const noexcept { return _length; } size_t length() const noexcept
size_t size() const noexcept { return _length; } {
return _length;
}
size_t size() const noexcept
{
return _length;
}
bool empty() const noexcept { return _length == 0; } bool empty() const noexcept
{
return _length == 0;
}
void reserve(size_t n) void reserve(size_t n)
{ {
if (mode == Mode::ASCII_HEP) if (mode == Mode::ASCII_HEP)
@@ -274,7 +304,10 @@ namespace Fig::Deps
{ {
// pass // pass
} }
if (mode == Mode::ASCII_HEP) { ascii.clear(); } if (mode == Mode::ASCII_HEP)
{
ascii.clear();
}
else else
{ {
utf32.clear(); utf32.clear();
@@ -283,34 +316,71 @@ namespace Fig::Deps
void shrink_to_fit() noexcept void shrink_to_fit() noexcept
{ {
if (mode == Mode::ASCII_HEP) { ascii.shrink_to_fit(); } if (mode == Mode::ASCII_HEP)
{
ascii.shrink_to_fit();
}
else else
{ {
utf32.shrink_to_fit(); utf32.shrink_to_fit();
} }
} }
~String() noexcept { destroy(); } ~String() noexcept
{
destroy();
}
String() noexcept String() noexcept
{ {
mode = Mode::ASCII_SSO; mode = Mode::ASCII_SSO;
_length = 0; _length = 0;
} }
String(const String &other) noexcept { copyfrom(other); } String(const String &other) noexcept
String(String &&other) noexcept { movefrom(std::move(other)); } {
String(const char *str) { init(str); } copyfrom(other);
String(const char32_t *str) { init(str); } }
String(const std::string &s) { init(s.data(), s.size()); } String(String &&other) noexcept
{
movefrom(std::move(other));
}
String(const char *str)
{
init(str);
}
String(const char32_t *str)
{
init(str);
}
String(char32_t c)
{
init("");
push_back(c);
}
String(char c)
{
init("");
push_back(static_cast<char32_t>(c));
}
String(const std::string &s)
{
init(s.data(), s.size());
}
static String fromPureAscii(const char *str) static String fromPureAscii(const char *str)
{ {
String string; String string;
string._length = std::strlen(str); string._length = std::strlen(str);
if (string._length <= SSO_SIZE) { memcpy(string.sso, str, string._length); } if (string._length <= SSO_SIZE)
{
memcpy(string.sso, str, string._length);
}
else else
{ {
string.ascii.reserve(string._length); string.ascii.reserve(string._length);
for (size_t i = 0; i < string._length; ++i) { string.ascii.push_back(str[i]); } for (size_t i = 0; i < string._length; ++i)
{
string.ascii.push_back(str[i]);
}
} }
return string; return string;
@@ -328,13 +398,15 @@ namespace Fig::Deps
String &operator=(String &&other) noexcept String &operator=(String &&other) noexcept
{ {
if (this != &other) movefrom(std::move(other)); if (this != &other)
movefrom(std::move(other));
return *this; return *this;
} }
String &operator+=(const String &rhs) String &operator+=(const String &rhs)
{ {
if (rhs._length == 0) return *this; if (rhs._length == 0)
return *this;
// 两边都是 ASCII // 两边都是 ASCII
bool this_ascii = (mode == Mode::ASCII_SSO || mode == Mode::ASCII_HEP); bool this_ascii = (mode == Mode::ASCII_SSO || mode == Mode::ASCII_HEP);
@@ -356,7 +428,8 @@ namespace Fig::Deps
return *this; return *this;
} }
if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); if (mode == Mode::ASCII_SSO)
promote_sso_ascii_to_heap();
// 追加 // 追加
if (rhs.mode == Mode::ASCII_SSO) if (rhs.mode == Mode::ASCII_SSO)
@@ -377,11 +450,13 @@ namespace Fig::Deps
if (mode == Mode::ASCII_SSO) if (mode == Mode::ASCII_SSO)
{ {
for (size_t i = 0; i < _length; ++i) tmp.push_back(static_cast<u32>(sso[i])); for (size_t i = 0; i < _length; ++i)
tmp.push_back(static_cast<u32>(sso[i]));
} }
else // ASCII_HEP else // ASCII_HEP
{ {
for (unsigned char c : ascii) tmp.push_back(static_cast<u32>(c)); for (unsigned char c : ascii)
tmp.push_back(static_cast<u32>(c));
} }
destroy(); destroy();
@@ -389,14 +464,19 @@ namespace Fig::Deps
new (&utf32) std::vector<u32>(std::move(tmp)); new (&utf32) std::vector<u32>(std::move(tmp));
} }
if (rhs.mode == Mode::UTF32_HEP) { utf32.insert(utf32.end(), rhs.utf32.begin(), rhs.utf32.end()); } if (rhs.mode == Mode::UTF32_HEP)
{
utf32.insert(utf32.end(), rhs.utf32.begin(), rhs.utf32.end());
}
else if (rhs.mode == Mode::ASCII_SSO) else if (rhs.mode == Mode::ASCII_SSO)
{ {
for (size_t i = 0; i < rhs._length; ++i) utf32.push_back(static_cast<u32>(rhs.sso[i])); for (size_t i = 0; i < rhs._length; ++i)
utf32.push_back(static_cast<u32>(rhs.sso[i]));
} }
else // ASCII_HEP else // ASCII_HEP
{ {
for (unsigned char c : rhs.ascii) utf32.push_back(static_cast<u32>(c)); for (unsigned char c : rhs.ascii)
utf32.push_back(static_cast<u32>(c));
} }
_length = utf32.size(); _length = utf32.size();
@@ -425,7 +505,8 @@ namespace Fig::Deps
return; return;
} }
if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); if (mode == Mode::ASCII_SSO)
promote_sso_ascii_to_heap();
if (mode == Mode::ASCII_HEP) if (mode == Mode::ASCII_HEP)
{ {
@@ -477,7 +558,8 @@ namespace Fig::Deps
String &append(size_t count, u32 cp) String &append(size_t count, u32 cp)
{ {
for (size_t i = 0; i < count; ++i) push_back(cp); for (size_t i = 0; i < count; ++i)
push_back(cp);
return *this; return *this;
} }
@@ -523,7 +605,10 @@ namespace Fig::Deps
// UTF32_HEP -> UTF-8 encode // UTF32_HEP -> UTF-8 encode
for (u32 cp : utf32) for (u32 cp : utf32)
{ {
if (cp <= 0x7F) { out.push_back(static_cast<char>(cp)); } if (cp <= 0x7F)
{
out.push_back(static_cast<char>(cp));
}
else if (cp <= 0x7FF) else if (cp <= 0x7FF)
{ {
out.push_back(static_cast<char>(0xC0 | (cp >> 6))); out.push_back(static_cast<char>(0xC0 | (cp >> 6)));
@@ -547,18 +632,24 @@ namespace Fig::Deps
return out; return out;
} }
friend std::ostream &operator<<(std::ostream &os, const String &s) { return os << s.toStdString(); } friend std::ostream &operator<<(std::ostream &os, const String &s)
{
return os << s.toStdString();
}
friend bool operator==(const String &a, const String &b) noexcept friend bool operator==(const String &a, const String &b) noexcept
{ {
if (a._length != b._length) return false; if (a._length != b._length)
return false;
// 同模式 // 同模式
if (a.mode == b.mode) if (a.mode == b.mode)
{ {
if (a.mode == Mode::ASCII_SSO) return std::memcmp(a.sso, b.sso, a._length) == 0; if (a.mode == Mode::ASCII_SSO)
return std::memcmp(a.sso, b.sso, a._length) == 0;
if (a.mode == Mode::ASCII_HEP) return a.ascii == b.ascii; if (a.mode == Mode::ASCII_HEP)
return a.ascii == b.ascii;
return a.utf32 == b.utf32; return a.utf32 == b.utf32;
} }
@@ -582,18 +673,23 @@ namespace Fig::Deps
if (ascii_str.mode == Mode::ASCII_SSO) if (ascii_str.mode == Mode::ASCII_SSO)
{ {
for (size_t i = 0; i < ascii_str._length; ++i) for (size_t i = 0; i < ascii_str._length; ++i)
if (static_cast<u32>(ascii_str.sso[i]) != utf32_str.utf32[i]) return false; if (static_cast<u32>(ascii_str.sso[i]) != utf32_str.utf32[i])
return false;
} }
else else
{ {
for (size_t i = 0; i < ascii_str._length; ++i) for (size_t i = 0; i < ascii_str._length; ++i)
if (static_cast<u32>(ascii_str.ascii[i]) != utf32_str.utf32[i]) return false; if (static_cast<u32>(ascii_str.ascii[i]) != utf32_str.utf32[i])
return false;
} }
return true; return true;
} }
friend bool operator!=(const String &a, const String &b) noexcept { return !(a == b); } friend bool operator!=(const String &a, const String &b) noexcept
{
return !(a == b);
}
// std::hash // std::hash
friend struct std::hash<String>; friend struct std::hash<String>;
@@ -602,34 +698,41 @@ namespace Fig::Deps
{ {
assert(i < _length); assert(i < _length);
if (mode == Mode::ASCII_SSO) return static_cast<u32>(sso[i]); if (mode == Mode::ASCII_SSO)
if (mode == Mode::ASCII_HEP) return static_cast<u32>(ascii[i]); return static_cast<u32>(sso[i]);
if (mode == Mode::ASCII_HEP)
return static_cast<u32>(ascii[i]);
return utf32[i]; return utf32[i];
} }
u32 at(size_t i) const u32 at(size_t i) const
{ {
if (i >= _length) throw std::out_of_range("String::at"); if (i >= _length)
throw std::out_of_range("String::at");
return (*this)[i]; return (*this)[i];
} }
bool starts_with(const String &prefix) const bool starts_with(const String &prefix) const
{ {
if (prefix._length > _length) return false; if (prefix._length > _length)
return false;
for (size_t i = 0; i < prefix._length; ++i) for (size_t i = 0; i < prefix._length; ++i)
if ((*this)[i] != prefix[i]) return false; if ((*this)[i] != prefix[i])
return false;
return true; return true;
} }
bool ends_with(const String &suffix) const bool ends_with(const String &suffix) const
{ {
if (suffix._length > _length) return false; if (suffix._length > _length)
return false;
size_t offset = _length - suffix._length; size_t offset = _length - suffix._length;
for (size_t i = 0; i < suffix._length; ++i) for (size_t i = 0; i < suffix._length; ++i)
if ((*this)[offset + i] != suffix[i]) return false; if ((*this)[offset + i] != suffix[i])
return false;
return true; return true;
} }
@@ -639,27 +742,32 @@ namespace Fig::Deps
if (mode == Mode::ASCII_SSO) if (mode == Mode::ASCII_SSO)
{ {
for (size_t i = 0; i < _length; ++i) for (size_t i = 0; i < _length; ++i)
if (sso[i] == cp) return true; if (sso[i] == cp)
return true;
return false; return false;
} }
if (mode == Mode::ASCII_HEP) if (mode == Mode::ASCII_HEP)
{ {
if (cp >= 128) return false; if (cp >= 128)
return false;
for (unsigned char c : ascii) for (unsigned char c : ascii)
if (c == cp) return true; if (c == cp)
return true;
return false; return false;
} }
for (u32 c : utf32) for (u32 c : utf32)
if (c == cp) return true; if (c == cp)
return true;
return false; return false;
} }
String substr(size_t pos, size_t count = size_t(-1)) const String substr(size_t pos, size_t count = size_t(-1)) const
{ {
if (pos >= _length) return String(); if (pos >= _length)
return String();
size_t len = (_length - pos < count) ? (_length - pos) : count; size_t len = (_length - pos < count) ? (_length - pos) : count;
@@ -710,7 +818,8 @@ namespace Fig::Deps
String &erase(size_t pos, size_t count = size_t(-1)) String &erase(size_t pos, size_t count = size_t(-1))
{ {
if (pos >= _length) return *this; if (pos >= _length)
return *this;
size_t len = (_length - pos < count) ? (_length - pos) : count; size_t len = (_length - pos < count) ? (_length - pos) : count;
@@ -735,8 +844,10 @@ namespace Fig::Deps
String &insert(size_t pos, const String &other) String &insert(size_t pos, const String &other)
{ {
if (pos > _length) pos = _length; if (pos > _length)
if (other._length == 0) return *this; pos = _length;
if (other._length == 0)
return *this;
bool this_ascii = (mode != Mode::UTF32_HEP); bool this_ascii = (mode != Mode::UTF32_HEP);
bool other_ascii = (other.mode != Mode::UTF32_HEP); bool other_ascii = (other.mode != Mode::UTF32_HEP);
@@ -759,7 +870,8 @@ namespace Fig::Deps
return *this; return *this;
} }
if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); if (mode == Mode::ASCII_SSO)
promote_sso_ascii_to_heap();
if (other.mode == Mode::ASCII_SSO) if (other.mode == Mode::ASCII_SSO)
ascii.insert(ascii.begin() + pos, other.sso, other.sso + other._length); ascii.insert(ascii.begin() + pos, other.sso, other.sso + other._length);
@@ -794,17 +906,21 @@ namespace Fig::Deps
{ {
u32 a = (*this)[i]; u32 a = (*this)[i];
u32 b = other[i]; u32 b = other[i];
if (a != b) return (a < b) ? -1 : 1; if (a != b)
return (a < b) ? -1 : 1;
} }
if (_length == other._length) return 0; if (_length == other._length)
return 0;
return (_length < other._length) ? -1 : 1; return (_length < other._length) ? -1 : 1;
} }
size_t find(const String &needle, size_t pos = 0) const size_t find(const String &needle, size_t pos = 0) const
{ {
if (needle._length == 0) return pos <= _length ? pos : size_t(-1); if (needle._length == 0)
if (needle._length > _length || pos >= _length) return size_t(-1); return pos <= _length ? pos : size_t(-1);
if (needle._length > _length || pos >= _length)
return size_t(-1);
size_t limit = _length - needle._length; size_t limit = _length - needle._length;
@@ -812,9 +928,11 @@ namespace Fig::Deps
{ {
size_t j = 0; size_t j = 0;
for (; j < needle._length; ++j) for (; j < needle._length; ++j)
if ((*this)[i + j] != needle[j]) break; if ((*this)[i + j] != needle[j])
break;
if (j == needle._length) return i; if (j == needle._length)
return i;
} }
return size_t(-1); return size_t(-1);
@@ -822,16 +940,20 @@ namespace Fig::Deps
size_t rfind(const String &needle) const size_t rfind(const String &needle) const
{ {
if (needle._length == 0) return _length; if (needle._length == 0)
if (needle._length > _length) return size_t(-1); return _length;
if (needle._length > _length)
return size_t(-1);
for (size_t i = _length - needle._length + 1; i-- > 0;) for (size_t i = _length - needle._length + 1; i-- > 0;)
{ {
size_t j = 0; size_t j = 0;
for (; j < needle._length; ++j) for (; j < needle._length; ++j)
if ((*this)[i + j] != needle[j]) break; if ((*this)[i + j] != needle[j])
break;
if (j == needle._length) return i; if (j == needle._length)
return i;
} }
return size_t(-1); return size_t(-1);
@@ -839,7 +961,8 @@ namespace Fig::Deps
String &replace(size_t pos, size_t len, const String &repl) String &replace(size_t pos, size_t len, const String &repl)
{ {
if (pos >= _length) return *this; if (pos >= _length)
return *this;
size_t erase_len = (_length - pos < len) ? (_length - pos) : len; size_t erase_len = (_length - pos < len) ? (_length - pos) : len;
@@ -865,7 +988,8 @@ namespace Fig::Deps
return *this; return *this;
} }
if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); if (mode == Mode::ASCII_SSO)
promote_sso_ascii_to_heap();
ascii.erase(ascii.begin() + pos, ascii.begin() + pos + erase_len); ascii.erase(ascii.begin() + pos, ascii.begin() + pos + erase_len);
@@ -948,7 +1072,10 @@ namespace std
struct std::formatter<Fig::Deps::String, char> struct std::formatter<Fig::Deps::String, char>
{ {
// 不支持自定义格式说明符 // 不支持自定义格式说明符
constexpr auto parse(std::format_parse_context &ctx) { return ctx.begin(); } constexpr auto parse(std::format_parse_context &ctx)
{
return ctx.begin();
}
template <typename FormatContext> template <typename FormatContext>
auto format(const Fig::Deps::String &s, FormatContext &ctx) const auto format(const Fig::Deps::String &s, FormatContext &ctx) const

View File

@@ -36,8 +36,14 @@ namespace Fig
case ErrorType::MayBeNull: return "MaybeNull"; case ErrorType::MayBeNull: return "MaybeNull";
case ErrorType::UnterminatedString: return "UnterminatedString"; case ErrorType::UnterminatedString: return "UnterminatedString";
case ErrorType::UnterminatedComments: return "UnterminatedComments";
case ErrorType::InvalidNumberLiteral: return "InvalidNumberLiteral";
case ErrorType::InvalidCharacter: return "InvalidCharacter";
case Fig::ErrorType::InvalidSymbol: return "InvalidSymbol";
case ErrorType::SyntaxError: return "SyntaxError"; case ErrorType::SyntaxError: return "SyntaxError";
default: return "Some one forgot to add case to `ErrorTypeToString`";
// default: return "Some one forgot to add case to `ErrorTypeToString`";
} }
} }
@@ -76,8 +82,9 @@ namespace Fig
const SourceLocation &location = error.location; const SourceLocation &location = error.location;
err << TC::DarkGray << " ┌─> " << TC::Cyan << location.fileName << " " << TC::DarkGray << location.sp.line err << TC::DarkGray << " ┌─> Fn " << TC::Cyan << '\'' << location.packageName << '.' << location.functionName
<< ":" << location.sp.column << '\n'; << '\'' << " " << location.fileName << " (" << TC::DarkGray << location.sp.line << ":" << location.sp.column
<< TC::Cyan << ')' << TC::Reset << '\n';
err << TC::DarkGray << "" << '\n' << "" << TC::Reset << '\n'; err << TC::DarkGray << "" << '\n' << "" << TC::Reset << '\n';
// 尝试打印上3行 下2行 // 尝试打印上3行 下2行

View File

@@ -20,6 +20,11 @@ namespace Fig
MayBeNull = 1001, MayBeNull = 1001,
UnterminatedString = 2001, UnterminatedString = 2001,
UnterminatedComments,
InvalidNumberLiteral,
InvalidCharacter,
InvalidSymbol,
SyntaxError, SyntaxError,
}; };

View File

@@ -2,5 +2,280 @@
namespace Fig namespace Fig
{ {
/*
}; 总则:
Lexer不涉及语义部分,语义由Parser及之后的部分确定
确定边界 --> 分词
无法确定 --> 错误的源,报错
*/
/// Scans a single-line `//` comment that starts at the reader's current position.
///
/// The returned token starts at the first '/' and spans the comment text up to and
/// including the terminating '\n' when one is present. This function always returns
/// a token.
Result<Token, Error> Lexer::scanComments()
{
    Token tok(rd.currentIndex(), 2, TokenType::Comments);
    rd.skip(2); // skip the leading "//"

    // Check isAtEnd() before every read: a "//" that closes the source must not read
    // past the end, and the very last character of the source still belongs to the
    // comment (hasNext() would stop one character early).
    while (!rd.isAtEnd())
    {
        const char32_t ch = rd.current();
        tok.length++;
        rd.next();
        if (ch == U'\n')
        {
            break; // the newline is consumed and counted as part of the comment
        }
    }
    return tok;
}
/// Scans a `/* ... */` comment that starts at the reader's current position.
///
/// @return A Comments token spanning the whole comment, both delimiters included,
///         or an UnterminatedComments error located at the opening delimiter when
///         the source ends before `*/` is found.
Result<Token, Error> Lexer::scanMultilineComments()
{
    Token tok(rd.currentIndex(), 2, TokenType::Comments);
    const SourcePosition startPos = rd.currentPosition(); // remembered for the error report
    rd.skip(2);                                           // skip the opening "/*"

    while (!rd.isAtEnd())
    {
        if (rd.current() == U'*' && rd.peekIf() == U'/')
        {
            rd.skip(2);
            tok.length += 2; // the closing "*/" is part of the comment lexeme too
            return tok;
        }
        tok.length++;
        rd.next();
    }

    return std::unexpected(Error(ErrorType::UnterminatedComments,
                                 "unterminated multiline comments",
                                 "insert '*/'",
                                 makeSourceLocation(startPos)));
}
/// Scans an identifier and reclassifies it as a keyword when its spelling is found
/// in Token::keywordMap.
///
/// The caller must have verified that the current character can start an identifier.
/// This function always returns a token.
Result<Token, Error> Lexer::scanIdentifierOrKeyword()
{
    Token tok(rd.currentIndex(), 1, TokenType::Identifier);
    String value;                  // collected spelling, used for the keyword lookup
    value.push_back(rd.produce()); // first character

    // isAtEnd() is tested before rd.current() so that an identifier ending exactly
    // at the end of the source does not read past the last character.
    while (!rd.isAtEnd() && CharUtils::isIdentifierContinue(rd.current()))
    {
        tok.length++;
        value.push_back(rd.produce());
    }

    if (Token::keywordMap.contains(value))
    {
        tok.type = Token::keywordMap.at(value);
    }
    return tok;
}
/// Scans a number literal: decimal, binary (`0b`/`0B`) or hexadecimal (`0x`/`0X`).
///
/// Sets `state` to ScanDec, ScanBin or ScanHex according to the detected prefix.
/// Decimal literals may additionally contain `_` separators and an exponent part
/// (`e`/`E`, optionally followed by a sign). The lexer only delimits the literal;
/// it does not validate it. This function always returns a token.
///
/// NOTE(review): '.' is not consumed here, so `1.2` is split at the dot — confirm
/// whether fractional literals are meant to be assembled by a later stage.
Result<Token, Error> Lexer::scanNumberLiteral()
{
    Token tok(rd.currentIndex(), 0, TokenType::LiteralNumber);
    state = State::ScanDec;

    if (rd.current() == U'0')
    {
        // Compare both cases explicitly: std::tolower is undefined for values that
        // do not fit in unsigned char, and peekIf() yields an arbitrary code point.
        const char32_t marker = rd.peekIf();
        if (marker == U'b' || marker == U'B')
        {
            state = State::ScanBin;
            rd.skip(2); // skip "0b"
            tok.length += 2;
        }
        else if (marker == U'x' || marker == U'X')
        {
            state = State::ScanHex;
            rd.skip(2); // skip "0x"
            tok.length += 2;
        }
        // any other character after '0' leaves the literal in decimal mode
    }

    // Digit run for the selected radix. isAtEnd() is checked first because a bare
    // "0x" / "0b" may be the last thing in the source.
    while (!rd.isAtEnd())
    {
        const char32_t ch = rd.current();
        const bool accepted = (state == State::ScanDec && CharUtils::isDigit(ch))
                              || (state == State::ScanHex && CharUtils::isHexDigit(ch))
                              || (state == State::ScanBin && (ch == U'0' || ch == U'1'));
        if (!accepted)
        {
            break;
        }
        tok.length++;
        rd.next();
    }

    // Decimal tail: digit separators and scientific notation.
    if (state == State::ScanDec)
    {
        char32_t previous = U'\0';
        while (!rd.isAtEnd())
        {
            const char32_t ch = rd.current();
            const bool isExponentMark = (ch == U'e' || ch == U'E');
            // A sign belongs to the literal only directly after the exponent marker;
            // otherwise `1+2` would be swallowed as a single number token.
            const bool isExponentSign =
                (ch == U'+' || ch == U'-') && (previous == U'e' || previous == U'E');
            if (!(isExponentMark || isExponentSign || ch == U'_' || CharUtils::isDigit(ch)))
            {
                break;
            }
            tok.length++;
            rd.next();
            previous = ch;
        }
    }
    return tok;
}
/// Scans a string literal delimited by double or single quotes; it may span
/// several lines.
///
/// Sets `state` to ScanStringDQ or ScanStringSQ. The returned token covers only
/// the text between the quotes (both delimiters are consumed but excluded).
///
/// @return The LiteralString token, or an UnterminatedString error located at the
///         opening quote when the source ends before the closing quote.
///
/// NOTE(review): a backslash is not treated specially, so an escaped quote ends
/// the literal — confirm whether escape sequences are handled elsewhere.
Result<Token, Error> Lexer::scanStringLiteral()
{
    state = (rd.current() == U'"' ? State::ScanStringDQ : Lexer::State::ScanStringSQ);
    const char32_t terminator = (state == State::ScanStringDQ ? U'"' : U'\'');

    const SourcePosition startPos = rd.currentPosition();
    rd.next(); // skip the opening quote

    Token tok(rd.currentIndex(), 0, TokenType::LiteralString);

    // The end-of-input test comes first so that an unterminated literal produces
    // the error below instead of reading past the end of the source.
    while (!rd.isAtEnd())
    {
        if (rd.current() == terminator)
        {
            rd.next(); // skip the closing quote
            return tok;
        }
        tok.length++;
        rd.next();
    }

    return std::unexpected(
        Error(ErrorType::UnterminatedString,
              "unterminated string literal",
              std::format("insert '{}'", String((state == State::ScanStringDQ ? "\"" : "'"))),
              makeSourceLocation(startPos)));
}
/// Scans a punctuation token by extending the symbol one character at a time while
/// the accumulated text is still a prefix of some entry in Token::punctMap.
///
/// @return The matching punctuation token, or an InvalidSymbol error located at the
///         start of the symbol when the accumulated text is not itself an entry of
///         Token::punctMap.
Result<Token, Error> Lexer::scanPunct()
{
    Token tok(rd.currentIndex(), 0, TokenType::Illegal);
    const SourcePosition startPos = rd.currentPosition(); // where the symbol begins

    // True when at least one known punctuation starts with `prefix`.
    const auto isPunctPrefix = [](const String &prefix) -> bool {
        for (const auto &entry : Token::punctMap)
        {
            if (entry.first.starts_with(prefix))
            {
                return true;
            }
        }
        return false;
    };

    String sym;
    while (!rd.isAtEnd() && CharUtils::isPunct(rd.current()))
    {
        String candidate = sym + rd.current();
        if (!isPunctPrefix(candidate))
        {
            break;
        }
        rd.next();
        tok.length++;
        sym = std::move(candidate);
    }

    if (!Token::punctMap.contains(sym))
    {
        // When not even the first character starts a known symbol, `sym` is empty;
        // report the offending character instead of an empty string.
        const String offending = (sym.empty() && !rd.isAtEnd()) ? String(rd.current()) : sym;
        return std::unexpected(Error(ErrorType::InvalidSymbol,
                                     std::format("invalid symbol `{}`", offending),
                                     "correct it",
                                     makeSourceLocation(startPos)));
    }
    tok.type = Token::punctMap.at(sym);
    return tok;
}
/// Advances the reader past consecutive whitespace characters, stopping at the
/// first non-whitespace character or at the end of the source.
void Lexer::skipWhitespaces()
{
    // Uses the same predicate NextToken() dispatches on (CharUtils::isSpace), so
    // every character that routes here is actually consumed. The former comparison
    // against EOF was dropped: end of input is already covered by isAtEnd(), and
    // EOF (an int) is not a meaningful char32_t value.
    while (!rd.isAtEnd() && CharUtils::isSpace(rd.current()))
    {
        rd.next();
    }
}
/// Produces the next token from the source.
///
/// Whitespace between tokens is skipped. At the end of the source (or on a U+0000
/// character) an EndOfFile token is returned. Otherwise the first character selects
/// the scanner: comments, identifier/keyword, number, string or punctuation.
///
/// @return The scanned token, the error reported by the selected scanner, or an
///         InvalidCharacter error when the character belongs to none of the
///         categories above.
Result<Token, Error> Lexer::NextToken()
{
    // Whitespace is handled by looping rather than by recursion, so long runs of
    // whitespace cannot grow the call stack.
    while (true)
    {
        if (rd.isAtEnd())
        {
            return Token(rd.currentIndex(), 0, TokenType::EndOfFile);
        }

        const char32_t ch = rd.current();
        if (ch == U'\0')
        {
            return Token(rd.currentIndex(), 1, TokenType::EndOfFile);
        }

        if (ch == U'/')
        {
            const char32_t following = rd.peekIf();
            if (following == U'/')
            {
                return scanComments();
            }
            if (following == U'*')
            {
                return scanMultilineComments();
            }
            // a lone '/' falls through to the punctuation scanner below
        }

        if (CharUtils::isIdentifierStart(ch))
        {
            return scanIdentifierOrKeyword();
        }
        if (CharUtils::isDigit(ch))
        {
            return scanNumberLiteral();
        }
        if (ch == U'"' || ch == U'\'')
        {
            return scanStringLiteral();
        }
        if (CharUtils::isPunct(ch))
        {
            return scanPunct();
        }

        if (!CharUtils::isSpace(ch))
        {
            // Code points are conventionally written in hexadecimal after "U+".
            return std::unexpected(Error(ErrorType::InvalidCharacter,
                                         std::format("invalid character '{}' (U+{:04X})",
                                                     String(ch),
                                                     static_cast<unsigned int>(ch)),
                                         "correct it",
                                         makeSourceLocation(rd.currentPosition())));
        }

        const size_t before = rd.currentIndex();
        skipWhitespaces();
        if (rd.currentIndex() == before)
        {
            // The character counts as whitespace here but was not consumed by
            // skipWhitespaces(); consume it so that scanning always makes progress.
            rd.next();
        }
    }
}
}; // namespace Fig

View File

@@ -10,6 +10,7 @@
#include <Deps/Deps.hpp> #include <Deps/Deps.hpp>
#include <Token/Token.hpp> #include <Token/Token.hpp>
#include <Core/SourceLocations.hpp> #include <Core/SourceLocations.hpp>
#include <Error/Error.hpp>
namespace Fig namespace Fig
{ {
@@ -25,13 +26,13 @@ namespace Fig
SourceReader() SourceReader()
{ {
index = 0; index = 0;
pos.line = pos.column = 0; pos.line = pos.column = 1;
} }
SourceReader(const String &_source) // copy SourceReader(const String &_source) // copy
{ {
source = _source; source = _source;
index = 0; index = 0;
pos.line = pos.column = 0; pos.line = pos.column = 1;
} }
SourcePosition &currentPosition() { return pos; } SourcePosition &currentPosition() { return pos; }
@@ -42,7 +43,16 @@ namespace Fig
return source[index]; return source[index];
} }
inline bool hasNext() const { return index < source.length(); } inline char32_t currentIf() const
{
if (index >= source.length())
{
return U'\0';
}
return source[index];
}
inline bool hasNext() const { return index < source.length() - 1; }
inline char32_t peek() const inline char32_t peek() const
{ {
@@ -66,10 +76,10 @@ namespace Fig
inline void next() inline void next()
{ {
assert(hasNext() && "SrcReader: next failed, need more runes"); char32_t consumed = currentIf();
++index;
if (current() == U'\n') ++index;
if (consumed == U'\n')
{ {
++pos.line; ++pos.line;
pos.column = 1; pos.column = 1;
@@ -80,18 +90,38 @@ namespace Fig
} }
} }
inline void skip(size_t n)
{
for (size_t i = 0; i < n; ++i) { next(); }
}
inline size_t currentIndex() const { return index; } inline size_t currentIndex() const { return index; }
inline bool isAtEnd() const { return index == source.length() - 1; } inline bool isAtEnd() const { return index >= source.length(); }
}; };
class Lexer class Lexer
{ {
public: public:
enum State : uint8_t enum class State : uint8_t
{ {
Normal, Error,
Error Standby,
End,
ScanComments, // 单行注释
ScanMultilineComments, // 多行注释
ScanIdentifier, // 关键字也算
ScanDec, // 十进制数字, 如 1.2 31, 3.14e+3, 1_000_0000
ScanBin, // 二进制数字, 如 0b0001 / 0B0001
ScanHex, // 十六进制数字, 如 0xABCD / 0XabCd
ScanStringDQ, // 双引号字符串, 如 "hello, world!"
ScanStringSQ, // 单引号字符串, 如 'hello'
ScanBool, // 布尔字面量, true / false
ScanNull, // 空值字面量, null
ScanPunct, // 符号
}; };
private: private:
@@ -99,17 +129,22 @@ namespace Fig
SourceReader rd; SourceReader rd;
protected: protected:
Token scanComments(); Result<Token, Error> scanComments();
Token scanIdentifierOrKeyword(); Result<Token, Error> scanMultilineComments();
Token scanNumberLiteral(); Result<Token, Error> scanIdentifierOrKeyword();
Token scanStringLiteral();
Token scanBoolLiteral(); Result<Token, Error> scanNumberLiteral();
Token scanLiteralNull(); Result<Token, Error> scanStringLiteral(); // 支持多行
// Result<Token, Error> scanBoolLiteral(); 由 scanIdentifier...扫描
// Result<Token, Error> scanLiteralNull(); 由 scanIdentifier...扫描
Result<Token, Error> scanPunct();
void skipWhitespaces();
Token scanPunct();
public: public:
State state = Normal; State state = State::Standby;
Lexer() {} Lexer() {}
Lexer(const String &source, String _fileName) Lexer(const String &source, String _fileName)
@@ -118,6 +153,12 @@ namespace Fig
fileName = std::move(_fileName); fileName = std::move(_fileName);
} }
Token NextToken(); SourceLocation makeSourceLocation(const SourcePosition &current_pos)
{
return SourceLocation(
current_pos, fileName, "[internal lexer]", String(magic_enum::enum_name(state).data()));
}
Result<Token, Error> NextToken();
}; };
}; // namespace Fig }; // namespace Fig

43
src/Lexer/LexerTest.cpp Normal file
View File

@@ -0,0 +1,43 @@
#include <Error/Error.hpp>
#include <Token/Token.hpp>
#include <Lexer/Lexer.hpp>
#include <iostream>
int main()
{
using namespace Fig;
String fileName = "test.fig";
String filePath = "T:/Files/Maker/Code/MyCodingLanguage/The Fig Project/Fig/test.fig";
SourceManager manager(filePath);
manager.Read();
if (!manager.read)
{
std::cerr << "Couldn't read file";
return 1;
}
Lexer lexer(manager.GetSource(), fileName);
while (true)
{
const auto &result = lexer.NextToken();
if (!result.has_value())
{
ReportError(result.error(), manager);
break;
}
const Token &token = *result;
const String &lexeme = manager.GetSub(token.index, token.length);
const auto &type = magic_enum::enum_name(token.type);
if (token.type == TokenType::EndOfFile)
{
std::cout << "EOF: " << type << '\n';
break;
}
std::cout << lexeme << " --> " << type << '\n';
}
}

View File

@@ -52,5 +52,10 @@ namespace Fig
{ {
return source.substr(_index_start, _length); return source.substr(_index_start, _length);
} }
const String &GetSource() const
{
return source;
}
}; };
}; };

View File

@@ -2,7 +2,7 @@
namespace Fig namespace Fig
{ {
const HashMap<String, TokenType> Token::symbolMap = { const HashMap<String, TokenType> Token::punctMap = {
// 三字符 // 三字符
{String("..."), TokenType::TripleDot}, {String("..."), TokenType::TripleDot},
// 双字符 // 双字符
@@ -83,5 +83,8 @@ namespace Fig
{String("throw"), TokenType::Throw}, {String("throw"), TokenType::Throw},
{String("Finally"), TokenType::Finally}, {String("Finally"), TokenType::Finally},
{String("as"), TokenType::As}, {String("as"), TokenType::As},
{String("true"), TokenType::LiteralTrue},
{String("false"), TokenType::LiteralFalse},
{String("null"), TokenType::LiteralNull},
}; };
}; };

View File

@@ -54,11 +54,13 @@ namespace Fig
// TypeBool, // Bool // TypeBool, // Bool
// TypeDouble, // Double // TypeDouble, // Double
/* Literal Types (not keyword)*/ /* Literal Types */
LiteralNumber, // number (int,float...) LiteralNumber, // number (int,float...)
LiteralString, // string LiteralString, // string
LiteralBool, // bool (true/false)
LiteralNull, // null (Null unique instance) LiteralTrue, // true <-- keyword
LiteralFalse, // false <-- keyword
LiteralNull, // null (Null unique instance) <-- keyword
/* Punct */ /* Punct */
Plus, // + Plus, // +
@@ -121,12 +123,12 @@ namespace Fig
class Token final class Token final
{ {
public: public:
static const HashMap<String, TokenType> symbolMap; static const HashMap<String, TokenType> punctMap;
static const HashMap<String, TokenType> keywordMap; static const HashMap<String, TokenType> keywordMap;
const size_t index, length; size_t index, length;
// 源文件中的下标 Token长度 // 源文件中的下标 Token长度
const TokenType type; TokenType type;
Token() : index(0), length(0), type(TokenType::Illegal) {}; Token() : index(0), length(0), type(TokenType::Illegal) {};
Token(size_t _index, size_t _length, TokenType _type) : index(_index), length(_length), type(_type) {} Token(size_t _index, size_t _length, TokenType _type) : index(_index), length(_length), type(_type) {}
@@ -138,8 +140,8 @@ namespace Fig
bool isIdentifier() const { return type == TokenType::Identifier; } bool isIdentifier() const { return type == TokenType::Identifier; }
bool isLiteral() const bool isLiteral() const
{ {
return type == TokenType::LiteralNull || type == TokenType::LiteralBool || type == TokenType::LiteralNumber return type == TokenType::LiteralNull || type == TokenType::LiteralTrue || type == TokenType::LiteralFalse
|| type == TokenType::LiteralString; || type == TokenType::LiteralNumber || type == TokenType::LiteralString;
} }
}; };
} // namespace Fig } // namespace Fig

View File

@@ -1,15 +0,0 @@
#include <Error/Error.hpp>
#include <Token/Token.hpp>
int main()
{
using namespace Fig;
Error error{ErrorType::MayBeNull,
"unterminated string literal",
"terminated it",
SourceLocation{2, 4, 5, "main.cpp", "main", "main"}};
SourceManager manager = SourceManager("T:/Files/Maker/Code/MyCodingLanguage/The Fig Project/Fig/src/main.cpp");
manager.Read();
ReportError(error, manager);
}

View File

@@ -28,6 +28,13 @@ add_defines("__FCORE_COMPILE_TIME=\"" .. os.date("%Y-%m-%d %H:%M:%S") .. "\"")
target("StringTest") target("StringTest")
add_files("src/Deps/String/StringTest.cpp") add_files("src/Deps/String/StringTest.cpp")
target("LexerTest")
add_files("src/Core/*.cpp")
add_files("src/Token/Token.cpp")
add_files("src/Error/Error.cpp")
add_files("src/Lexer/Lexer.cpp")
add_files("src/Lexer/LexerTest.cpp")
target("Fig") target("Fig")
add_files("src/Core/*.cpp") add_files("src/Core/*.cpp")
add_files("src/Token/Token.cpp") add_files("src/Token/Token.cpp")