Files
Fig/src/Core/utf8_iterator.hpp
PuqiAR fc35368d85 [Feat] 详细区分左值(LvObject)与右值(RvObject -> ObjectPtr)
[Impl] 重构evaluator.cpp + hpp 全部
[Feat] 增加对于IndexExpr的解析
[Fix][Impl] 现在点运算符不由BinaryExpr负责,增加MemberExpr,单独实现解析
[Impl] 项目目录全部翻修, src/目录下单独文件夹放置每一个模块
2025-12-24 17:54:38 +08:00

260 lines
7.3 KiB
C++

#include <corecrt.h>

#include <cstddef>
#include <cwctype>
#include <iterator>
#include <limits>
#include <string>
// C++ has no convenient built-in way to walk a Unicode string character by character,
// so this header provides minimal UTF-8 character and iterator helpers.
// Generated by Qwen3-Coder:
namespace Fig
{
class UTF8Char
{
private:
std::u8string char_data_;
public:
UTF8Char(const std::u8string &data) :
char_data_(data) {}
// 获取UTF-8字符的字节长度
static size_t getUTF8CharLength(char8_t first_byte)
{
if ((first_byte & 0x80) == 0x00) return 1;
if ((first_byte & 0xE0) == 0xC0) return 2;
if ((first_byte & 0xF0) == 0xE0) return 3;
if ((first_byte & 0xF8) == 0xF0) return 4;
return 1;
}
// 转换为Unicode码点
char32_t toCodePoint() const
{
if (char_data_.empty()) return 0;
size_t len = getUTF8CharLength(char_data_[0]);
if (len > char_data_.length()) return 0;
char32_t code_point = 0;
switch (len)
{
case 1:
code_point = char_data_[0];
break;
case 2:
code_point = ((char_data_[0] & 0x1F) << 6) | (char_data_[1] & 0x3F);
break;
case 3:
code_point = ((char_data_[0] & 0x0F) << 12) | ((char_data_[1] & 0x3F) << 6) | (char_data_[2] & 0x3F);
break;
case 4:
code_point = ((char_data_[0] & 0x07) << 18) | ((char_data_[1] & 0x3F) << 12) | ((char_data_[2] & 0x3F) << 6) | (char_data_[3] & 0x3F);
break;
}
return code_point;
}
inline bool operator==(char32_t ch)
{
return this->toCodePoint() == ch;
}
// 字符分类函数
bool isAlpha() const
{
char32_t cp = toCodePoint();
return std::iswalpha(static_cast<wint_t>(cp));
}
bool isDigit() const
{
char32_t cp = toCodePoint();
return std::iswdigit(static_cast<wint_t>(cp));
}
bool isAlnum() const
{
char32_t cp = toCodePoint();
return std::iswalnum(static_cast<wint_t>(cp));
}
bool isSpace() const
{
char32_t cp = toCodePoint();
return std::iswspace(static_cast<wint_t>(cp));
}
bool isUpper() const
{
char32_t cp = toCodePoint();
return std::iswupper(static_cast<wint_t>(cp));
}
bool isLower() const
{
char32_t cp = toCodePoint();
return std::iswlower(static_cast<wint_t>(cp));
}
bool isPunct() const
{
char32_t cp = toCodePoint();
return std::iswpunct(static_cast<wint_t>(cp));
}
// 获取底层数据
const std::u8string &getString() const { return char_data_; }
// 获取字符长度(字节数)
size_t length() const { return char_data_.length(); }
// 是否为空
bool empty() const { return char_data_.empty(); }
};
class UTF8Iterator
{
private:
const std::u8string *str_;
size_t pos_;
// 获取UTF-8字符的字节长度
static size_t getUTF8CharLength(char8_t first_byte)
{
if ((first_byte & 0x80) == 0x00) return 1;
if ((first_byte & 0xE0) == 0xC0) return 2;
if ((first_byte & 0xF0) == 0xE0) return 3;
if ((first_byte & 0xF8) == 0xF0) return 4;
return 1;
}
// 获取下一个字符的起始位置
size_t getNextCharPos(size_t current_pos) const
{
if (current_pos >= str_->length()) return current_pos;
size_t char_len = getUTF8CharLength((*str_)[current_pos]);
return current_pos + char_len;
}
// 获取前一个字符的起始位置
size_t getPrevCharPos(size_t current_pos) const
{
if (current_pos == 0) return 0;
size_t pos = current_pos - 1;
while (pos > 0 && (str_->at(pos) & 0xC0) == 0x80)
{
--pos;
}
return pos;
}
public:
using iterator_category = std::bidirectional_iterator_tag;
using value_type = UTF8Char;
using difference_type = std::ptrdiff_t;
using pointer = const UTF8Char *;
using reference = const UTF8Char &;
// 构造函数
UTF8Iterator(const std::u8string &str, size_t pos = 0) :
str_(&str), pos_(pos)
{
if (pos_ > str_->length()) pos_ = str_->length();
}
// 前置递增
UTF8Iterator &operator++()
{
pos_ = getNextCharPos(pos_);
return *this;
}
// 后置递增
UTF8Iterator operator++(int)
{
UTF8Iterator temp = *this;
pos_ = getNextCharPos(pos_);
return temp;
}
// 前置递减
UTF8Iterator &operator--()
{
pos_ = getPrevCharPos(pos_);
return *this;
}
// 后置递减
UTF8Iterator operator--(int)
{
UTF8Iterator temp = *this;
pos_ = getPrevCharPos(pos_);
return temp;
}
// 解引用操作符 - 返回当前字符
UTF8Char operator*() const
{
if (pos_ >= str_->length())
{
return UTF8Char(std::u8string());
}
size_t char_len = getUTF8CharLength((*str_)[pos_]);
size_t end_pos = pos_ + char_len;
if (end_pos > str_->length())
{
end_pos = str_->length();
}
return UTF8Char(str_->substr(pos_, end_pos - pos_));
}
UTF8Char peek() const
{
if (pos_ >= str_->length())
{
return UTF8Char(std::u8string());
}
size_t next_pos = getNextCharPos(pos_);
if (next_pos >= str_->length())
{
return UTF8Char(std::u8string());
}
size_t char_len = getUTF8CharLength((*str_)[next_pos]);
size_t end_pos = next_pos + char_len;
if (end_pos > str_->length())
{
end_pos = str_->length();
}
return UTF8Char(str_->substr(next_pos, end_pos - next_pos));
}
// 窥探前一个字符
UTF8Char peekPrev() const
{
if (pos_ == 0)
{
return UTF8Char(std::u8string());
}
size_t prev_pos = getPrevCharPos(pos_);
size_t char_len = getUTF8CharLength((*str_)[prev_pos]);
size_t end_pos = prev_pos + char_len;
if (end_pos > str_->length())
{
end_pos = str_->length();
}
return UTF8Char(str_->substr(prev_pos, end_pos - prev_pos));
}
// 获取当前位置
size_t position() const { return pos_; }
size_t column() const { return pos_ + 1; }
// 检查是否到达末尾
bool isEnd() const { return pos_ >= str_->length(); }
};
} // namespace Fig