/*! @file src/Deps/String/String.hpp @brief UTF32/Pure ASCII + SSO优化的字符串实现 @author PuqiAR (im@puqiar.top) @date 2026-02-13 */ #pragma once #include #include #include #include #include #include namespace Fig::Deps { class StringUtils { public: static bool is_pure_ascii(const char *data, size_t n) noexcept { for (size_t i = 0; i < n; ++i) { if (static_cast(data[i]) >= 128) return false; } return true; } static bool is_pure_ascii(const char32_t *data, size_t n) noexcept { for (size_t i = 0; i < n; ++i) { if (data[i] >= 128) return false; } return true; } static size_t utf8_decode_one(const char *s, size_t n, char32_t &out) { unsigned char c0 = static_cast(s[0]); if (c0 < 0x80) { out = c0; return 1; } if ((c0 >> 5) == 0x6 && n >= 2) { unsigned char c1 = static_cast(s[1]); out = ((c0 & 0x1F) << 6) | (c1 & 0x3F); return 2; } if ((c0 >> 4) == 0xE && n >= 3) { unsigned char c1 = static_cast(s[1]); unsigned char c2 = static_cast(s[2]); out = ((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F); return 3; } if ((c0 >> 3) == 0x1E && n >= 4) { unsigned char c1 = static_cast(s[1]); unsigned char c2 = static_cast(s[2]); unsigned char c3 = static_cast(s[3]); out = ((c0 & 0x07) << 18) | ((c1 & 0x3F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); return 4; } out = 0xFFFD; return 1; } }; class String { public: using u32 = char32_t; static constexpr uint8_t SSO_SIZE = 22; enum class Mode : uint8_t { ASCII_SSO, // ASCII ASCII_HEP, // ASCII heap UTF32_HEP, // UTF32 heap }; private: Mode mode = Mode::ASCII_SSO; union { unsigned char sso[SSO_SIZE]; // non null terminate std::vector ascii; std::vector utf32; }; size_t _length = 0; void copyfrom(const String &other) { destroy(); _length = other._length; mode = other.mode; if (mode == Mode::ASCII_SSO) { memcpy(sso, other.sso, sizeof(unsigned char) * _length); } else if (mode == Mode::ASCII_HEP) { new (&ascii) std::vector(other.ascii); } else { new (&utf32) std::vector(other.utf32); } } void movefrom(String &&other) noexcept { destroy(); mode = other.mode; _length = other._length; switch (mode) { case Mode::ASCII_SSO: std::memcpy(sso, other.sso, other._length); break; case Mode::ASCII_HEP: new (&ascii) std::vector(std::move(other.ascii)); break; case Mode::UTF32_HEP: new (&utf32) std::vector(std::move(other.utf32)); break; } other.mode = Mode::ASCII_SSO; other._length = 0; } void destroy() noexcept { if (mode == Mode::ASCII_SSO) { // pass } if (mode == Mode::ASCII_HEP) { ascii.~vector(); } if (mode == Mode::UTF32_HEP) { utf32.~vector(); } } void ensure_utf32() { if (mode == Mode::UTF32_HEP) return; std::vector tmp; tmp.reserve(_length); if (mode == Mode::ASCII_SSO) { for (size_t i = 0; i < _length; ++i) tmp.push_back(static_cast(sso[i])); } else // ASCII_HEP { for (unsigned char c : ascii) tmp.push_back(static_cast(c)); } destroy(); mode = Mode::UTF32_HEP; new (&utf32) std::vector(std::move(tmp)); } void promote_sso_ascii_to_heap() noexcept { assert(mode == Mode::ASCII_SSO && "promote_sso_ascii_to_heap: mode is not ascii sso"); mode = Mode::ASCII_HEP; std::vector tmp; tmp.reserve(_length); for (size_t i = 0; i < _length; ++i) tmp.push_back(sso[i]); mode = Mode::ASCII_HEP; new (&ascii) std::vector(std::move(tmp)); } void init(const char *data) { assert(data); size_t n = std::strlen(data); init(data, n); } void init(const char *data, size_t n) { destroy(); _length = 0; // ASCII 快路径 if (n <= SSO_SIZE && StringUtils::is_pure_ascii(data, n)) { mode = Mode::ASCII_SSO; std::memcpy(sso, data, n); _length = n; return; } if (StringUtils::is_pure_ascii(data, n)) { mode = Mode::ASCII_HEP; new (&ascii) std::vector(data, data + n); _length = n; return; } // UTF-8 decode mode = Mode::UTF32_HEP; new (&utf32) std::vector(); utf32.reserve(n); for (size_t i = 0; i < n;) { u32 cp; size_t step = StringUtils::utf8_decode_one(data + i, n - i, cp); utf32.push_back(cp); i += step; } utf32.shrink_to_fit(); _length = utf32.size(); } void init(const u32 *data) { assert(data); size_t n = 0; while (data[n] != 0) ++n; init(data, n); } void init(const u32 *data, size_t n) { destroy(); _length = n; if (n <= SSO_SIZE && StringUtils::is_pure_ascii(data, n)) { mode = Mode::ASCII_SSO; for (size_t i = 0; i < n; ++i) sso[i] = static_cast(data[i]); return; } if (StringUtils::is_pure_ascii(data, n)) { mode = Mode::ASCII_HEP; new (&ascii) std::vector(); ascii.reserve(n); for (size_t i = 0; i < n; ++i) ascii.push_back(static_cast(data[i])); return; } mode = Mode::UTF32_HEP; new (&utf32) std::vector(); utf32.assign(data, data + n); } public: size_t length() const noexcept { return _length; } size_t size() const noexcept { return _length; } bool empty() const noexcept { return _length == 0; } void reserve(size_t n) { if (mode == Mode::ASCII_HEP) ascii.reserve(n); else if (mode == Mode::UTF32_HEP) utf32.reserve(n); } void clear() noexcept { _length = 0; if (mode == Mode::ASCII_SSO) { // pass } if (mode == Mode::ASCII_HEP) { ascii.clear(); } else { utf32.clear(); } } void shrink_to_fit() noexcept { if (mode == Mode::ASCII_HEP) { ascii.shrink_to_fit(); } else { utf32.shrink_to_fit(); } } ~String() noexcept { destroy(); } String() noexcept { mode = Mode::ASCII_SSO; _length = 0; } String(const String &other) noexcept { copyfrom(other); } String(String &&other) noexcept { movefrom(std::move(other)); } String(const char *str) { init(str); } String(const char32_t *str) { init(str); } String(char32_t c) { init(""); push_back(c); } String(char c) { init(""); push_back(static_cast(c)); } String(const std::string &s) { init(s.data(), s.size()); } static String fromPureAscii(const char *str) { String string; string._length = std::strlen(str); if (string._length <= SSO_SIZE) { memcpy(string.sso, str, string._length); } else { string.ascii.reserve(string._length); for (size_t i = 0; i < string._length; ++i) { string.ascii.push_back(str[i]); } } return string; } String &operator=(const String &other) { if (this != &other) { destroy(); copyfrom(other); } return *this; } String &operator=(String &&other) noexcept { if (this != &other) movefrom(std::move(other)); return *this; } String &operator+=(const String &rhs) { if (rhs._length == 0) return *this; // 两边都是 ASCII bool this_ascii = (mode == Mode::ASCII_SSO || mode == Mode::ASCII_HEP); bool rhs_ascii = (rhs.mode == Mode::ASCII_SSO || rhs.mode == Mode::ASCII_HEP); if (this_ascii && rhs_ascii) { size_t newlen = _length + rhs._length; // SSO 可容纳 if (mode == Mode::ASCII_SSO && newlen <= SSO_SIZE) { if (rhs.mode == Mode::ASCII_SSO) std::memcpy(sso + _length, rhs.sso, rhs._length); else std::memcpy(sso + _length, rhs.ascii.data(), rhs._length); _length = newlen; return *this; } if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); // 追加 if (rhs.mode == Mode::ASCII_SSO) ascii.insert(ascii.end(), rhs.sso, rhs.sso + rhs._length); else ascii.insert(ascii.end(), rhs.ascii.begin(), rhs.ascii.end()); _length = newlen; return *this; } // 必须 UTF32 if (mode != Mode::UTF32_HEP) { std::vector tmp; tmp.reserve(_length + rhs._length); if (mode == Mode::ASCII_SSO) { for (size_t i = 0; i < _length; ++i) tmp.push_back(static_cast(sso[i])); } else // ASCII_HEP { for (unsigned char c : ascii) tmp.push_back(static_cast(c)); } destroy(); mode = Mode::UTF32_HEP; new (&utf32) std::vector(std::move(tmp)); } if (rhs.mode == Mode::UTF32_HEP) { utf32.insert(utf32.end(), rhs.utf32.begin(), rhs.utf32.end()); } else if (rhs.mode == Mode::ASCII_SSO) { for (size_t i = 0; i < rhs._length; ++i) utf32.push_back(static_cast(rhs.sso[i])); } else // ASCII_HEP { for (unsigned char c : rhs.ascii) utf32.push_back(static_cast(c)); } _length = utf32.size(); return *this; } String &operator+=(const char *utf8) { String tmp(utf8); return (*this += tmp); } friend String operator+(String lhs, const String &rhs) { lhs += rhs; return lhs; } void push_back(u32 cp) { if (cp < 128) { if (mode == Mode::ASCII_SSO && _length < SSO_SIZE) { sso[_length++] = static_cast(cp); return; } if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); if (mode == Mode::ASCII_HEP) { ascii.push_back(static_cast(cp)); ++_length; return; } } ensure_utf32(); utf32.push_back(cp); _length = utf32.size(); } void pop_back() { assert(_length > 0); if (mode == Mode::ASCII_SSO) { --_length; return; } if (mode == Mode::ASCII_HEP) { ascii.pop_back(); --_length; return; } utf32.pop_back(); _length = utf32.size(); } String &append(const char *utf8) { String tmp(utf8); *this += tmp; return *this; } String &append(const char32_t *u32str) { String tmp(u32str); *this += tmp; return *this; } String &append(size_t count, u32 cp) { for (size_t i = 0; i < count; ++i) push_back(cp); return *this; } void resize(size_t new_size, u32 fill = 0) { if (new_size <= _length) { erase(new_size); return; } append(new_size - _length, fill); } u32 front() const { assert(_length > 0); return (*this)[0]; } u32 back() const { assert(_length > 0); return (*this)[_length - 1]; } std::string toStdString() const { std::string out; if (mode == Mode::ASCII_SSO) { out.assign(reinterpret_cast(sso), _length); return out; } if (mode == Mode::ASCII_HEP) { out.assign(ascii.begin(), ascii.end()); return out; } // UTF32_HEP -> UTF-8 encode for (u32 cp : utf32) { if (cp <= 0x7F) { out.push_back(static_cast(cp)); } else if (cp <= 0x7FF) { out.push_back(static_cast(0xC0 | (cp >> 6))); out.push_back(static_cast(0x80 | (cp & 0x3F))); } else if (cp <= 0xFFFF) { out.push_back(static_cast(0xE0 | (cp >> 12))); out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); out.push_back(static_cast(0x80 | (cp & 0x3F))); } else if (cp <= 0x10FFFF) { out.push_back(static_cast(0xF0 | (cp >> 18))); out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); out.push_back(static_cast(0x80 | (cp & 0x3F))); } // 非法码点 } return out; } friend std::ostream &operator<<(std::ostream &os, const String &s) { return os << s.toStdString(); } friend bool operator==(const String &a, const String &b) noexcept { if (a._length != b._length) return false; // 同模式 if (a.mode == b.mode) { if (a.mode == Mode::ASCII_SSO) return std::memcmp(a.sso, b.sso, a._length) == 0; if (a.mode == Mode::ASCII_HEP) return a.ascii == b.ascii; return a.utf32 == b.utf32; } // 不同模式ASCII / UTF32 const bool a_ascii = (a.mode == Mode::ASCII_SSO || a.mode == Mode::ASCII_HEP); const bool b_ascii = (b.mode == Mode::ASCII_SSO || b.mode == Mode::ASCII_HEP); if (a_ascii && b_ascii) { if (a.mode == Mode::ASCII_SSO) return std::memcmp(a.sso, b.ascii.data(), a._length) == 0; else return std::memcmp(a.ascii.data(), b.sso, a._length) == 0; } // ASCII / UTF32 const String &ascii_str = a_ascii ? a : b; const String &utf32_str = a_ascii ? b : a; if (ascii_str.mode == Mode::ASCII_SSO) { for (size_t i = 0; i < ascii_str._length; ++i) if (static_cast(ascii_str.sso[i]) != utf32_str.utf32[i]) return false; } else { for (size_t i = 0; i < ascii_str._length; ++i) if (static_cast(ascii_str.ascii[i]) != utf32_str.utf32[i]) return false; } return true; } friend bool operator!=(const String &a, const String &b) noexcept { return !(a == b); } // std::hash friend struct std::hash; // read only u32 operator[](size_t i) const { assert(i < _length); if (mode == Mode::ASCII_SSO) return static_cast(sso[i]); if (mode == Mode::ASCII_HEP) return static_cast(ascii[i]); return utf32[i]; } u32 at(size_t i) const { if (i >= _length) throw std::out_of_range("String::at"); return (*this)[i]; } bool starts_with(const String &prefix) const { if (prefix._length > _length) return false; for (size_t i = 0; i < prefix._length; ++i) if ((*this)[i] != prefix[i]) return false; return true; } bool ends_with(const String &suffix) const { if (suffix._length > _length) return false; size_t offset = _length - suffix._length; for (size_t i = 0; i < suffix._length; ++i) if ((*this)[offset + i] != suffix[i]) return false; return true; } bool contains(u32 cp) const { if (mode == Mode::ASCII_SSO) { for (size_t i = 0; i < _length; ++i) if (sso[i] == cp) return true; return false; } if (mode == Mode::ASCII_HEP) { if (cp >= 128) return false; for (unsigned char c : ascii) if (c == cp) return true; return false; } for (u32 c : utf32) if (c == cp) return true; return false; } String substr(size_t pos, size_t count = size_t(-1)) const { if (pos >= _length) return String(); size_t len = (_length - pos < count) ? (_length - pos) : count; String out; // ASCII_SSO if (mode == Mode::ASCII_SSO) { if (len <= SSO_SIZE) { out.mode = Mode::ASCII_SSO; std::memcpy(out.sso, sso + pos, len); out._length = len; } else { out.mode = Mode::ASCII_HEP; new (&out.ascii) std::vector(sso + pos, sso + pos + len); out._length = len; } return out; } // ASCII_HEP if (mode == Mode::ASCII_HEP) { if (len <= SSO_SIZE) { out.mode = Mode::ASCII_SSO; std::memcpy(out.sso, ascii.data() + pos, len); out._length = len; } else { out.mode = Mode::ASCII_HEP; new (&out.ascii) std::vector(ascii.begin() + pos, ascii.begin() + pos + len); out._length = len; } return out; } // UTF32 out.mode = Mode::UTF32_HEP; new (&out.utf32) std::vector(utf32.begin() + pos, utf32.begin() + pos + len); out._length = len; return out; } String &erase(size_t pos, size_t count = size_t(-1)) { if (pos >= _length) return *this; size_t len = (_length - pos < count) ? (_length - pos) : count; if (mode == Mode::ASCII_SSO) { std::memmove(sso + pos, sso + pos + len, _length - pos - len); _length -= len; return *this; } if (mode == Mode::ASCII_HEP) { ascii.erase(ascii.begin() + pos, ascii.begin() + pos + len); _length -= len; return *this; } utf32.erase(utf32.begin() + pos, utf32.begin() + pos + len); _length = utf32.size(); return *this; } String &insert(size_t pos, const String &other) { if (pos > _length) pos = _length; if (other._length == 0) return *this; bool this_ascii = (mode != Mode::UTF32_HEP); bool other_ascii = (other.mode != Mode::UTF32_HEP); // ASCII 合并路径 if (this_ascii && other_ascii) { size_t newlen = _length + other._length; if (mode == Mode::ASCII_SSO && newlen <= SSO_SIZE) { std::memmove(sso + pos + other._length, sso + pos, _length - pos); if (other.mode == Mode::ASCII_SSO) std::memcpy(sso + pos, other.sso, other._length); else std::memcpy(sso + pos, other.ascii.data(), other._length); _length = newlen; return *this; } if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); if (other.mode == Mode::ASCII_SSO) ascii.insert(ascii.begin() + pos, other.sso, other.sso + other._length); else ascii.insert(ascii.begin() + pos, other.ascii.begin(), other.ascii.end()); _length = newlen; return *this; } // UTF32 路径 ensure_utf32(); if (other.mode == Mode::UTF32_HEP) utf32.insert(utf32.begin() + pos, other.utf32.begin(), other.utf32.end()); else if (other.mode == Mode::ASCII_SSO) for (size_t i = 0; i < other._length; ++i) utf32.insert(utf32.begin() + pos + i, static_cast(other.sso[i])); else for (size_t i = 0; i < other._length; ++i) utf32.insert(utf32.begin() + pos + i, static_cast(other.ascii[i])); _length = utf32.size(); return *this; } int compare(const String &other) const noexcept { size_t n = (_length < other._length) ? _length : other._length; for (size_t i = 0; i < n; ++i) { u32 a = (*this)[i]; u32 b = other[i]; if (a != b) return (a < b) ? -1 : 1; } if (_length == other._length) return 0; return (_length < other._length) ? -1 : 1; } size_t find(const String &needle, size_t pos = 0) const { if (needle._length == 0) return pos <= _length ? pos : size_t(-1); if (needle._length > _length || pos >= _length) return size_t(-1); size_t limit = _length - needle._length; for (size_t i = pos; i <= limit; ++i) { size_t j = 0; for (; j < needle._length; ++j) if ((*this)[i + j] != needle[j]) break; if (j == needle._length) return i; } return size_t(-1); } size_t rfind(const String &needle) const { if (needle._length == 0) return _length; if (needle._length > _length) return size_t(-1); for (size_t i = _length - needle._length + 1; i-- > 0;) { size_t j = 0; for (; j < needle._length; ++j) if ((*this)[i + j] != needle[j]) break; if (j == needle._length) return i; } return size_t(-1); } String &replace(size_t pos, size_t len, const String &repl) { if (pos >= _length) return *this; size_t erase_len = (_length - pos < len) ? (_length - pos) : len; // ASCII路径 bool this_ascii = (mode != Mode::UTF32_HEP); bool repl_ascii = (repl.mode != Mode::UTF32_HEP); if (this_ascii && repl_ascii) { size_t newlen = _length - erase_len + repl._length; // SSO容纳 if (mode == Mode::ASCII_SSO && newlen <= SSO_SIZE) { std::memmove(sso + pos + repl._length, sso + pos + erase_len, _length - pos - erase_len); if (repl.mode == Mode::ASCII_SSO) std::memcpy(sso + pos, repl.sso, repl._length); else std::memcpy(sso + pos, repl.ascii.data(), repl._length); _length = newlen; return *this; } if (mode == Mode::ASCII_SSO) promote_sso_ascii_to_heap(); ascii.erase(ascii.begin() + pos, ascii.begin() + pos + erase_len); if (repl.mode == Mode::ASCII_SSO) ascii.insert(ascii.begin() + pos, repl.sso, repl.sso + repl._length); else ascii.insert(ascii.begin() + pos, repl.ascii.begin(), repl.ascii.end()); _length = newlen; return *this; } // UTF32路径 ensure_utf32(); utf32.erase(utf32.begin() + pos, utf32.begin() + pos + erase_len); if (repl.mode == Mode::UTF32_HEP) utf32.insert(utf32.begin() + pos, repl.utf32.begin(), repl.utf32.end()); else if (repl.mode == Mode::ASCII_SSO) for (size_t i = 0; i < repl._length; ++i) utf32.insert(utf32.begin() + pos + i, static_cast(repl.sso[i])); else for (size_t i = 0; i < repl._length; ++i) utf32.insert(utf32.begin() + pos + i, static_cast(repl.ascii[i])); _length = utf32.size(); return *this; } }; }; // namespace Fig::Deps namespace std { template <> struct hash { size_t operator()(const Fig::Deps::String &s) const noexcept { using String = Fig::Deps::String; using u32 = String::u32; const size_t FNV_offset = 1469598103934665603ull; const size_t FNV_prime = 1099511628211ull; size_t h = FNV_offset; if (s.mode == String::Mode::ASCII_SSO) { for (size_t i = 0; i < s._length; ++i) { h ^= s.sso[i]; h *= FNV_prime; } return h; } if (s.mode == String::Mode::ASCII_HEP) { for (unsigned char c : s.ascii) { h ^= c; h *= FNV_prime; } return h; } // UTF32 for (u32 cp : s.utf32) { h ^= static_cast(cp); h *= FNV_prime; } return h; } }; template <> struct std::formatter { // 不支持自定义格式说明符 constexpr auto parse(std::format_parse_context &ctx) { return ctx.begin(); } template auto format(const Fig::Deps::String &s, FormatContext &ctx) const { return std::format_to(ctx.out(), "{}", s.toStdString()); } }; } // namespace std