mirror of
https://git.mirrors.martin98.com/https://github.com/gulrak/filesystem
synced 2025-06-11 02:36:33 +08:00
refs #18, some refactoring on unicode handling and a fix for a VS2019 warning.
This commit is contained in:
parent
61182d039f
commit
9d5bc604c9
@ -1179,12 +1179,15 @@ GHC_INLINE void appendUTF8(std::string& str, uint32_t unicode)
|
|||||||
str.push_back(static_cast<char>(((unicode & 0xfff) >> 6) + 128));
|
str.push_back(static_cast<char>(((unicode & 0xfff) >> 6) + 128));
|
||||||
str.push_back(static_cast<char>((unicode & 0x3f) + 128));
|
str.push_back(static_cast<char>((unicode & 0x3f) + 128));
|
||||||
}
|
}
|
||||||
else {
|
else if (unicode >= 0x10000 && unicode <= 0x10ffff) {
|
||||||
str.push_back(static_cast<char>((unicode >> 18) + 240));
|
str.push_back(static_cast<char>((unicode >> 18) + 240));
|
||||||
str.push_back(static_cast<char>(((unicode & 0x3ffff) >> 12) + 128));
|
str.push_back(static_cast<char>(((unicode & 0x3ffff) >> 12) + 128));
|
||||||
str.push_back(static_cast<char>(((unicode & 0xfff) >> 6) + 128));
|
str.push_back(static_cast<char>(((unicode & 0xfff) >> 6) + 128));
|
||||||
str.push_back(static_cast<char>((unicode & 0x3f) + 128));
|
str.push_back(static_cast<char>((unicode & 0x3f) + 128));
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
str.push_back(0xfffd);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Thanks to Bjoern Hoehrmann (https://bjoern.hoehrmann.de/utf-8/decoder/dfa/)
|
// Thanks to Bjoern Hoehrmann (https://bjoern.hoehrmann.de/utf-8/decoder/dfa/)
|
||||||
@ -1192,10 +1195,8 @@ GHC_INLINE void appendUTF8(std::string& str, uint32_t unicode)
|
|||||||
// Generating debugging and shrinking my own DFA from scratch was a day of fun!
|
// Generating debugging and shrinking my own DFA from scratch was a day of fun!
|
||||||
GHC_INLINE unsigned consumeUtf8Fragment(const unsigned state, const uint8_t fragment, uint32_t& codepoint)
|
GHC_INLINE unsigned consumeUtf8Fragment(const unsigned state, const uint8_t fragment, uint32_t& codepoint)
|
||||||
{
|
{
|
||||||
static const uint32_t utf8_state_info[] = {
|
static const uint32_t utf8_state_info[] = {0x11111111u, 0x11111111u, 0x77777777u, 0x77777777u, 0x88888888u, 0x88888888u, 0x88888888u, 0x88888888u, 0x22222299u, 0x22222222u, 0x22222222u, 0x22222222u, 0x3333333au, 0x33433333u, 0x9995666bu, 0x99999999u,
|
||||||
0x11111111u, 0x11111111u, 0x77777777u, 0x77777777u, 0x88888888u, 0x88888888u, 0x88888888u, 0x88888888u, 0x22222299u, 0x22222222u, 0x22222222u, 0x22222222u, 0x3333333au, 0x33433333u,
|
0x88888880u, 0x22818108u, 0x88888881u, 0x88888882u, 0x88888884u, 0x88888887u, 0x88888886u, 0x82218108u, 0x82281108u, 0x88888888u, 0x88888883u, 0x88888885u, 0u, 0u, 0u, 0u};
|
||||||
0x9995666bu, 0x99999999u, 0x88888880u, 0x22818108u, 0x88888881u, 0x88888882u, 0x88888884u, 0x88888887u, 0x88888886u, 0x82218108u, 0x82281108u, 0x88888888u, 0x88888883u, 0x88888885u,
|
|
||||||
};
|
|
||||||
uint8_t category = fragment < 128 ? 0 : (utf8_state_info[(fragment >> 3) & 0xf] >> ((fragment & 7) << 2)) & 0xf;
|
uint8_t category = fragment < 128 ? 0 : (utf8_state_info[(fragment >> 3) & 0xf] >> ((fragment & 7) << 2)) & 0xf;
|
||||||
codepoint = (state ? (codepoint << 6) | (fragment & 0x3f) : (0xff >> category) & fragment);
|
codepoint = (state ? (codepoint << 6) | (fragment & 0x3f) : (0xff >> category) & fragment);
|
||||||
return state == S_RJCT ? static_cast<unsigned>(S_RJCT) : static_cast<unsigned>((utf8_state_info[category + 16] >> (state << 2)) & 0xf);
|
return state == S_RJCT ? static_cast<unsigned>(S_RJCT) : static_cast<unsigned>((utf8_state_info[category + 16] >> (state << 2)) & 0xf);
|
||||||
@ -1207,12 +1208,15 @@ GHC_INLINE unsigned consumeUtf8Fragment(const unsigned state, const uint8_t frag
|
|||||||
|
|
||||||
namespace detail {
|
namespace detail {
|
||||||
|
|
||||||
template <class StringType>
|
template <class StringType, typename std::enable_if<(sizeof(typename StringType::value_type) == 1)>::type* = nullptr>
|
||||||
inline StringType fromUtf8(const std::string& utf8String, const typename StringType::allocator_type& alloc = typename StringType::allocator_type())
|
inline StringType fromUtf8(const std::string& utf8String, const typename StringType::allocator_type& alloc = typename StringType::allocator_type())
|
||||||
{
|
{
|
||||||
if (sizeof(typename StringType::value_type) == 1) {
|
|
||||||
return StringType(utf8String.begin(), utf8String.end());
|
return StringType(utf8String.begin(), utf8String.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class StringType, typename std::enable_if<(sizeof(typename StringType::value_type) == 2)>::type* = nullptr>
|
||||||
|
inline StringType fromUtf8(const std::string& utf8String, const typename StringType::allocator_type& alloc = typename StringType::allocator_type())
|
||||||
|
{
|
||||||
StringType result(alloc);
|
StringType result(alloc);
|
||||||
result.reserve(utf8String.length());
|
result.reserve(utf8String.length());
|
||||||
std::string::const_iterator iter = utf8String.begin();
|
std::string::const_iterator iter = utf8String.begin();
|
||||||
@ -1220,10 +1224,6 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
|
|||||||
std::uint32_t codepoint = 0;
|
std::uint32_t codepoint = 0;
|
||||||
while (iter < utf8String.end()) {
|
while (iter < utf8String.end()) {
|
||||||
if ((utf8_state = consumeUtf8Fragment(utf8_state, (uint8_t)*iter++, codepoint)) == S_STRT) {
|
if ((utf8_state = consumeUtf8Fragment(utf8_state, (uint8_t)*iter++, codepoint)) == S_STRT) {
|
||||||
if (sizeof(typename StringType::value_type) == 4) {
|
|
||||||
result += codepoint;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (codepoint <= 0xffff) {
|
if (codepoint <= 0xffff) {
|
||||||
result += (typename StringType::value_type)codepoint;
|
result += (typename StringType::value_type)codepoint;
|
||||||
}
|
}
|
||||||
@ -1232,7 +1232,6 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
|
|||||||
result += (typename StringType::value_type)((codepoint >> 10) + 0xd800);
|
result += (typename StringType::value_type)((codepoint >> 10) + 0xd800);
|
||||||
result += (typename StringType::value_type)((codepoint & 0x3ff) + 0xdc00);
|
result += (typename StringType::value_type)((codepoint & 0x3ff) + 0xdc00);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
codepoint = 0;
|
codepoint = 0;
|
||||||
}
|
}
|
||||||
else if (utf8_state == S_RJCT) {
|
else if (utf8_state == S_RJCT) {
|
||||||
@ -1247,17 +1246,42 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename charT, typename traits, typename Alloc>
|
template <class StringType, typename std::enable_if<(sizeof(typename StringType::value_type) == 4)>::type* = nullptr>
|
||||||
|
inline StringType fromUtf8(const std::string& utf8String, const typename StringType::allocator_type& alloc = typename StringType::allocator_type())
|
||||||
|
{
|
||||||
|
StringType result(alloc);
|
||||||
|
result.reserve(utf8String.length());
|
||||||
|
std::string::const_iterator iter = utf8String.begin();
|
||||||
|
unsigned utf8_state = S_STRT;
|
||||||
|
std::uint32_t codepoint = 0;
|
||||||
|
while (iter < utf8String.end()) {
|
||||||
|
if ((utf8_state = consumeUtf8Fragment(utf8_state, (uint8_t)*iter++, codepoint)) == S_STRT) {
|
||||||
|
result += codepoint;
|
||||||
|
codepoint = 0;
|
||||||
|
}
|
||||||
|
else if (utf8_state == S_RJCT) {
|
||||||
|
result += (typename StringType::value_type)0xfffd;
|
||||||
|
utf8_state = S_STRT;
|
||||||
|
codepoint = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (utf8_state) {
|
||||||
|
result += (typename StringType::value_type)0xfffd;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename charT, typename traits, typename Alloc, typename std::enable_if<(sizeof(charT) == 1)>::type* = nullptr>
|
||||||
inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicodeString)
|
inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicodeString)
|
||||||
{
|
{
|
||||||
using StringType = std::basic_string<charT, traits, Alloc>;
|
|
||||||
if (sizeof(typename StringType::value_type) == 1) {
|
|
||||||
return std::string(unicodeString.begin(), unicodeString.end());
|
return std::string(unicodeString.begin(), unicodeString.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename charT, typename traits, typename Alloc, typename std::enable_if<(sizeof(charT) == 2)>::type* = nullptr>
|
||||||
|
inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicodeString)
|
||||||
|
{
|
||||||
std::string result;
|
std::string result;
|
||||||
result.reserve(unicodeString.length());
|
for (auto iter = unicodeString.begin(); iter != unicodeString.end(); ++iter) {
|
||||||
if (sizeof(typename StringType::value_type) == 2) {
|
|
||||||
for (typename StringType::const_iterator iter = unicodeString.begin(); iter != unicodeString.end(); ++iter) {
|
|
||||||
char32_t c = *iter;
|
char32_t c = *iter;
|
||||||
if (is_surrogate(c)) {
|
if (is_surrogate(c)) {
|
||||||
++iter;
|
++iter;
|
||||||
@ -1272,12 +1296,16 @@ inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicode
|
|||||||
appendUTF8(result, c);
|
appendUTF8(result, c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
for (char32_t c : unicodeString) {
|
template <typename charT, typename traits, typename Alloc, typename std::enable_if<(sizeof(charT) == 4)>::type* = nullptr>
|
||||||
|
inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicodeString)
|
||||||
|
{
|
||||||
|
std::string result;
|
||||||
|
for (auto c : unicodeString) {
|
||||||
appendUTF8(result, c);
|
appendUTF8(result, c);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user