refs #18, unicode error raise can now be enabled with define GHC_RAISE_UNICODE_ERRORS

2025-10-19 08:21:29 +08:00 · 2019-06-02 11:52:07 +02:00 · 2019-06-02 11:52:07 +02:00 · 98dad7f0a1
commit 98dad7f0a1
parent 2969bad152
2 changed files with 76 additions and 4 deletions
--- a/include/ghc/filesystem.hpp
+++ b/include/ghc/filesystem.hpp
@ -166,6 +166,11 @@
 // as ghc::filesystem::string_type.
 // #define GHC_WIN_WSTRING_STRING_TYPE
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 // Rais errors/exceptions when invalid unicode codepoints or UTF-8 sequences are found,
 // instead of replacing them with the unicode replacement character (U+FFFD).
 // #define GHC_RAISE_UNICODE_ERRORS
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 // ghc::filesystem version in decimal (major * 10000 + minor * 100 + patch)
 #define GHC_FILESYSTEM_VERSION 10199L
@ -1209,7 +1214,11 @@ GHC_INLINE void appendUTF8(std::string& str, uint32_t unicode)
        str.push_back(static_cast<char>((unicode & 0x3f) + 128));
    }
    else {
 #ifdef GHC_RAISE_UNICODE_ERRORS
        throw filesystem_error("Illegal code point for unicode character.", str, std::make_error_code(std::errc::illegal_byte_sequence));
 #else
        appendUTF8(str, 0xfffd);
 #endif
    }
 }
@ -1228,6 +1237,22 @@ GHC_INLINE unsigned consumeUtf8Fragment(const unsigned state, const uint8_t frag
    return state == S_RJCT ? static_cast<unsigned>(S_RJCT) : static_cast<unsigned>((utf8_state_info[category + 16] >> (state << 2)) & 0xf);
 }
 GHC_INLINE bool validUtf8(const std::string& utf8String)
 {
    std::string::const_iterator iter = utf8String.begin();
    unsigned utf8_state = S_STRT;
    std::uint32_t codepoint = 0;
    while (iter < utf8String.end()) {
        if ((utf8_state = consumeUtf8Fragment(utf8_state, (uint8_t)*iter++, codepoint)) == S_RJCT) {
            return false;
        }
    }
    if (utf8_state) {
        return false;
    }
    return true;
 }
 }  // namespace detail
 #endif
@ -1261,13 +1286,21 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
            codepoint = 0;
        }
        else if (utf8_state == S_RJCT) {
 #ifdef GHC_RAISE_UNICODE_ERRORS
            throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
 #else
            result += (typename StringType::value_type)0xfffd;
            utf8_state = S_STRT;
            codepoint = 0;
 #endif
        }
    }
    if (utf8_state) {
 #ifdef GHC_RAISE_UNICODE_ERRORS
        throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
 #else
        result += (typename StringType::value_type)0xfffd;
 #endif
    }
    return result;
 }
@ -1286,13 +1319,21 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
            codepoint = 0;
        }
        else if (utf8_state == S_RJCT) {
 #ifdef GHC_RAISE_UNICODE_ERRORS
            throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
 #else
            result += (typename StringType::value_type)0xfffd;
            utf8_state = S_STRT;
            codepoint = 0;
 #endif
        }
    }
    if (utf8_state) {
 #ifdef GHC_RAISE_UNICODE_ERRORS
        throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
 #else
        result += (typename StringType::value_type)0xfffd;
 #endif
    }
    return result;
 }
@ -1315,10 +1356,14 @@ inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicode
                appendUTF8(result, (char32_t(c) << 10) + *iter - 0x35fdc00);
            }
            else {
 #ifdef GHC_RAISE_UNICODE_ERRORS
                throw filesystem_error("Illegal code point for unicode character.", result, std::make_error_code(std::errc::illegal_byte_sequence));
 #else
                appendUTF8(result, 0xfffd);
                if(iter == unicodeString.end()) {
                    break;
                }
 #endif
            }
        }
        else {
@ -1359,6 +1404,13 @@ GHC_INLINE bool startsWith(const std::string& what, const std::string& with)
 GHC_INLINE void path::postprocess_path_with_format(path::impl_string_type& p, path::format fmt)
 {
 #ifdef GHC_RAISE_UNICODE_ERRORS
    if(!detail::validUtf8(p)) {
        path t;
        t._path = p;
        throw filesystem_error("Illegal byte sequence for unicode character.", t, std::make_error_code(std::errc::illegal_byte_sequence));
    }
 #endif
    switch (fmt) {
 #ifndef GHC_OS_WINDOWS
        case path::auto_format:
@ -4658,10 +4710,20 @@ public:
            do {
                if (FindNextFileW(_dirHandle, &_findData)) {
                    _current = _base;
                    try {
                        _current.append_name(detail::toUtf8(_findData.cFileName).c_str());
                    }
                    catch(filesystem_error& fe) {
                        ec = fe.code();
                        return;
                    }
                    copyToDirEntry(ec);
                }
                else {
                    auto err = ::GetLastError();
                    if(err != ERROR_NO_MORE_FILES) {
                        _ec = ec = std::error_code(err, std::system_category());
                    }
                    FindClose(_dirHandle);
                    _dirHandle = INVALID_HANDLE_VALUE;
                    _current = filesystem::path();
--- a/test/filesystem_test.cpp
+++ b/test/filesystem_test.cpp
@ -321,17 +321,27 @@ TEST_CASE("fs::detail::fromUtf8", "[filesystem][fs.detail.utf8]")
    CHECK(fs::detail::toUtf8(std::wstring(L"föobar")).length() == 7);
    CHECK(fs::detail::toUtf8(std::wstring(L"föobar")) == u8"föobar");
 #ifdef GHC_RAISE_UNICODE_ERRORS
    CHECK_THROWS_AS(fs::detail::fromUtf8<std::u16string>(std::string("\xed\xa0\x80")), fs::filesystem_error);
    CHECK_THROWS_AS(fs::detail::fromUtf8<std::u16string>(std::string("\xc3")), fs::filesystem_error);
 #else
    CHECK(std::u16string(2,0xfffd) == fs::detail::fromUtf8<std::u16string>(std::string("\xed\xa0\x80")));
    CHECK(std::u16string(1,0xfffd) == fs::detail::fromUtf8<std::u16string>(std::string("\xc3")));
 #endif
 }
 TEST_CASE("fs::detail::toUtf8", "[filesystem][fs.detail.utf8]")
 {
    CHECK(std::string("\xc3\xa4/\xe2\x82\xac\xf0\x9d\x84\x9e") == fs::detail::toUtf8(std::u16string(u"\u00E4/\u20AC\U0001D11E")));
    CHECK(std::string("\xEF\xBF\xBD") == fs::detail::toUtf8(std::u16string(1, 0xd800)));
    std::string t;
    CHECK(std::string("\xc3\xa4/\xe2\x82\xac\xf0\x9d\x84\x9e") == fs::detail::toUtf8(std::u16string(u"\u00E4/\u20AC\U0001D11E")));
 #ifdef GHC_RAISE_UNICODE_ERRORS
    CHECK_THROWS_AS(fs::detail::toUtf8(std::u16string(1, 0xd800)), fs::filesystem_error);
    CHECK_THROWS_AS(fs::detail::appendUTF8(t, 0x200000), fs::filesystem_error);
 #else
    CHECK(std::string("\xEF\xBF\xBD") == fs::detail::toUtf8(std::u16string(1, 0xd800)));
    fs::detail::appendUTF8(t, 0x200000);
    CHECK(std::string("\xEF\xBF\xBD") == t);
 #endif
 }
 #endif