//# This file is a part of toml++ and is subject to the the terms of the MIT license. //# Copyright (c) 2019-2020 Mark Gillard //# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text. // SPDX-License-Identifier: MIT #pragma once //# {{ #include "toml_preprocessor.h" #if !TOML_PARSER #error This header cannot not be included when TOML_PARSER is disabled. #endif //# }} #include "toml_utf8.h" #include "toml_parse_error.h" TOML_PUSH_WARNINGS TOML_DISABLE_PADDING_WARNINGS namespace toml::impl { template class utf8_byte_stream; inline constexpr auto utf8_byte_order_mark = "\xEF\xBB\xBF"sv; template class utf8_byte_stream> final { static_assert(sizeof(Char) == 1_sz); private: std::basic_string_view source; size_t position = {}; public: explicit constexpr utf8_byte_stream(std::basic_string_view sv) noexcept : source{ sv } { // trim trailing nulls size_t actual_len = source.length(); for (size_t i = actual_len; i --> 0_sz;) { if (source[i] != Char{}) // not '\0' { actual_len = i + 1_sz; break; } } if (source.length() != actual_len) // not '\0' source = source.substr(0_sz, actual_len); // skip bom if (source.length() >= 3_sz && memcmp(utf8_byte_order_mark.data(), source.data(), 3_sz) == 0) position += 3_sz; } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool eof() const noexcept { return position >= source.length(); } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool peek_eof() const noexcept { return eof(); } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool error() const noexcept { return false; } [[nodiscard]] constexpr unsigned int operator() () noexcept { if (position >= source.length()) return 0xFFFFFFFFu; return static_cast(static_cast(source[position++])); } }; template class utf8_byte_stream> final { static_assert(sizeof(Char) == 1_sz); private: std::basic_istream* source; public: explicit utf8_byte_stream(std::basic_istream& stream) : source{ &stream } { if (!source->good()) // eof, fail, bad return; const auto initial_pos = source->tellg(); Char bom[3]; source->read(bom, 3); if (source->bad() || (source->gcount() == 3 && memcmp(utf8_byte_order_mark.data(), bom, 3_sz) == 0)) return; source->clear(); source->seekg(initial_pos, std::ios::beg); } [[nodiscard]] TOML_ALWAYS_INLINE bool eof() const noexcept { return source->eof(); } [[nodiscard]] TOML_ALWAYS_INLINE bool peek_eof() const { using stream_traits = typename std::remove_pointer_t::traits_type; return eof() || source->peek() == stream_traits::eof(); } [[nodiscard]] TOML_ALWAYS_INLINE bool error() const noexcept { return !(*source); } [[nodiscard]] unsigned int operator() () { auto val = source->get(); if (val == std::basic_istream::traits_type::eof()) return 0xFFFFFFFFu; return static_cast(val); } }; #if TOML_LARGE_FILES TOML_ABI_NAMESPACE_START(impl_lf) #else TOML_ABI_NAMESPACE_START(impl_sf) #endif struct utf8_codepoint final { char32_t value; string_char bytes[4]; source_position position; template [[nodiscard]] TOML_ALWAYS_INLINE std::basic_string_view as_view() const noexcept { static_assert( sizeof(Char) == 1, "The string view's underlying character type must be 1 byte in size." ); return bytes[3] ? std::basic_string_view{ reinterpret_cast(bytes), 4_sz } : std::basic_string_view{ reinterpret_cast(bytes) }; } [[nodiscard]] TOML_ATTR(pure) TOML_ALWAYS_INLINE constexpr operator char32_t& () noexcept { return value; } [[nodiscard]] TOML_ATTR(pure) TOML_ALWAYS_INLINE constexpr operator const char32_t& () const noexcept { return value; } [[nodiscard]] TOML_ATTR(pure) TOML_ALWAYS_INLINE constexpr const char32_t& operator* () const noexcept { return value; } }; static_assert(std::is_trivial_v); static_assert(std::is_standard_layout_v); TOML_ABI_NAMESPACE_END // TOML_LARGE_FILES #if TOML_EXCEPTIONS #define TOML_ERROR_CHECK (void)0 #define TOML_ERROR throw parse_error TOML_ABI_NAMESPACE_START(impl_ex) #else #define TOML_ERROR_CHECK if (err) return nullptr #define TOML_ERROR err.emplace TOML_ABI_NAMESPACE_START(impl_noex) #endif TOML_PUSH_WARNINGS TOML_DISABLE_VTABLE_WARNINGS struct TOML_INTERFACE utf8_reader_interface { [[nodiscard]] virtual const source_path_ptr& source_path() const noexcept = 0; [[nodiscard]] virtual const utf8_codepoint* read_next() = 0; [[nodiscard]] virtual bool peek_eof() const = 0; #if !TOML_EXCEPTIONS [[nodiscard]] virtual optional&& error() noexcept = 0; #endif virtual ~utf8_reader_interface() noexcept = default; }; template class TOML_EMPTY_BASES utf8_reader final : public utf8_reader_interface { private: utf8_byte_stream stream; utf8_decoder decoder; utf8_codepoint codepoints[2]; size_t cp_idx = 1; uint8_t current_byte_count{}; source_path_ptr source_path_; #if !TOML_EXCEPTIONS optional err; #endif public: template explicit utf8_reader(U && source, String&& source_path = {}) noexcept(std::is_nothrow_constructible_v, U&&>) : stream{ std::forward(source) } { std::memset(codepoints, 0, sizeof(codepoints)); codepoints[0].position = { 1, 1 }; codepoints[1].position = { 1, 1 }; if (!source_path.empty()) source_path_ = std::make_shared(std::forward(source_path)); } [[nodiscard]] const source_path_ptr& source_path() const noexcept override { return source_path_; } [[nodiscard]] const utf8_codepoint* read_next() override { TOML_ERROR_CHECK; auto& prev = codepoints[(cp_idx - 1_sz) % 2_sz]; if (stream.eof()) return nullptr; else if (stream.error()) TOML_ERROR("An error occurred while reading from the underlying stream", prev.position, source_path_ ); else if (decoder.error()) TOML_ERROR( "Encountered invalid utf-8 sequence", prev.position, source_path_ ); TOML_ERROR_CHECK; while (true) { uint8_t next_byte; { unsigned int next_byte_raw{ 0xFFFFFFFFu }; if constexpr (noexcept(stream()) || !TOML_EXCEPTIONS) { next_byte_raw = stream(); } #if TOML_EXCEPTIONS else { try { next_byte_raw = stream(); } catch (const std::exception& exc) { throw parse_error{ exc.what(), prev.position, source_path_ }; } catch (...) { throw parse_error{ "An unspecified error occurred", prev.position, source_path_ }; } } #endif if (next_byte_raw >= 256u) { if (stream.eof()) { if (decoder.needs_more_input()) TOML_ERROR("Encountered EOF during incomplete utf-8 code point sequence", prev.position, source_path_); return nullptr; } else TOML_ERROR("An error occurred while reading from the underlying stream", prev.position, source_path_); } TOML_ERROR_CHECK; next_byte = static_cast(next_byte_raw); } decoder(next_byte); if (decoder.error()) TOML_ERROR( "Encountered invalid utf-8 sequence", prev.position, source_path_ ); TOML_ERROR_CHECK; auto& current = codepoints[cp_idx % 2_sz]; current.bytes[current_byte_count++] = static_cast(next_byte); if (decoder.has_code_point()) { //store codepoint current.value = decoder.codepoint; //reset prev (will be the next 'current') std::memset(prev.bytes, 0, sizeof(prev.bytes)); current_byte_count = {}; if (is_line_break(current.value)) prev.position = { static_cast(current.position.line + 1), 1 }; else prev.position = { current.position.line, static_cast(current.position.column + 1) }; cp_idx++; return ¤t; } } TOML_UNREACHABLE; } [[nodiscard]] bool peek_eof() const override { return stream.peek_eof(); } #if !TOML_EXCEPTIONS [[nodiscard]] optional&& error() noexcept override { return std::move(err); } #endif }; template utf8_reader(std::basic_string_view, std::string_view) -> utf8_reader>; template utf8_reader(std::basic_istream&, std::string_view) -> utf8_reader>; template utf8_reader(std::basic_string_view, std::string&&) -> utf8_reader>; template utf8_reader(std::basic_istream&, std::string&&) -> utf8_reader>; #if !TOML_EXCEPTIONS #undef TOML_ERROR_CHECK #define TOML_ERROR_CHECK if (reader.error()) return nullptr #endif class TOML_EMPTY_BASES utf8_buffered_reader final : public utf8_reader_interface { public: static constexpr size_t max_history_length = 72; private: static constexpr size_t history_buffer_size = max_history_length - 1; //'head' is stored in the reader utf8_reader_interface& reader; struct { utf8_codepoint buffer[history_buffer_size]; size_t count, first; } history = {}; const utf8_codepoint* head = {}; size_t negative_offset = {}; public: explicit utf8_buffered_reader(utf8_reader_interface& reader_) noexcept : reader{ reader_ } {} [[nodiscard]] const source_path_ptr& source_path() const noexcept override { return reader.source_path(); } [[nodiscard]] const utf8_codepoint* read_next() override { TOML_ERROR_CHECK; if (negative_offset) { negative_offset--; // an entry negative offset of 1 just means "replay the current head" if (!negative_offset) return head; // otherwise step back into the history buffer else return history.buffer + ((history.first + history.count - negative_offset) % history_buffer_size); } else { // first character read from stream if TOML_UNLIKELY(!history.count && !head) head = reader.read_next(); // subsequent characters and not eof else if (head) { if TOML_UNLIKELY(history.count < history_buffer_size) history.buffer[history.count++] = *head; else history.buffer[(history.first++ + history_buffer_size) % history_buffer_size] = *head; head = reader.read_next(); } return head; } } [[nodiscard]] const utf8_codepoint* step_back(size_t count) noexcept { TOML_ERROR_CHECK; TOML_ASSERT(history.count); TOML_ASSERT(negative_offset + count <= history.count); negative_offset += count; return negative_offset ? history.buffer + ((history.first + history.count - negative_offset) % history_buffer_size) : head; } [[nodiscard]] bool peek_eof() const override { return reader.peek_eof(); } #if !TOML_EXCEPTIONS [[nodiscard]] optional&& error() noexcept override { return reader.error(); } #endif }; #undef TOML_ERROR_CHECK #undef TOML_ERROR TOML_ABI_NAMESPACE_END // TOML_EXCEPTIONS TOML_POP_WARNINGS } TOML_POP_WARNINGS // TOML_DISABLE_PADDING_WARNINGS