From ed6115361006240e3c7b02295599e4534cc55a13 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Sat, 19 Oct 2013 08:57:20 +0200 Subject: Update internal Boost subset to 1.54.0 --- .../boost/regex/pending/unicode_iterator.hpp | 102 +++++++++++++++++++-- 1 file changed, 93 insertions(+), 9 deletions(-) (limited to 'cutl/details/boost/regex/pending/unicode_iterator.hpp') diff --git a/cutl/details/boost/regex/pending/unicode_iterator.hpp b/cutl/details/boost/regex/pending/unicode_iterator.hpp index 2ab88c3..467afc7 100644 --- a/cutl/details/boost/regex/pending/unicode_iterator.hpp +++ b/cutl/details/boost/regex/pending/unicode_iterator.hpp @@ -82,16 +82,16 @@ static const ::cutl_details_boost::uint32_t ten_bit_mask = 0x3FFu; inline bool is_high_surrogate(::cutl_details_boost::uint16_t v) { - return (v & 0xFC00u) == 0xd800u; + return (v & 0xFFFFFC00u) == 0xd800u; } inline bool is_low_surrogate(::cutl_details_boost::uint16_t v) { - return (v & 0xFC00u) == 0xdc00u; + return (v & 0xFFFFFC00u) == 0xdc00u; } template inline bool is_surrogate(T v) { - return (v & 0xF800u) == 0xd800; + return (v & 0xFFFFF800u) == 0xd800; } inline unsigned utf8_byte_count(cutl_details_boost::uint8_t c) @@ -113,6 +113,10 @@ inline unsigned utf8_trailing_byte_count(cutl_details_boost::uint8_t c) return utf8_byte_count(c) - 1; } +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable:4100) +#endif inline void invalid_utf32_code_point(::cutl_details_boost::uint32_t val) { #ifndef BOOST_NO_STD_LOCALE @@ -124,6 +128,9 @@ inline void invalid_utf32_code_point(::cutl_details_boost::uint32_t val) #endif cutl_details_boost::throw_exception(e); } +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif } // namespace detail @@ -296,6 +303,34 @@ public: { m_value = pending_read; } + // + // Range checked version: + // + u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b) + { + m_value = pending_read; + // + // The range must not start with a low surrogate, or end in a high surrogate, + // otherwise we run the risk of running outside the underlying input range. + // Likewise b must not be located at a low surrogate. + // + cutl_details_boost::uint16_t val; + if(start != end) + { + if((b != start) && (b != end)) + { + val = *b; + if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u)) + invalid_code_point(val); + } + val = *start; + if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u)) + invalid_code_point(val); + val = *--end; + if(detail::is_high_surrogate(val)) + invalid_code_point(val); + } + } private: static void invalid_code_point(::cutl_details_boost::uint16_t val) { @@ -485,9 +520,26 @@ public: } void increment() { + // We must not start with a continuation character: + if((static_cast(*m_position) & 0xC0) == 0x80) + invalid_sequence(); // skip high surrogate first if there is one: unsigned c = detail::utf8_byte_count(*m_position); - std::advance(m_position, c); + if(m_value == pending_read) + { + // Since we haven't read in a value, we need to validate the code points: + for(unsigned i = 0; i < c; ++i) + { + ++m_position; + // We must have a continuation byte: + if((i != c - 1) && ((static_cast(*m_position) & 0xC0) != 0x80)) + invalid_sequence(); + } + } + else + { + std::advance(m_position, c); + } m_value = pending_read; } void decrement() @@ -497,7 +549,7 @@ public: while((*--m_position & 0xC0u) == 0x80u) ++count; // now check that the sequence was valid: if(count != detail::utf8_trailing_byte_count(*m_position)) - invalid_sequnce(); + invalid_sequence(); m_value = pending_read; } BaseIterator base()const @@ -513,8 +565,37 @@ public: { m_value = pending_read; } + // + // Checked constructor: + // + u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b) + { + m_value = pending_read; + // + // We must not start with a continuation character, or end with a + // truncated UTF-8 sequence otherwise we run the risk of going past + // the start/end of the underlying sequence: + // + if(start != end) + { + unsigned char v = *start; + if((v & 0xC0u) == 0x80u) + invalid_sequence(); + if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u)) + invalid_sequence(); + BaseIterator pos = end; + do + { + v = *--pos; + } + while((start != pos) && ((v & 0xC0u) == 0x80u)); + std::ptrdiff_t extra = detail::utf8_byte_count(v); + if(std::distance(pos, end) < extra) + invalid_sequence(); + } + } private: - static void invalid_sequnce() + static void invalid_sequence() { std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character"); cutl_details_boost::throw_exception(e); @@ -524,8 +605,8 @@ private: m_value = static_cast(static_cast< ::cutl_details_boost::uint8_t>(*m_position)); // we must not have a continuation character: if((m_value & 0xC0u) == 0x80u) - invalid_sequnce(); - // see how many extra byts we have: + invalid_sequence(); + // see how many extra bytes we have: unsigned extra = detail::utf8_trailing_byte_count(*m_position); // extract the extra bits, 6 from each extra byte: BaseIterator next(m_position); @@ -533,6 +614,9 @@ private: { ++next; m_value <<= 6; + // We must have a continuation byte: + if((static_cast(*next) & 0xC0) != 0x80) + invalid_sequence(); m_value += static_cast(*next) & 0x3Fu; } // we now need to remove a few of the leftmost bits, but how many depends @@ -547,7 +631,7 @@ private: m_value &= masks[extra]; // check the result: if(m_value > static_cast(0x10FFFFu)) - invalid_sequnce(); + invalid_sequence(); } BaseIterator m_position; mutable U32Type m_value; -- cgit v1.1