AK: Implement Utf8CodepointIterator::peek(size_t)

This adds a peek method for Utf8CodepointIterator, which enables it to
be used in some parsing cases where peeking is necessary.

peek(0) is equivalent to operator*, except that peek() does not contain
any assertions and will just return an empty Optional<u32> when the
iterator is already done.

This also implements a test case for iterating UTF-8.
This commit is contained in:
Max Wipfli 2021-05-24 00:29:16 +02:00 committed by Andreas Kling
parent 31f6ba0952
commit 14506e8f5e
Notes: sideshowbarker 2024-07-18 17:04:53 +09:00
3 changed files with 55 additions and 0 deletions

View file

@ -240,4 +240,21 @@ u32 Utf8CodepointIterator::operator*() const
return code_point_value_so_far;
}
// Returns the code point located `offset` code points after the current one,
// without modifying this iterator. peek(0) looks at the current code point.
// Returns an empty Optional if the requested position is at or past the end.
Optional<u32> Utf8CodepointIterator::peek(size_t offset) const
{
    // Walk a copy so that the caller's position is left untouched.
    auto lookahead = *this;
    for (size_t steps_taken = 0; steps_taken < offset; ++steps_taken) {
        // Check before advancing: an iterator that is already done must never
        // be incremented, so peeking from the end yields {} instead of
        // stepping past it.
        if (lookahead.done())
            return {};
        ++lookahead;
    }
    // The target position itself may be the end; operator* must not be
    // called on a done iterator.
    if (lookahead.done())
        return {};
    return *lookahead;
}
}

View file

@ -25,6 +25,8 @@ public:
bool operator!=(const Utf8CodepointIterator&) const;
Utf8CodepointIterator& operator++();
u32 operator*() const;
// Returns the code point `offset` positions ahead of the current one without
// advancing this iterator; peek(0) looks at the current code point, like
// operator*, but reports the end via an empty Optional.
// NOTE: This returns {} if the peek is at or past EOF.
Optional<u32> peek(size_t offset = 0) const;
ssize_t operator-(const Utf8CodepointIterator& other) const
{

View file

@ -67,3 +67,39 @@ TEST_CASE(validate_invalid_ut8)
EXPECT(!utf8_4.validate(valid_bytes));
EXPECT(valid_bytes == 0);
}
// Exercises Utf8CodepointIterator: dereferencing, peek() at various offsets,
// advancing to the end, and the crash on dereferencing a finished iterator.
TEST_CASE(iterate_utf8)
{
// The view holds 25 code points: 22 ASCII characters ("Some weird characters ")
// at indices 0-21, followed by three non-ASCII code points at indices 22-24.
Utf8View view("Some weird characters \u00A9\u266A\uA755");
Utf8CodepointIterator iterator = view.begin();
EXPECT(*iterator == 'S');
// peek() defaults to offset 0, i.e. the current code point.
EXPECT(iterator.peek().has_value() && iterator.peek().value() == 'S');
EXPECT(iterator.peek(0).has_value() && iterator.peek(0).value() == 'S');
EXPECT(iterator.peek(1).has_value() && iterator.peek(1).value() == 'o');
// Index 22 is the first non-ASCII code point, index 24 the last code point overall.
EXPECT(iterator.peek(22).has_value() && iterator.peek(22).value() == 0x00A9);
EXPECT(iterator.peek(24).has_value() && iterator.peek(24).value() == 0xA755);
// Index 25 is one past the last code point, so there is nothing to peek at.
EXPECT(!iterator.peek(25).has_value());
++iterator;
EXPECT(*iterator == 'o');
// Now at index 1, so the last code point (index 24) is 23 positions ahead.
EXPECT(iterator.peek(23).has_value() && iterator.peek(23).value() == 0xA755);
// Advance from index 1 to index 24, the last code point.
for (size_t i = 0; i < 23; ++i)
++iterator;
EXPECT(!iterator.done());
EXPECT(*iterator == 0xA755);
EXPECT(iterator.peek().has_value() && iterator.peek().value() == 0xA755);
EXPECT(!iterator.peek(1).has_value());
++iterator;
EXPECT(iterator.done());
// On a finished iterator peek(0) yields an empty Optional ...
EXPECT(!iterator.peek(0).has_value());
// ... whereas operator* is expected to crash.
EXPECT_CRASH("Dereferencing Utf8CodepointIterator which is already done.", [&iterator] {
*iterator;
return Test::Crash::Failure::DidNotCrash;
});
}