LibLocale: Implement an ICU-based text segmenter

Our current segmenter implementation lives in LibUnicode, and is not
locale-aware. We will need such awareness for ECMA-402, and so LibLocale
will be the new home for text segmentation.

The tests here are ported directly from LibUnicode/TestSegmentation.cpp.
This commit is contained in:
Timothy Flynn 2024-06-18 18:46:23 -04:00 committed by Andreas Kling
parent 5cf818e305
commit 3fe0a27fbd
Notes: sideshowbarker 2024-07-17 06:00:02 +09:00
6 changed files with 433 additions and 0 deletions

View file

@ -1,6 +1,7 @@
set(TEST_SOURCES
TestDisplayNames.cpp
TestLocale.cpp
TestSegmenter.cpp
)
foreach(source IN LISTS TEST_SOURCES)

View file

@ -0,0 +1,128 @@
/*
* Copyright (c) 2023-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <AK/Array.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <AK/Vector.h>
#include <LibLocale/Segmenter.h>
// Runs a grapheme-granularity segmenter over `string` and expects the reported
// boundaries (UTF-8 byte offsets) to equal `expected_boundaries`, in order.
template<size_t N>
static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
{
    auto text = MUST(String::from_utf8(string));
    auto grapheme_segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);

    // Collect every boundary; never stop the iteration early.
    Vector<size_t> boundaries;
    grapheme_segmenter->for_each_boundary(text, [&boundaries](size_t offset) {
        boundaries.append(offset);
        return IterationDecision::Continue;
    });

    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
}
// Grapheme cluster boundaries for ASCII, line endings, Hangul jamo/syllables and emoji.
// All expected values are UTF-8 byte offsets into the input.
TEST_CASE(grapheme_segmentation)
{
    // An empty string must not report any boundary at all.
    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);
    segmenter->for_each_boundary(String {}, [&](auto i) {
        dbgln("{}", i);
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_grapheme_segmentation("a"sv, { 0u, 1u });
    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });

    // "\r\n" is a single cluster, whereas "\n\r" is two.
    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });

    // Hangul: each code point below is three UTF-8 bytes.
    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });

    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // The emoji sequences below depend on invisible joiners, so they are spelled with escapes
    // to keep the literal's byte length auditable against the expected offsets.

    // MAN, ZWJ, WOMAN, ZWJ, GIRL, ZWJ, BOY: 4 * 4 + 3 * 3 = 25 bytes.
    test_grapheme_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });

    // WOMAN + skin tone, ZWJ, HEAVY BLACK HEART + VS16, ZWJ, MAN + skin tone: 28 bytes.
    test_grapheme_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });
}
// Grapheme segmentation of Indic consonant sequences: two consonants joined through U+094D,
// with any mix of U+09BC / U+09CD around it, are expected to form a single cluster.
// Every code point used here encodes to three UTF-8 bytes, so all offsets are multiples of 3.
TEST_CASE(grapheme_segmentation_indic_conjunct_break)
{
// Without U+094D between them, each consonant is its own cluster.
test_grapheme_segmentation("\u0915"sv, { 0u, 3u });
test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });
// With U+094D present, the whole sequence is one cluster regardless of where the
// U+09BC U+09CD pairs sit relative to it.
test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
}
// Runs a word-granularity segmenter over `string` and expects the reported
// boundaries (UTF-8 byte offsets) to equal `expected_boundaries`, in order.
template<size_t N>
static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
{
    auto text = MUST(String::from_utf8(string));
    auto word_segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);

    // Collect every boundary; never stop the iteration early.
    Vector<size_t> boundaries;
    word_segmenter->for_each_boundary(text, [&boundaries](size_t offset) {
        boundaries.append(offset);
        return IterationDecision::Continue;
    });

    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
}
// Word boundaries for plain text, whitespace runs, line endings, emoji, numbers and punctuation.
// All expected values are UTF-8 byte offsets into the input.
TEST_CASE(word_segmentation)
{
    // An empty string must not report any boundary at all.
    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);
    segmenter->for_each_boundary(String {}, [&](auto) {
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_word_segmentation("a"sv, { 0u, 1u });
    test_word_segmentation("ab"sv, { 0u, 2u });
    test_word_segmentation("abc"sv, { 0u, 3u });

    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
    // Two consecutive spaces form a single whitespace segment (bytes 2..4).
    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
    // "\r\n" is kept together, whereas "\n\r" is split.
    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });

    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // The emoji sequences below depend on invisible joiners, so they are spelled with escapes
    // to keep the literal's byte length auditable against the expected offsets.

    // MAN, ZWJ, WOMAN, ZWJ, GIRL, ZWJ, BOY: 4 * 4 + 3 * 3 = 25 bytes.
    test_word_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });

    // WOMAN + skin tone, ZWJ, HEAVY BLACK HEART + VS16, ZWJ, MAN + skin tone: 28 bytes.
    test_word_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });

    // Digits, decimal points and mid-word punctuation stay inside a single word.
    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });

    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });

    // The curly quotes and the typographic apostrophe (U+2019, escaped) are three bytes each,
    // which is why "can\u2019t" spans bytes 28..35.
    test_word_segmentation(
        "The quick (“brown”) fox can\u2019t jump 32.3 feet, right?"sv,
        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
}

View file

@ -8,6 +8,7 @@ set(SOURCES
NumberFormat.cpp
PluralRules.cpp
RelativeTimeFormat.cpp
Segmenter.cpp
UnicodeKeywords.cpp
)

View file

@ -17,6 +17,7 @@ enum class Style;
enum class Weekday;
class NumberFormat;
class Segmenter;
struct CalendarPattern;
struct Keyword;

View file

@ -0,0 +1,240 @@
/*
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#define AK_DONT_REPLACE_STD
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <LibLocale/ICU.h>
#include <LibLocale/Locale.h>
#include <LibLocale/Segmenter.h>
#include <unicode/brkiter.h>
#include <unicode/utext.h>
#include <unicode/utf8.h>
namespace Locale {
// Maps the lowercase granularity names "grapheme", "sentence" and "word" to their enumerators.
// Any other input is a programming error and trips VERIFY_NOT_REACHED().
SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
{
    struct NamedGranularity {
        StringView name;
        SegmenterGranularity granularity;
    };

    NamedGranularity const known_granularities[] = {
        { "grapheme"sv, SegmenterGranularity::Grapheme },
        { "sentence"sv, SegmenterGranularity::Sentence },
        { "word"sv, SegmenterGranularity::Word },
    };

    for (auto const& candidate : known_granularities) {
        if (segmenter_granularity == candidate.name)
            return candidate.granularity;
    }

    VERIFY_NOT_REACHED();
}
// Returns the lowercase name of a granularity ("grapheme", "sentence" or "word").
// A value outside the enumeration trips VERIFY_NOT_REACHED().
StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
{
    if (segmenter_granularity == SegmenterGranularity::Grapheme)
        return "grapheme"sv;

    if (segmenter_granularity == SegmenterGranularity::Sentence)
        return "sentence"sv;

    if (segmenter_granularity == SegmenterGranularity::Word)
        return "word"sv;

    VERIFY_NOT_REACHED();
}
// ICU-backed implementation of Locale::Segmenter.
//
// The segmenter owns both the ICU break iterator and the text it iterates over: ICU only keeps a
// reference to the text handed to setText(), so the text is stored in m_segmented_text for as long
// as the iterator uses it. Boundaries are expressed in the units of the stored text: UTF-8 byte
// offsets for String, UTF-16 code unit offsets for Utf16View.
class SegmenterImpl : public Segmenter {
public:
    SegmenterImpl(NonnullOwnPtr<icu::BreakIterator> segmenter, SegmenterGranularity segmenter_granularity)
        : Segmenter(segmenter_granularity)
        , m_segmenter(move(segmenter))
    {
    }

    virtual ~SegmenterImpl() override = default;

    // Clones the ICU iterator. m_segmented_text is not copied here.
    // NOTE(review): callers presumably set new text on the clone before querying boundaries,
    //               since align_boundary() requires m_segmented_text to be non-empty - confirm.
    virtual NonnullOwnPtr<Segmenter> clone() const override
    {
        return make<SegmenterImpl>(adopt_own(*m_segmenter->clone()), m_segmenter_granularity);
    }

    // Takes ownership of `text` and points the ICU iterator at its UTF-8 bytes.
    virtual void set_segmented_text(String text) override
    {
        UErrorCode status = U_ZERO_ERROR;

        // Store the text first so that the UText below views memory owned by this object.
        m_segmented_text = move(text);
        auto view = m_segmented_text.get<String>().bytes_as_string_view();

        UText utext = UTEXT_INITIALIZER;
        utext_openUTF8(&utext, view.characters_without_null_termination(), static_cast<i64>(view.length()), &status);
        VERIFY(icu_success(status));

        m_segmenter->setText(&utext, status);
        VERIFY(icu_success(status));

        utext_close(&utext);
    }

    // Copies `text` into an icu::UnicodeString owned by this object and points the iterator at it.
    virtual void set_segmented_text(Utf16View const& text) override
    {
        m_segmented_text = icu::UnicodeString { text.data(), static_cast<i32>(text.length_in_code_units()) };
        m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
    }

    // Returns the iterator's current position.
    virtual size_t current_boundary() override
    {
        return m_segmenter->current();
    }

    // Returns the closest boundary before `boundary`, or `boundary` itself (after alignment to a
    // code point start) when `inclusive` is Yes and it is a boundary. Empty if there is none.
    virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
    {
        auto icu_boundary = align_boundary(boundary);

        if (inclusive == Inclusive::Yes) {
            if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
                return static_cast<size_t>(icu_boundary);
        }

        if (auto index = m_segmenter->preceding(icu_boundary); index != icu::BreakIterator::DONE)
            return static_cast<size_t>(index);

        return {};
    }

    // Returns the closest boundary after `boundary`, or `boundary` itself (after alignment to a
    // code point start) when `inclusive` is Yes and it is a boundary. Empty if there is none.
    virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
    {
        auto icu_boundary = align_boundary(boundary);

        if (inclusive == Inclusive::Yes) {
            if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
                return static_cast<size_t>(icu_boundary);
        }

        if (auto index = m_segmenter->following(icu_boundary); index != icu::BreakIterator::DONE)
            return static_cast<size_t>(index);

        return {};
    }

    // Invokes `callback` with each boundary of `text` as a UTF-8 byte offset.
    // Empty text produces no callbacks and leaves the previously segmented text in place.
    virtual void for_each_boundary(String text, SegmentationCallback callback) override
    {
        if (text.is_empty())
            return;

        set_segmented_text(move(text));
        for_each_boundary(move(callback));
    }

    // Invokes `callback` with each boundary of `text` as a UTF-16 code unit offset.
    // Empty text produces no callbacks and leaves the previously segmented text in place.
    virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
    {
        if (text.is_empty())
            return;

        set_segmented_text(text);
        for_each_boundary(move(callback));
    }

    // Invokes `callback` with each boundary of `text` as a code point index.
    // Empty text produces no callbacks and leaves the previously segmented text in place.
    virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
    {
        if (text.is_empty())
            return;

        // FIXME: We should be able to create a custom UText provider to avoid converting to UTF-8 here.
        set_segmented_text(MUST(String::formatted("{}", text)));

        auto code_points = m_segmented_text.get<String>().code_points();
        auto current = code_points.begin();
        size_t code_point_index = 0;

        // ICU reports byte offsets into the UTF-8 copy. Boundaries arrive in increasing order, so a
        // single forward walk over the code points translates them into code point indices.
        for_each_boundary([&](auto index) {
            auto it = code_points.iterator_at_byte_offset(index);

            while (current != it) {
                ++code_point_index;
                ++current;
            }

            return callback(code_point_index);
        });
    }

    // Whether the rule status of the most recent boundary falls in one of ICU's number, letter,
    // kana or ideograph word ranges.
    virtual bool is_current_boundary_word_like() const override
    {
        auto status = m_segmenter->getRuleStatus();

        if (status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)
            return true;
        if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT)
            return true;
        if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT)
            return true;
        if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT)
            return true;

        return false;
    }

private:
    // Converts a caller-provided offset into an ICU offset that sits on a code point start.
    //
    // Offsets at or beyond the end of the text are clamped to the text length. Without the clamp,
    // U8_SET_CP_START would read the byte at index `boundary` (past the end of the buffer), and
    // icu::UnicodeString::getChar32Start would return 0 for an out-of-range offset, turning an
    // end-of-text query into a start-of-text query.
    i32 align_boundary(size_t boundary)
    {
        return m_segmented_text.visit(
            [&](String const& text) -> i32 {
                auto bytes = text.bytes();
                if (boundary >= bytes.size())
                    return static_cast<i32>(bytes.size());

                auto icu_boundary = static_cast<i32>(boundary);
                U8_SET_CP_START(bytes.data(), 0, icu_boundary);
                return icu_boundary;
            },
            [&](icu::UnicodeString const& text) -> i32 {
                if (boundary >= static_cast<size_t>(text.length()))
                    return text.length();

                return text.getChar32Start(static_cast<i32>(boundary));
            },
            [](Empty) -> i32 { VERIFY_NOT_REACHED(); });
    }

    // Walks the currently set text from its first boundary onwards until ICU reports DONE or the
    // callback asks to stop.
    void for_each_boundary(SegmentationCallback callback)
    {
        if (callback(static_cast<size_t>(m_segmenter->first())) == IterationDecision::Break)
            return;

        while (true) {
            auto index = m_segmenter->next();
            if (index == icu::BreakIterator::DONE)
                return;

            if (callback(static_cast<size_t>(index)) == IterationDecision::Break)
                return;
        }
    }

    NonnullOwnPtr<icu::BreakIterator> m_segmenter;

    // Backing storage for the text the ICU iterator currently refers to.
    Variant<Empty, String, icu::UnicodeString> m_segmented_text;
};
// Creates a segmenter of the requested granularity for the default locale.
NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
{
    auto locale = default_locale();
    return Segmenter::create(locale, segmenter_granularity);
}
// Creates a segmenter of the requested granularity for `locale`.
// The locale must be known to LocaleData and ICU must succeed in creating the break iterator;
// both conditions are enforced with VERIFY.
NonnullOwnPtr<Segmenter> Segmenter::create(StringView locale, SegmenterGranularity segmenter_granularity)
{
    UErrorCode status = U_ZERO_ERROR;

    auto locale_data = LocaleData::for_locale(locale);
    VERIFY(locale_data.has_value());

    // Picks the ICU factory matching the requested granularity.
    auto create_break_iterator = [&]() -> icu::BreakIterator* {
        switch (segmenter_granularity) {
        case SegmenterGranularity::Grapheme:
            return icu::BreakIterator::createCharacterInstance(locale_data->locale(), status);
        case SegmenterGranularity::Sentence:
            return icu::BreakIterator::createSentenceInstance(locale_data->locale(), status);
        case SegmenterGranularity::Word:
            return icu::BreakIterator::createWordInstance(locale_data->locale(), status);
        }
        VERIFY_NOT_REACHED();
    };

    auto break_iterator = adopt_own_if_nonnull(create_break_iterator());
    VERIFY(icu_success(status));

    return make<SegmenterImpl>(break_iterator.release_nonnull(), segmenter_granularity);
}
}

View file

@ -0,0 +1,62 @@
/*
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Function.h>
#include <AK/NonnullOwnPtr.h>
#include <AK/Optional.h>
#include <AK/StringView.h>
namespace Locale {
// The unit of text a Segmenter splits its input into.
enum class SegmenterGranularity {
// Grapheme clusters (user-perceived characters).
Grapheme,
// Sentences.
Sentence,
// Words, including the whitespace and punctuation segments between them.
Word,
};
// Converts between SegmenterGranularity and its lowercase name ("grapheme", "sentence", "word").
// Passing an unrecognized string or enumerator is a programming error (VERIFY_NOT_REACHED).
SegmenterGranularity segmenter_granularity_from_string(StringView);
StringView segmenter_granularity_to_string(SegmenterGranularity);
// Abstract text segmenter. Instances are obtained through create() and report boundaries between
// segments of the configured granularity.
//
// Boundary offsets are expressed in the units of the text that was supplied: UTF-8 byte offsets
// for String, UTF-16 code unit offsets for Utf16View, and code point indices for Utf32View.
class Segmenter {
public:
// Creates a segmenter for the default locale.
static NonnullOwnPtr<Segmenter> create(SegmenterGranularity segmenter_granularity);
// Creates a segmenter for the given locale.
static NonnullOwnPtr<Segmenter> create(StringView locale, SegmenterGranularity segmenter_granularity);
virtual ~Segmenter() = default;
// The granularity this segmenter was created with.
SegmenterGranularity segmenter_granularity() const { return m_segmenter_granularity; }
// Creates a new segmenter with the same granularity.
virtual NonnullOwnPtr<Segmenter> clone() const = 0;
// Sets the text that the boundary queries below operate on.
virtual void set_segmented_text(String) = 0;
virtual void set_segmented_text(Utf16View const&) = 0;
// The boundary the segmenter is currently positioned at.
virtual size_t current_boundary() = 0;
// Whether a boundary query may return the queried index itself when it is a boundary.
enum class Inclusive {
No,
Yes,
};
// Find the nearest boundary before / after `index` in the segmented text.
// An empty Optional means there is no such boundary.
virtual Optional<size_t> previous_boundary(size_t index, Inclusive = Inclusive::No) = 0;
virtual Optional<size_t> next_boundary(size_t index, Inclusive = Inclusive::No) = 0;
// Receives one boundary per call; return IterationDecision::Break to stop iterating.
using SegmentationCallback = Function<IterationDecision(size_t)>;
// Set the given text as the segmented text and invoke the callback for each of its boundaries
// in increasing order. Empty text results in no callbacks.
virtual void for_each_boundary(String, SegmentationCallback) = 0;
virtual void for_each_boundary(Utf16View const&, SegmentationCallback) = 0;
virtual void for_each_boundary(Utf32View const&, SegmentationCallback) = 0;
// Whether the segment ending at the current boundary is word-like (as opposed to, for example,
// whitespace or punctuation). NOTE(review): meaningful for word granularity - confirm for others.
virtual bool is_current_boundary_word_like() const = 0;
protected:
explicit Segmenter(SegmenterGranularity segmenter_granularity)
: m_segmenter_granularity(segmenter_granularity)
{
}
SegmenterGranularity m_segmenter_granularity { SegmenterGranularity::Grapheme };
};
}