mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-09-29 16:21:29 +00:00
LibLocale: Implement an ICU-based text segmenter
Our current segmenter implementation lives in LibUnicode, and is not locale-aware. We will need such awareness for ECMA-402, and so LibLocale will be the new home for text segmentation. The tests here are ported directly from LibUnicode/TestSegmentation.cpp.
This commit is contained in:
parent
5cf818e305
commit
3fe0a27fbd
Notes:
sideshowbarker
2024-07-17 06:00:02 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/3fe0a27fbd Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/218
|
@ -1,6 +1,7 @@
|
|||
set(TEST_SOURCES
|
||||
TestDisplayNames.cpp
|
||||
TestLocale.cpp
|
||||
TestSegmenter.cpp
|
||||
)
|
||||
|
||||
foreach(source IN LISTS TEST_SOURCES)
|
||||
|
|
128
Tests/LibLocale/TestSegmenter.cpp
Normal file
128
Tests/LibLocale/TestSegmenter.cpp
Normal file
|
@ -0,0 +1,128 @@
|
|||
/*
|
||||
* Copyright (c) 2023-2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <LibTest/TestCase.h>
|
||||
|
||||
#include <AK/Array.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibLocale/Segmenter.h>
|
||||
|
||||
template<size_t N>
|
||||
static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
|
||||
{
|
||||
Vector<size_t> boundaries;
|
||||
auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);
|
||||
|
||||
segmenter->for_each_boundary(MUST(String::from_utf8(string)), [&](auto boundary) {
|
||||
boundaries.append(boundary);
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
|
||||
}
|
||||
|
||||
// Grapheme-cluster boundaries for ASCII, line endings, Hangul jamo/syllables and emoji sequences.
// All expected values are UTF-8 byte offsets.
TEST_CASE(grapheme_segmentation)
{
    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);

    // An empty string has no boundaries at all, so the callback must never run.
    segmenter->for_each_boundary(String {}, [&](auto i) {
        dbgln("{}", i);
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_grapheme_segmentation("a"sv, { 0u, 1u });
    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });

    // "\r\n" is a single cluster, whereas "\n\r" is two.
    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });

    // Hangul: each of these code points is 3 bytes in UTF-8; a leading jamo joins with what follows it.
    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });

    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // The sequences below rely on invisible joiners (U+200D ZWJ, U+FE0F VS16). They are spelled out as
    // escapes so that the byte counts behind the expected offsets are visible and cannot be lost by an
    // editor or a copy/paste.
    //
    // Family: MAN ZWJ WOMAN ZWJ GIRL ZWJ BOY = 4 * 4 bytes + 3 * 3 bytes = 25 bytes.
    test_grapheme_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });
    // Couple with heart: WOMAN + skin tone, ZWJ, HEAVY BLACK HEART + VS16, ZWJ, MAN + skin tone
    // = 8 + 3 + 6 + 3 + 8 = 28 bytes.
    test_grapheme_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });
}
|
||||
|
||||
// Grapheme boundaries around Indic conjuncts. Every code point used here occupies 3 bytes in UTF-8,
// so an N-code-point string that forms a single cluster ends at byte offset 3 * N.
//
// NOTE(review): U+0915 / U+0916 are presumably the consonants and U+094D / U+09BC / U+09CD the
// combining marks that keep them in one cluster - confirm against the Unicode character database.
TEST_CASE(grapheme_segmentation_indic_conjunct_break)
{
    // A lone consonant, a consonant followed by ASCII, and two consonants with nothing joining them.
    test_grapheme_segmentation("\u0915"sv, { 0u, 3u });
    test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
    test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });

    // Three code points: the two consonants are joined by U+094D into one cluster.
    test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });

    // Five code points, with one U+09BC U+09CD pair before or after U+094D.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });

    // Seven code points, with two U+09BC U+09CD pairs in every arrangement around U+094D.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });

    // Nine code points, with three pairs.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });

    // Eleven code points, with four pairs.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
}
|
||||
|
||||
template<size_t N>
|
||||
static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
|
||||
{
|
||||
Vector<size_t> boundaries;
|
||||
auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);
|
||||
|
||||
segmenter->for_each_boundary(MUST(String::from_utf8(string)), [&](auto boundary) {
|
||||
boundaries.append(boundary);
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
|
||||
}
|
||||
|
||||
// Word boundaries for ASCII words, whitespace runs, line endings, emoji, numbers, host names,
// apostrophes and quotation marks. All expected values are UTF-8 byte offsets.
TEST_CASE(word_segmentation)
{
    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);

    // An empty string has no boundaries at all, so the callback must never run.
    segmenter->for_each_boundary(String {}, [&](auto) {
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_word_segmentation("a"sv, { 0u, 1u });
    test_word_segmentation("ab"sv, { 0u, 2u });
    test_word_segmentation("abc"sv, { 0u, 3u });

    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
    // Two consecutive spaces form a single segment (bytes 2..4); the string is 6 bytes long.
    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
    // "\r\n" is a single segment, whereas "\n\r" is two.
    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });

    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // The sequences below rely on invisible joiners (U+200D ZWJ, U+FE0F VS16). They are spelled out as
    // escapes so that the byte counts behind the expected offsets are visible and cannot be lost by an
    // editor or a copy/paste.
    //
    // Family: MAN ZWJ WOMAN ZWJ GIRL ZWJ BOY = 4 * 4 bytes + 3 * 3 bytes = 25 bytes.
    test_word_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });
    // Couple with heart: WOMAN + skin tone, ZWJ, HEAVY BLACK HEART + VS16, ZWJ, MAN + skin tone
    // = 8 + 3 + 6 + 3 + 8 = 28 bytes.
    test_word_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });

    // Digits, decimal numbers and dotted host names stay together as one word.
    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });

    // An apostrophe inside a word does not split it; surrounding quotes are their own segments.
    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });

    // The curly quotes and the curly apostrophe are 3 bytes each in UTF-8.
    test_word_segmentation(
        "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv,
        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
}
|
|
@ -8,6 +8,7 @@ set(SOURCES
|
|||
NumberFormat.cpp
|
||||
PluralRules.cpp
|
||||
RelativeTimeFormat.cpp
|
||||
Segmenter.cpp
|
||||
UnicodeKeywords.cpp
|
||||
)
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ enum class Style;
|
|||
enum class Weekday;
|
||||
|
||||
class NumberFormat;
|
||||
class Segmenter;
|
||||
|
||||
struct CalendarPattern;
|
||||
struct Keyword;
|
||||
|
|
240
Userland/Libraries/LibLocale/Segmenter.cpp
Normal file
240
Userland/Libraries/LibLocale/Segmenter.cpp
Normal file
|
@ -0,0 +1,240 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#define AK_DONT_REPLACE_STD
|
||||
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <LibLocale/ICU.h>
|
||||
#include <LibLocale/Locale.h>
|
||||
#include <LibLocale/Segmenter.h>
|
||||
|
||||
#include <unicode/brkiter.h>
|
||||
#include <unicode/utext.h>
|
||||
#include <unicode/utf8.h>
|
||||
|
||||
namespace Locale {
|
||||
|
||||
// Maps the textual granularity name ("grapheme", "sentence" or "word") to its enumerator.
// Any other input is a programming error and trips VERIFY_NOT_REACHED().
SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
{
    struct NamedGranularity {
        StringView name;
        SegmenterGranularity granularity;
    };

    static constexpr NamedGranularity known_granularities[] = {
        { "grapheme"sv, SegmenterGranularity::Grapheme },
        { "sentence"sv, SegmenterGranularity::Sentence },
        { "word"sv, SegmenterGranularity::Word },
    };

    for (auto const& candidate : known_granularities) {
        if (candidate.name == segmenter_granularity)
            return candidate.granularity;
    }

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
// Returns the textual name ("grapheme", "sentence" or "word") of a granularity enumerator.
// A value outside the enumeration is a programming error and trips VERIFY_NOT_REACHED().
StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
{
    if (segmenter_granularity == SegmenterGranularity::Grapheme)
        return "grapheme"sv;

    if (segmenter_granularity == SegmenterGranularity::Sentence)
        return "sentence"sv;

    if (segmenter_granularity == SegmenterGranularity::Word)
        return "word"sv;

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
class SegmenterImpl : public Segmenter {
|
||||
public:
|
||||
SegmenterImpl(NonnullOwnPtr<icu::BreakIterator> segmenter, SegmenterGranularity segmenter_granularity)
|
||||
: Segmenter(segmenter_granularity)
|
||||
, m_segmenter(move(segmenter))
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~SegmenterImpl() override = default;
|
||||
|
||||
virtual NonnullOwnPtr<Segmenter> clone() const override
|
||||
{
|
||||
return make<SegmenterImpl>(adopt_own(*m_segmenter->clone()), m_segmenter_granularity);
|
||||
}
|
||||
|
||||
virtual void set_segmented_text(String text) override
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
m_segmented_text = move(text);
|
||||
auto view = m_segmented_text.get<String>().bytes_as_string_view();
|
||||
|
||||
UText utext = UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&utext, view.characters_without_null_termination(), static_cast<i64>(view.length()), &status);
|
||||
VERIFY(icu_success(status));
|
||||
|
||||
m_segmenter->setText(&utext, status);
|
||||
VERIFY(icu_success(status));
|
||||
|
||||
utext_close(&utext);
|
||||
}
|
||||
|
||||
virtual void set_segmented_text(Utf16View const& text) override
|
||||
{
|
||||
m_segmented_text = icu::UnicodeString { text.data(), static_cast<i32>(text.length_in_code_units()) };
|
||||
m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
|
||||
}
|
||||
|
||||
virtual size_t current_boundary() override
|
||||
{
|
||||
return m_segmenter->current();
|
||||
}
|
||||
|
||||
virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
|
||||
{
|
||||
auto icu_boundary = align_boundary(boundary);
|
||||
|
||||
if (inclusive == Inclusive::Yes) {
|
||||
if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
|
||||
return static_cast<size_t>(icu_boundary);
|
||||
}
|
||||
|
||||
if (auto index = m_segmenter->preceding(icu_boundary); index != icu::BreakIterator::DONE)
|
||||
return static_cast<size_t>(index);
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
|
||||
{
|
||||
auto icu_boundary = align_boundary(boundary);
|
||||
|
||||
if (inclusive == Inclusive::Yes) {
|
||||
if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
|
||||
return static_cast<size_t>(icu_boundary);
|
||||
}
|
||||
|
||||
if (auto index = m_segmenter->following(icu_boundary); index != icu::BreakIterator::DONE)
|
||||
return static_cast<size_t>(index);
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
virtual void for_each_boundary(String text, SegmentationCallback callback) override
|
||||
{
|
||||
if (text.is_empty())
|
||||
return;
|
||||
|
||||
set_segmented_text(move(text));
|
||||
for_each_boundary(move(callback));
|
||||
}
|
||||
|
||||
virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
|
||||
{
|
||||
if (text.is_empty())
|
||||
return;
|
||||
|
||||
set_segmented_text(text);
|
||||
for_each_boundary(move(callback));
|
||||
}
|
||||
|
||||
virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
|
||||
{
|
||||
if (text.is_empty())
|
||||
return;
|
||||
|
||||
// FIXME: We should be able to create a custom UText provider to avoid converting to UTF-8 here.
|
||||
set_segmented_text(MUST(String::formatted("{}", text)));
|
||||
|
||||
auto code_points = m_segmented_text.get<String>().code_points();
|
||||
auto current = code_points.begin();
|
||||
size_t code_point_index = 0;
|
||||
|
||||
for_each_boundary([&](auto index) {
|
||||
auto it = code_points.iterator_at_byte_offset(index);
|
||||
|
||||
while (current != it) {
|
||||
++code_point_index;
|
||||
++current;
|
||||
}
|
||||
|
||||
return callback(code_point_index);
|
||||
});
|
||||
}
|
||||
|
||||
virtual bool is_current_boundary_word_like() const override
|
||||
{
|
||||
auto status = m_segmenter->getRuleStatus();
|
||||
|
||||
if (status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)
|
||||
return true;
|
||||
if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT)
|
||||
return true;
|
||||
if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT)
|
||||
return true;
|
||||
if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
i32 align_boundary(size_t boundary)
|
||||
{
|
||||
auto icu_boundary = static_cast<i32>(boundary);
|
||||
|
||||
return m_segmented_text.visit(
|
||||
[&](String const& text) {
|
||||
U8_SET_CP_START(text.bytes().data(), 0, icu_boundary);
|
||||
return icu_boundary;
|
||||
},
|
||||
[&](icu::UnicodeString const& text) {
|
||||
return text.getChar32Start(icu_boundary);
|
||||
},
|
||||
[](Empty) -> i32 { VERIFY_NOT_REACHED(); });
|
||||
}
|
||||
|
||||
void for_each_boundary(SegmentationCallback callback)
|
||||
{
|
||||
if (callback(static_cast<size_t>(m_segmenter->first())) == IterationDecision::Break)
|
||||
return;
|
||||
|
||||
while (true) {
|
||||
auto index = m_segmenter->next();
|
||||
if (index == icu::BreakIterator::DONE)
|
||||
return;
|
||||
|
||||
if (callback(static_cast<size_t>(index)) == IterationDecision::Break)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
NonnullOwnPtr<icu::BreakIterator> m_segmenter;
|
||||
Variant<Empty, String, icu::UnicodeString> m_segmented_text;
|
||||
};
|
||||
|
||||
// Convenience overload: creates a segmenter of the given granularity for default_locale().
NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
{
    auto const& locale = default_locale();
    return Segmenter::create(locale, segmenter_granularity);
}
|
||||
|
||||
// Creates an ICU-backed segmenter of the given granularity for `locale`.
// The locale must be known to LocaleData and ICU must succeed in creating the break iterator;
// both conditions are enforced with VERIFY.
NonnullOwnPtr<Segmenter> Segmenter::create(StringView locale, SegmenterGranularity segmenter_granularity)
{
    auto locale_data = LocaleData::for_locale(locale);
    VERIFY(locale_data.has_value());

    UErrorCode status = U_ZERO_ERROR;
    icu::BreakIterator* break_iterator = nullptr;

    // Pick the ICU factory that corresponds to the requested granularity.
    switch (segmenter_granularity) {
    case SegmenterGranularity::Grapheme:
        break_iterator = icu::BreakIterator::createCharacterInstance(locale_data->locale(), status);
        break;
    case SegmenterGranularity::Sentence:
        break_iterator = icu::BreakIterator::createSentenceInstance(locale_data->locale(), status);
        break;
    case SegmenterGranularity::Word:
        break_iterator = icu::BreakIterator::createWordInstance(locale_data->locale(), status);
        break;
    default:
        VERIFY_NOT_REACHED();
    }

    // Take ownership of the raw ICU pointer before checking the status.
    auto segmenter = adopt_own_if_nonnull(break_iterator);
    VERIFY(icu_success(status));

    return make<SegmenterImpl>(segmenter.release_nonnull(), segmenter_granularity);
}
|
||||
|
||||
}
|
62
Userland/Libraries/LibLocale/Segmenter.h
Normal file
62
Userland/Libraries/LibLocale/Segmenter.h
Normal file
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Function.h>
|
||||
#include <AK/NonnullOwnPtr.h>
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/StringView.h>
|
||||
|
||||
namespace Locale {
|
||||
|
||||
enum class SegmenterGranularity {
|
||||
Grapheme,
|
||||
Sentence,
|
||||
Word,
|
||||
};
|
||||
SegmenterGranularity segmenter_granularity_from_string(StringView);
|
||||
StringView segmenter_granularity_to_string(SegmenterGranularity);
|
||||
|
||||
class Segmenter {
|
||||
public:
|
||||
static NonnullOwnPtr<Segmenter> create(SegmenterGranularity segmenter_granularity);
|
||||
static NonnullOwnPtr<Segmenter> create(StringView locale, SegmenterGranularity segmenter_granularity);
|
||||
virtual ~Segmenter() = default;
|
||||
|
||||
SegmenterGranularity segmenter_granularity() const { return m_segmenter_granularity; }
|
||||
|
||||
virtual NonnullOwnPtr<Segmenter> clone() const = 0;
|
||||
|
||||
virtual void set_segmented_text(String) = 0;
|
||||
virtual void set_segmented_text(Utf16View const&) = 0;
|
||||
|
||||
virtual size_t current_boundary() = 0;
|
||||
|
||||
enum class Inclusive {
|
||||
No,
|
||||
Yes,
|
||||
};
|
||||
virtual Optional<size_t> previous_boundary(size_t index, Inclusive = Inclusive::No) = 0;
|
||||
virtual Optional<size_t> next_boundary(size_t index, Inclusive = Inclusive::No) = 0;
|
||||
|
||||
using SegmentationCallback = Function<IterationDecision(size_t)>;
|
||||
virtual void for_each_boundary(String, SegmentationCallback) = 0;
|
||||
virtual void for_each_boundary(Utf16View const&, SegmentationCallback) = 0;
|
||||
virtual void for_each_boundary(Utf32View const&, SegmentationCallback) = 0;
|
||||
|
||||
virtual bool is_current_boundary_word_like() const = 0;
|
||||
|
||||
protected:
|
||||
explicit Segmenter(SegmenterGranularity segmenter_granularity)
|
||||
: m_segmenter_granularity(segmenter_granularity)
|
||||
{
|
||||
}
|
||||
|
||||
SegmenterGranularity m_segmenter_granularity { SegmenterGranularity::Grapheme };
|
||||
};
|
||||
|
||||
}
|
Loading…
Reference in a new issue