mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-09-29 16:21:29 +00:00
LibUnicode: Parse UCD Scripts.txt and generate as a Unicode property
There are a couple of minor nuances with parsing script values compared to other properties. In Scripts.txt, the UCD file lists the full name of each script; other properties, like General Category, list the shorter name in their primary files. This means that the value/alias order listed in PropertyValueAliases.txt is reversed for script values: the full script name is treated as the primary value, and the short name becomes its alias.
This commit is contained in:
parent
619c924042
commit
f5c1bbc00b
Notes:
sideshowbarker
2024-07-18 07:30:19 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/f5c1bbc00bc Pull-request: https://github.com/SerenityOS/serenity/pull/9204 Reviewed-by: https://github.com/linusg ✅
|
@ -318,4 +318,26 @@ bool is_ecma262_property([[maybe_unused]] Property property)
|
|||
#endif
|
||||
}
|
||||
|
||||
// Resolves a script name to its Script enumerator by forwarding to the
// generated Detail::script_from_string(). When the build has no Unicode data
// (ENABLE_UNICODE_DATA is off), no lookup is performed and an empty Optional
// is returned.
Optional<Script> script_from_string([[maybe_unused]] StringView const& script)
{
#if !ENABLE_UNICODE_DATA
    // No generated tables in this configuration; nothing can be resolved.
    return {};
#else
    return Detail::script_from_string(script);
#endif
}
|
||||
|
||||
bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||
if (!unicode_data.has_value())
|
||||
return false;
|
||||
|
||||
return unicode_data->script == script;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -28,4 +28,7 @@ Optional<Property> property_from_string(StringView const&);
|
|||
bool code_point_has_property(u32 code_point, Property property);
|
||||
bool is_ecma262_property(Property);
|
||||
|
||||
Optional<Script> script_from_string(StringView const&);
|
||||
bool code_point_has_script(u32 code_point, Script script);
|
||||
|
||||
}
|
||||
|
|
|
@ -71,6 +71,7 @@ struct CodePointData {
|
|||
Optional<u32> simple_titlecase_mapping;
|
||||
Vector<u32> special_casing_indices;
|
||||
Vector<StringView> prop_list;
|
||||
StringView script;
|
||||
StringView word_break_property;
|
||||
};
|
||||
|
||||
|
@ -112,6 +113,11 @@ struct UnicodeData {
|
|||
};
|
||||
Vector<Alias> prop_aliases;
|
||||
|
||||
PropList script_list {
|
||||
{ "Unknown"sv, {} },
|
||||
};
|
||||
Vector<Alias> script_aliases;
|
||||
|
||||
PropList word_break_prop_list;
|
||||
};
|
||||
|
||||
|
@ -267,9 +273,15 @@ static void parse_alias_list(Core::File& file, PropList const& prop_list, Vector
|
|||
}
|
||||
}
|
||||
|
||||
static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector<String> const& value_list, Vector<Alias>& prop_unions, Vector<Alias>& prop_aliases)
|
||||
static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector<String> const& value_list, Vector<Alias> const& prop_unions, Vector<Alias>& prop_aliases, bool primary_value_is_first = true)
|
||||
{
|
||||
VERIFY(file.seek(0));
|
||||
|
||||
auto append_alias = [&](auto alias, auto value) {
|
||||
// Note: The value alias file contains lines such as "Ahom = Ahom", which we should just skip.
|
||||
if (alias == value)
|
||||
return;
|
||||
|
||||
// FIXME: We will, eventually, need to find where missing properties are located and parse them.
|
||||
if (!value_list.contains_slow(value) && !any_of(prop_unions, [&](auto const& u) { return value == u.alias; }))
|
||||
return;
|
||||
|
@ -292,8 +304,8 @@ static void parse_value_alias_list(Core::File& file, StringView desired_category
|
|||
continue;
|
||||
|
||||
VERIFY((segments.size() == 3) || (segments.size() == 4));
|
||||
auto value = segments[1].trim_whitespace();
|
||||
auto alias = segments[2].trim_whitespace();
|
||||
auto value = primary_value_is_first ? segments[1].trim_whitespace() : segments[2].trim_whitespace();
|
||||
auto alias = primary_value_is_first ? segments[2].trim_whitespace() : segments[1].trim_whitespace();
|
||||
append_alias(alias, value);
|
||||
|
||||
if (segments.size() == 4) {
|
||||
|
@ -307,6 +319,34 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
|
|||
{
|
||||
Optional<u32> code_point_range_start;
|
||||
|
||||
auto assign_code_point_property = [&](u32 code_point, auto const& list, auto& property, StringView default_) {
|
||||
using PropertyType = RemoveCVReference<decltype(property)>;
|
||||
constexpr bool is_single_item = IsSame<PropertyType, StringView>;
|
||||
|
||||
auto assign_property = [&](auto const& item) {
|
||||
if constexpr (is_single_item)
|
||||
property = item;
|
||||
else
|
||||
property.append(item);
|
||||
};
|
||||
|
||||
for (auto const& item : list) {
|
||||
for (auto const& range : item.value) {
|
||||
if ((range.first <= code_point) && (code_point <= range.last)) {
|
||||
assign_property(item.key);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if constexpr (is_single_item) {
|
||||
if (!property.is_empty())
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (property.is_empty())
|
||||
assign_property(default_);
|
||||
};
|
||||
|
||||
while (file.can_read_line()) {
|
||||
auto line = file.read_line();
|
||||
if (line.is_empty())
|
||||
|
@ -351,29 +391,9 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
|
|||
data.special_casing_indices.append(casing.index);
|
||||
}
|
||||
|
||||
for (auto const& property : unicode_data.prop_list) {
|
||||
for (auto const& range : property.value) {
|
||||
if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
|
||||
data.prop_list.append(property.key);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (data.prop_list.is_empty())
|
||||
data.prop_list.append("Assigned"sv);
|
||||
|
||||
for (auto const& property : unicode_data.word_break_prop_list) {
|
||||
for (auto const& range : property.value) {
|
||||
if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
|
||||
data.word_break_property = property.key;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!data.word_break_property.is_empty())
|
||||
break;
|
||||
}
|
||||
if (data.word_break_property.is_empty())
|
||||
data.word_break_property = "Other"sv;
|
||||
assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv);
|
||||
assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv);
|
||||
assign_code_point_property(data.code_point, unicode_data.word_break_prop_list, data.word_break_property, "Other"sv);
|
||||
|
||||
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
|
||||
|
||||
|
@ -392,7 +412,7 @@ static void generate_unicode_data_header(Core::File& file, UnicodeData& unicode_
|
|||
generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
|
||||
|
||||
auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> unions = {}, Vector<Alias> aliases = {}, bool as_bitmask = false) {
|
||||
VERIFY((values.size() + !default_.is_empty()) <= 64);
|
||||
VERIFY(!as_bitmask || (values.size() <= 64));
|
||||
quick_sort(values);
|
||||
quick_sort(unions, [](auto& union1, auto& union2) { return union1.alias < union2.alias; });
|
||||
quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
|
||||
|
@ -476,6 +496,7 @@ namespace Unicode {
|
|||
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
|
||||
generate_enum("GeneralCategory"sv, "None"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases, true);
|
||||
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), {}, unicode_data.prop_aliases, true);
|
||||
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), {}, unicode_data.script_aliases);
|
||||
generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
|
||||
|
||||
generator.append(R"~~~(
|
||||
|
@ -530,6 +551,7 @@ struct UnicodeData {
|
|||
u32 special_casing_size { 0 };
|
||||
|
||||
Property properties { Property::Assigned };
|
||||
Script script { Script::Unknown };
|
||||
WordBreakProperty word_break_property { WordBreakProperty::Other };
|
||||
};
|
||||
|
||||
|
@ -538,6 +560,7 @@ namespace Detail {
|
|||
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
|
||||
Optional<Property> property_from_string(StringView const& property);
|
||||
Optional<GeneralCategory> general_category_from_string(StringView const& general_category);
|
||||
Optional<Script> script_from_string(StringView const& script);
|
||||
|
||||
}
|
||||
|
||||
|
@ -644,6 +667,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
|
|||
first = false;
|
||||
}
|
||||
|
||||
generator.append(String::formatted(", Script::{}", data.script));
|
||||
generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
|
||||
generator.append(" },");
|
||||
}
|
||||
|
@ -746,6 +770,26 @@ Optional<GeneralCategory> general_category_from_string(StringView const& general
|
|||
return {};
|
||||
}
|
||||
|
||||
Optional<Script> script_from_string(StringView const& script)
|
||||
{)~~~");
|
||||
|
||||
for (auto const& script : unicode_data.script_list) {
|
||||
generator.set("script", script.key);
|
||||
generator.append(R"~~~(
|
||||
if (script == "@script@"sv)
|
||||
return Script::@script@;)~~~");
|
||||
}
|
||||
for (auto const& alias : unicode_data.script_aliases) {
|
||||
generator.set("script", alias.alias);
|
||||
generator.append(R"~~~(
|
||||
if (script == "@script@"sv)
|
||||
return Script::@script@;)~~~");
|
||||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
return {};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -764,6 +808,7 @@ int main(int argc, char** argv)
|
|||
char const* derived_core_prop_path = nullptr;
|
||||
char const* prop_alias_path = nullptr;
|
||||
char const* prop_value_alias_path = nullptr;
|
||||
char const* scripts_path = nullptr;
|
||||
char const* word_break_path = nullptr;
|
||||
|
||||
Core::ArgsParser args_parser;
|
||||
|
@ -775,6 +820,7 @@ int main(int argc, char** argv)
|
|||
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
|
||||
args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
|
||||
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
|
||||
args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
|
||||
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
|
||||
args_parser.parse(argc, argv);
|
||||
|
||||
|
@ -802,6 +848,7 @@ int main(int argc, char** argv)
|
|||
auto derived_core_prop_file = open_file(derived_core_prop_path, "-d/--derived-core-prop-path");
|
||||
auto prop_alias_file = open_file(prop_alias_path, "-a/--prop-alias-path");
|
||||
auto prop_value_alias_file = open_file(prop_value_alias_path, "-v/--prop-value-alias-path");
|
||||
auto scripts_file = open_file(scripts_path, "-r/--scripts-path");
|
||||
auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
|
||||
|
||||
UnicodeData unicode_data {};
|
||||
|
@ -809,9 +856,12 @@ int main(int argc, char** argv)
|
|||
parse_prop_list(prop_list_file, unicode_data.prop_list);
|
||||
parse_prop_list(derived_core_prop_file, unicode_data.prop_list);
|
||||
parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
|
||||
parse_prop_list(scripts_file, unicode_data.script_list);
|
||||
parse_prop_list(word_break_file, unicode_data.word_break_prop_list);
|
||||
|
||||
parse_unicode_data(unicode_data_file, unicode_data);
|
||||
parse_value_alias_list(prop_value_alias_file, "gc"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases);
|
||||
parse_value_alias_list(prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), {}, unicode_data.script_aliases, false);
|
||||
|
||||
generate_unicode_data_header(generated_header_file, unicode_data);
|
||||
generate_unicode_data_implementation(generated_implementation_file, unicode_data);
|
||||
|
|
|
@ -14,6 +14,7 @@ enum class Condition;
|
|||
enum class GeneralCategory : u64;
|
||||
enum class Locale;
|
||||
enum class Property : u64;
|
||||
enum class Script;
|
||||
enum class WordBreakProperty;
|
||||
|
||||
struct SpecialCasing;
|
||||
|
|
|
@ -18,6 +18,9 @@ set(PROP_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyAliases.txt)
|
|||
set(PROP_VALUE_ALIAS_URL https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt)
|
||||
set(PROP_VALUE_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyValueAliases.txt)
|
||||
|
||||
set(SCRIPTS_URL https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt)
|
||||
set(SCRIPTS_PATH ${CMAKE_BINARY_DIR}/UCD/Scripts.txt)
|
||||
|
||||
set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
|
||||
set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
|
||||
|
||||
|
@ -46,6 +49,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|||
message(STATUS "Downloading UCD PropertyValueAliases.txt from ${PROP_VALUE_ALIAS_URL}...")
|
||||
file(DOWNLOAD ${PROP_VALUE_ALIAS_URL} ${PROP_VALUE_ALIAS_PATH} INACTIVITY_TIMEOUT 10)
|
||||
endif()
|
||||
if (NOT EXISTS ${SCRIPTS_PATH})
|
||||
message(STATUS "Downloading UCD Scripts.txt from ${SCRIPTS_URL}...")
|
||||
file(DOWNLOAD ${SCRIPTS_URL} ${SCRIPTS_PATH} INACTIVITY_TIMEOUT 10)
|
||||
endif()
|
||||
if (NOT EXISTS ${WORD_BREAK_PATH})
|
||||
message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
|
||||
file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
|
||||
|
@ -61,9 +68,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|||
|
||||
add_custom_command(
|
||||
OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
|
||||
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -w ${WORD_BREAK_PATH}
|
||||
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -w ${WORD_BREAK_PATH}
|
||||
VERBATIM
|
||||
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${WORD_BREAK_PATH}
|
||||
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${WORD_BREAK_PATH}
|
||||
)
|
||||
|
||||
set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})
|
||||
|
|
Loading…
Reference in a new issue