LibUnicode: Parse UCD Scripts.txt and generate as a Unicode property

There are a couple of minor nuances with parsing script values, compared
to other properties. In Scripts.txt, the UCD file lists the full name of
each script; other properties, like General Category, list the shorter
name in their primary files. This means that the aliases listed in
PropertyValueAliases.txt are reversed for script values.
This commit is contained in:
Timothy Flynn 2021-08-03 17:11:19 -04:00 committed by Linus Groh
parent 619c924042
commit f5c1bbc00b
Notes: sideshowbarker 2024-07-18 07:30:19 +09:00
5 changed files with 112 additions and 29 deletions

View file

@ -318,4 +318,26 @@ bool is_ecma262_property([[maybe_unused]] Property property)
#endif
}
Optional<Script> script_from_string([[maybe_unused]] StringView const& script)
{
#if ENABLE_UNICODE_DATA
return Detail::script_from_string(script);
#else
return {};
#endif
}
bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
{
#if ENABLE_UNICODE_DATA
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value())
return false;
return unicode_data->script == script;
#else
return false;
#endif
}
}

View file

@ -28,4 +28,7 @@ Optional<Property> property_from_string(StringView const&);
bool code_point_has_property(u32 code_point, Property property);
bool is_ecma262_property(Property);
Optional<Script> script_from_string(StringView const&);
bool code_point_has_script(u32 code_point, Script script);
}

View file

@ -71,6 +71,7 @@ struct CodePointData {
Optional<u32> simple_titlecase_mapping;
Vector<u32> special_casing_indices;
Vector<StringView> prop_list;
StringView script;
StringView word_break_property;
};
@ -112,6 +113,11 @@ struct UnicodeData {
};
Vector<Alias> prop_aliases;
PropList script_list {
{ "Unknown"sv, {} },
};
Vector<Alias> script_aliases;
PropList word_break_prop_list;
};
@ -267,9 +273,15 @@ static void parse_alias_list(Core::File& file, PropList const& prop_list, Vector
}
}
static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector<String> const& value_list, Vector<Alias>& prop_unions, Vector<Alias>& prop_aliases)
static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector<String> const& value_list, Vector<Alias> const& prop_unions, Vector<Alias>& prop_aliases, bool primary_value_is_first = true)
{
VERIFY(file.seek(0));
auto append_alias = [&](auto alias, auto value) {
// Note: The value alias file contains lines such as "Ahom = Ahom", which we should just skip.
if (alias == value)
return;
// FIXME: We will, eventually, need to find where missing properties are located and parse them.
if (!value_list.contains_slow(value) && !any_of(prop_unions, [&](auto const& u) { return value == u.alias; }))
return;
@ -292,8 +304,8 @@ static void parse_value_alias_list(Core::File& file, StringView desired_category
continue;
VERIFY((segments.size() == 3) || (segments.size() == 4));
auto value = segments[1].trim_whitespace();
auto alias = segments[2].trim_whitespace();
auto value = primary_value_is_first ? segments[1].trim_whitespace() : segments[2].trim_whitespace();
auto alias = primary_value_is_first ? segments[2].trim_whitespace() : segments[1].trim_whitespace();
append_alias(alias, value);
if (segments.size() == 4) {
@ -307,6 +319,34 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
{
Optional<u32> code_point_range_start;
auto assign_code_point_property = [&](u32 code_point, auto const& list, auto& property, StringView default_) {
using PropertyType = RemoveCVReference<decltype(property)>;
constexpr bool is_single_item = IsSame<PropertyType, StringView>;
auto assign_property = [&](auto const& item) {
if constexpr (is_single_item)
property = item;
else
property.append(item);
};
for (auto const& item : list) {
for (auto const& range : item.value) {
if ((range.first <= code_point) && (code_point <= range.last)) {
assign_property(item.key);
break;
}
}
if constexpr (is_single_item) {
if (!property.is_empty())
break;
}
}
if (property.is_empty())
assign_property(default_);
};
while (file.can_read_line()) {
auto line = file.read_line();
if (line.is_empty())
@ -351,29 +391,9 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
data.special_casing_indices.append(casing.index);
}
for (auto const& property : unicode_data.prop_list) {
for (auto const& range : property.value) {
if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
data.prop_list.append(property.key);
break;
}
}
}
if (data.prop_list.is_empty())
data.prop_list.append("Assigned"sv);
for (auto const& property : unicode_data.word_break_prop_list) {
for (auto const& range : property.value) {
if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
data.word_break_property = property.key;
break;
}
}
if (!data.word_break_property.is_empty())
break;
}
if (data.word_break_property.is_empty())
data.word_break_property = "Other"sv;
assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv);
assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv);
assign_code_point_property(data.code_point, unicode_data.word_break_prop_list, data.word_break_property, "Other"sv);
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
@ -392,7 +412,7 @@ static void generate_unicode_data_header(Core::File& file, UnicodeData& unicode_
generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> unions = {}, Vector<Alias> aliases = {}, bool as_bitmask = false) {
VERIFY((values.size() + !default_.is_empty()) <= 64);
VERIFY(!as_bitmask || (values.size() <= 64));
quick_sort(values);
quick_sort(unions, [](auto& union1, auto& union2) { return union1.alias < union2.alias; });
quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
@ -476,6 +496,7 @@ namespace Unicode {
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
generate_enum("GeneralCategory"sv, "None"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases, true);
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), {}, unicode_data.prop_aliases, true);
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), {}, unicode_data.script_aliases);
generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
generator.append(R"~~~(
@ -530,6 +551,7 @@ struct UnicodeData {
u32 special_casing_size { 0 };
Property properties { Property::Assigned };
Script script { Script::Unknown };
WordBreakProperty word_break_property { WordBreakProperty::Other };
};
@ -538,6 +560,7 @@ namespace Detail {
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
Optional<Property> property_from_string(StringView const& property);
Optional<GeneralCategory> general_category_from_string(StringView const& general_category);
Optional<Script> script_from_string(StringView const& script);
}
@ -644,6 +667,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
first = false;
}
generator.append(String::formatted(", Script::{}", data.script));
generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
generator.append(" },");
}
@ -746,6 +770,26 @@ Optional<GeneralCategory> general_category_from_string(StringView const& general
return {};
}
Optional<Script> script_from_string(StringView const& script)
{)~~~");
for (auto const& script : unicode_data.script_list) {
generator.set("script", script.key);
generator.append(R"~~~(
if (script == "@script@"sv)
return Script::@script@;)~~~");
}
for (auto const& alias : unicode_data.script_aliases) {
generator.set("script", alias.alias);
generator.append(R"~~~(
if (script == "@script@"sv)
return Script::@script@;)~~~");
}
generator.append(R"~~~(
return {};
}
}
}
@ -764,6 +808,7 @@ int main(int argc, char** argv)
char const* derived_core_prop_path = nullptr;
char const* prop_alias_path = nullptr;
char const* prop_value_alias_path = nullptr;
char const* scripts_path = nullptr;
char const* word_break_path = nullptr;
Core::ArgsParser args_parser;
@ -775,6 +820,7 @@ int main(int argc, char** argv)
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
args_parser.parse(argc, argv);
@ -802,6 +848,7 @@ int main(int argc, char** argv)
auto derived_core_prop_file = open_file(derived_core_prop_path, "-d/--derived-core-prop-path");
auto prop_alias_file = open_file(prop_alias_path, "-a/--prop-alias-path");
auto prop_value_alias_file = open_file(prop_value_alias_path, "-v/--prop-value-alias-path");
auto scripts_file = open_file(scripts_path, "-r/--scripts-path");
auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
UnicodeData unicode_data {};
@ -809,9 +856,12 @@ int main(int argc, char** argv)
parse_prop_list(prop_list_file, unicode_data.prop_list);
parse_prop_list(derived_core_prop_file, unicode_data.prop_list);
parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
parse_prop_list(scripts_file, unicode_data.script_list);
parse_prop_list(word_break_file, unicode_data.word_break_prop_list);
parse_unicode_data(unicode_data_file, unicode_data);
parse_value_alias_list(prop_value_alias_file, "gc"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases);
parse_value_alias_list(prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), {}, unicode_data.script_aliases, false);
generate_unicode_data_header(generated_header_file, unicode_data);
generate_unicode_data_implementation(generated_implementation_file, unicode_data);

View file

@ -14,6 +14,7 @@ enum class Condition;
enum class GeneralCategory : u64;
enum class Locale;
enum class Property : u64;
enum class Script;
enum class WordBreakProperty;
struct SpecialCasing;

View file

@ -18,6 +18,9 @@ set(PROP_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyAliases.txt)
set(PROP_VALUE_ALIAS_URL https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt)
set(PROP_VALUE_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyValueAliases.txt)
set(SCRIPTS_URL https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt)
set(SCRIPTS_PATH ${CMAKE_BINARY_DIR}/UCD/Scripts.txt)
set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
@ -46,6 +49,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
message(STATUS "Downloading UCD PropertyValueAliases.txt from ${PROP_VALUE_ALIAS_URL}...")
file(DOWNLOAD ${PROP_VALUE_ALIAS_URL} ${PROP_VALUE_ALIAS_PATH} INACTIVITY_TIMEOUT 10)
endif()
if (NOT EXISTS ${SCRIPTS_PATH})
message(STATUS "Downloading UCD Scripts.txt from ${SCRIPTS_URL}...")
file(DOWNLOAD ${SCRIPTS_URL} ${SCRIPTS_PATH} INACTIVITY_TIMEOUT 10)
endif()
if (NOT EXISTS ${WORD_BREAK_PATH})
message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
@ -61,9 +68,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
add_custom_command(
OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -w ${WORD_BREAK_PATH}
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -w ${WORD_BREAK_PATH}
VERBATIM
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${WORD_BREAK_PATH}
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${WORD_BREAK_PATH}
)
set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})