From a98d3a1a851330aa0d33a73031566ba04cf6b53c Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 10 Aug 2021 15:29:28 -0400 Subject: [PATCH] LibUnicode: Download and parse DerivedNormalizationProps UCD file This file contains the last properties that LibUnicode is not parsing. Much of the data in this file is not currently used; that is left as a FIXME for when String.prototype.normalize is implemented. Until then, only the code point properties are utilized for regular expression pattern escapes. --- .../Libraries/LibUnicode/CharacterTypes.cpp | 4 +- .../CodeGenerators/GenerateUnicodeData.cpp | 116 ++++++++++++++---- .../Libraries/LibUnicode/unicode_data.cmake | 11 +- 3 files changed, 103 insertions(+), 28 deletions(-) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 07d0a3f215b..1098f6c2e58 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -224,8 +224,6 @@ bool is_ecma262_property([[maybe_unused]] Property property) { #if ENABLE_UNICODE_DATA // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties - // Note: Some of the properties in the above link are not yet parsed by the LibUnicode generator. They are left - // commented out here until they are parsed and can be used. switch (property) { case Unicode::Property::ASCII: case Unicode::Property::ASCII_Hex_Digit: @@ -239,7 +237,7 @@ bool is_ecma262_property([[maybe_unused]] Property property) case Unicode::Property::Changes_When_Casefolded: case Unicode::Property::Changes_When_Casemapped: case Unicode::Property::Changes_When_Lowercased: - // case Unicode::Property::Changes_When_NFKC_Casefolded: + case Unicode::Property::Changes_When_NFKC_Casefolded: case Unicode::Property::Changes_When_Titlecased: case Unicode::Property::Changes_When_Uppercased: case Unicode::Property::Dash: diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 5d1c289336c..e60b5549b5e 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -50,6 +50,22 @@ struct Alias { String alias; }; +// Normalization source: https://www.unicode.org/Public/13.0.0/ucd/DerivedNormalizationProps.txt +// Normalization descriptions: https://www.unicode.org/reports/tr44/#DerivedNormalizationProps.txt +enum class QuickCheck { + Yes, + No, + Maybe, +}; + +struct Normalization { + CodePointRange code_point_range; + Vector value; + QuickCheck quick_check { QuickCheck::Yes }; +}; + +using NormalizationProps = HashMap>; + // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt // Field descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#UnicodeData.txt // https://www.unicode.org/reports/tr44/#General_Category_Values @@ -99,6 +115,9 @@ struct UnicodeData { }; Vector script_aliases; PropList script_extensions; + + // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize. + NormalizationProps normalization_props; }; static constexpr auto s_desired_fields = Array { @@ -118,18 +137,38 @@ static void write_to_file_if_different(Core::File& file, StringView contents) VERIFY(file.write(contents)); } +static Vector parse_code_point_list(StringView const& list) +{ + Vector code_points; + + auto segments = list.split_view(' '); + for (auto const& code_point : segments) + code_points.append(AK::StringUtils::convert_to_uint_from_hex(code_point).value()); + + return code_points; +} + +static CodePointRange parse_code_point_range(StringView const& list) +{ + CodePointRange code_point_range {}; + + if (list.contains(".."sv)) { + auto segments = list.split_view(".."sv); + VERIFY(segments.size() == 2); + + auto begin = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); + auto end = AK::StringUtils::convert_to_uint_from_hex(segments[1]).value(); + code_point_range = { begin, end }; + } else { + auto code_point = AK::StringUtils::convert_to_uint_from_hex(list).value(); + code_point_range = { code_point, code_point }; + } + + return code_point_range; +} + static void parse_special_casing(Core::File& file, UnicodeData& unicode_data) { - auto parse_code_point_list = [&](auto const& line) { - Vector code_points; - - auto segments = line.split(' '); - for (auto const& code_point : segments) - code_points.append(AK::StringUtils::convert_to_uint_from_hex(code_point).value()); - - return code_points; - }; - while (file.can_read_line()) { auto line = file.read_line(); if (line.is_empty() || line.starts_with('#')) @@ -191,7 +230,7 @@ static void parse_prop_list(Core::File& file, PropList& prop_list, bool multi_va auto segments = line.split_view(';', true); VERIFY(segments.size() == 2); - auto code_point_range = segments[0].trim_whitespace(); + auto code_point_range = parse_code_point_range(segments[0].trim_whitespace()); Vector properties; if (multi_value_property) @@ -201,18 +240,7 @@ static void parse_prop_list(Core::File& file, PropList& prop_list, bool multi_va for (auto const& property : properties) { auto& code_points = prop_list.ensure(property.trim_whitespace()); - - if (code_point_range.contains(".."sv)) { - segments = code_point_range.split_view(".."sv); - VERIFY(segments.size() == 2); - - auto begin = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); - auto end = AK::StringUtils::convert_to_uint_from_hex(segments[1]).value(); - code_points.append({ begin, end }); - } else { - auto code_point = AK::StringUtils::convert_to_uint_from_hex(code_point_range).value(); - code_points.append({ code_point, code_point }); - } + code_points.append(code_point_range); } } } @@ -301,6 +329,44 @@ static void parse_value_alias_list(Core::File& file, StringView desired_category } } +static void parse_normalization_props(Core::File& file, UnicodeData& unicode_data) +{ + while (file.can_read_line()) { + auto line = file.read_line(); + if (line.is_empty() || line.starts_with('#')) + continue; + + if (auto index = line.find('#'); index.has_value()) + line = line.substring(0, *index); + + auto segments = line.split_view(';', true); + VERIFY((segments.size() == 2) || (segments.size() == 3)); + + auto code_point_range = parse_code_point_range(segments[0].trim_whitespace()); + auto property = segments[1].trim_whitespace().to_string(); + + Vector value; + QuickCheck quick_check = QuickCheck::Yes; + + if (segments.size() == 3) { + auto value_or_quick_check = segments[2].trim_whitespace(); + + if ((value_or_quick_check == "N"sv)) + quick_check = QuickCheck::No; + else if ((value_or_quick_check == "M"sv)) + quick_check = QuickCheck::Maybe; + else + value = parse_code_point_list(value_or_quick_check); + } + + auto& normalizations = unicode_data.normalization_props.ensure(property); + normalizations.append({ code_point_range, move(value), quick_check }); + + auto& prop_list = unicode_data.prop_list.ensure(property); + prop_list.append(move(code_point_range)); + } +} + static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) { Optional code_point_range_start; @@ -927,6 +993,7 @@ int main(int argc, char** argv) char const* scripts_path = nullptr; char const* script_extensions_path = nullptr; char const* emoji_data_path = nullptr; + char const* normalization_path = nullptr; Core::ArgsParser args_parser; args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); @@ -942,6 +1009,7 @@ int main(int argc, char** argv) args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path"); args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path"); args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path"); + args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path"); args_parser.parse(argc, argv); auto open_file = [&](StringView path, StringView flags, Core::OpenMode mode = Core::OpenMode::ReadOnly) { @@ -973,6 +1041,7 @@ int main(int argc, char** argv) auto scripts_file = open_file(scripts_path, "-r/--scripts-path"); auto script_extensions_file = open_file(script_extensions_path, "-x/--script-extensions-path"); auto emoji_data_file = open_file(emoji_data_path, "-e/--emoji-data-path"); + auto normalization_file = open_file(normalization_path, "-n/--normalization-path"); UnicodeData unicode_data {}; parse_special_casing(special_casing_file, unicode_data); @@ -981,6 +1050,7 @@ int main(int argc, char** argv) parse_prop_list(derived_core_prop_file, unicode_data.prop_list); parse_prop_list(derived_binary_prop_file, unicode_data.prop_list); parse_prop_list(emoji_data_file, unicode_data.prop_list); + parse_normalization_props(normalization_file, unicode_data); parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases); parse_prop_list(scripts_file, unicode_data.script_list); parse_prop_list(script_extensions_file, unicode_data.script_extensions, true); diff --git a/Userland/Libraries/LibUnicode/unicode_data.cmake b/Userland/Libraries/LibUnicode/unicode_data.cmake index 1d12bec254a..a6153637b75 100644 --- a/Userland/Libraries/LibUnicode/unicode_data.cmake +++ b/Userland/Libraries/LibUnicode/unicode_data.cmake @@ -33,6 +33,9 @@ set(SCRIPT_EXTENSIONS_PATH ${CMAKE_BINARY_DIR}/UCD/ScriptExtensions.txt) set(EMOJI_DATA_URL https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt) set(EMOJI_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/emoji-data.txt) +set(NORM_PROPS_URL https://www.unicode.org/Public/13.0.0/ucd/DerivedNormalizationProps.txt) +set(NORM_PROPS_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedNormalizationProps.txt) + if (ENABLE_UNICODE_DATABASE_DOWNLOAD) if (NOT EXISTS ${UNICODE_DATA_PATH}) message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...") @@ -78,6 +81,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) message(STATUS "Downloading UCD emoji-data.txt from ${EMOJI_DATA_URL}...") file(DOWNLOAD ${EMOJI_DATA_URL} ${EMOJI_DATA_PATH} INACTIVITY_TIMEOUT 10) endif() + if (NOT EXISTS ${NORM_PROPS_PATH}) + message(STATUS "Downloading UCD DerivedNormalizationProps.txt from ${NORM_PROPS_URL}...") + file(DOWNLOAD ${NORM_PROPS_URL} ${NORM_PROPS_PATH} INACTIVITY_TIMEOUT 10) + endif() set(UNICODE_DATA_HEADER LibUnicode/UnicodeData.h) set(UNICODE_DATA_IMPLEMENTATION LibUnicode/UnicodeData.cpp) @@ -89,9 +96,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION} - COMMAND $ -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -g ${DERIVED_GENERAL_CATEGORY_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -e ${EMOJI_DATA_PATH} + COMMAND $ -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -g ${DERIVED_GENERAL_CATEGORY_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -e ${EMOJI_DATA_PATH} -n ${NORM_PROPS_PATH} VERBATIM - DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${DERIVED_GENERAL_CATEGORY_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${EMOJI_DATA_PATH} + DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${DERIVED_GENERAL_CATEGORY_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${EMOJI_DATA_PATH} ${NORM_PROPS_PATH} ) set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})