diff --git a/Meta/Lagom/CMakeLists.txt b/Meta/Lagom/CMakeLists.txt index 7c1a3c0cf22..6a55a6544f9 100644 --- a/Meta/Lagom/CMakeLists.txt +++ b/Meta/Lagom/CMakeLists.txt @@ -319,6 +319,7 @@ if (BUILD_LAGOM) file(GLOB LIBREGEX_SOURCES CONFIGURE_DEPENDS "../../Userland/Libraries/LibRegex/*.cpp") lagom_lib(Regex regex SOURCES ${LIBREGEX_SOURCES} ${LIBREGEX_LIBC_SOURCES} + LIBS LagomUnicode ) # Shell diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index a1f6f084d6c..043d520d9c1 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -515,6 +515,13 @@ TEST_CASE(ECMA262_parse) { "\\u{10ffff", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, { "\\u{10ffffx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, { "\\u{110000}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\p", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\p{", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\p{}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, + { "\\p{AsCiI}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, + { "\\p{hello friends}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, + { "\\p{Prepended_Concatenation_Mark}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, + { "\\p{ASCII}", regex::Error::NoError, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { @@ -635,6 +642,47 @@ TEST_CASE(ECMA262_unicode_match) } } +TEST_CASE(ECMA262_property_match) +{ + struct _test { + char const* pattern; + char const* subject; + bool matches { true }; + ECMAScriptFlags options {}; + }; + + constexpr _test tests[] { + { "\\p{ASCII}", "a", false }, + { "\\p{ASCII}", "p{ASCII}", true }, + { "\\p{ASCII}", "a", true, ECMAScriptFlags::Unicode }, + { "\\p{ASCII}", "😀", false, ECMAScriptFlags::Unicode }, + { "\\p{ASCII_Hex_Digit}", "1", true, ECMAScriptFlags::Unicode }, + { "\\p{ASCII_Hex_Digit}", "a", true, ECMAScriptFlags::Unicode 
}, + { "\\p{ASCII_Hex_Digit}", "x", false, ECMAScriptFlags::Unicode }, + { "\\p{Any}", "\xcd\xb8", true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point. + { "\\p{Assigned}", "\xcd\xb8", false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point. + }; + + for (auto& test : tests) { + Regex re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options); + + auto subject = AK::utf8_to_utf16(test.subject); + Utf16View view { subject }; + + if constexpr (REGEX_DEBUG) { + dbgln("\n"); + RegexDebug regex_dbg(stderr); + regex_dbg.print_raw_bytecode(re); + regex_dbg.print_header(); + regex_dbg.print_bytecode(re); + dbgln("\n"); + } + + EXPECT_EQ(re.parser_result.error, Error::NoError); + EXPECT_EQ(re.match(view).success, test.matches); + } +} + TEST_CASE(replace) { struct _test { diff --git a/Userland/Libraries/LibC/regex.h b/Userland/Libraries/LibC/regex.h index 793c6a35df3..679db8671f3 100644 --- a/Userland/Libraries/LibC/regex.h +++ b/Userland/Libraries/LibC/regex.h @@ -37,6 +37,7 @@ enum __Regex_Error { __Regex_EmptySubExpression, // Sub expression has empty content. __Regex_InvalidCaptureGroup, // Content of capture group is invalid. __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid. + __Regex_InvalidNameForProperty, // Name of property is invalid. 
}; enum ReError { diff --git a/Userland/Libraries/LibRegex/CMakeLists.txt b/Userland/Libraries/LibRegex/CMakeLists.txt index 44cc6928f91..c1a25fe4b06 100644 --- a/Userland/Libraries/LibRegex/CMakeLists.txt +++ b/Userland/Libraries/LibRegex/CMakeLists.txt @@ -7,4 +7,4 @@ set(SOURCES ) serenity_lib(LibRegex regex) -target_link_libraries(LibRegex LibC LibCore) +target_link_libraries(LibRegex LibC LibCore LibUnicode) diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 964d403c68b..329afb14567 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -9,6 +9,7 @@ #include "RegexDebug.h" #include #include +#include namespace regex { @@ -532,6 +533,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M if (!compare_string(input, state, str, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; + } else if (compare_type == CharacterCompareType::Property) { + auto property = static_cast(m_bytecode->at(offset++)); + compare_property(input, state, property, current_inversion_state(), inverse_matched); + } else { warnln("Undefined comparison: {}", (int)compare_type); VERIFY_NOT_REACHED(); @@ -721,6 +726,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_range(MatchInput const& inp } } +ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched) +{ + if (state.string_position == input.view.length()) + return; + + u32 code_point = input.view[state.string_position]; + bool equal = Unicode::code_point_has_property(code_point, property); + + if (equal) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } +} + String const OpCode_Compare::arguments_string() const { return String::formatted("argc={}, args={} ", arguments_count(), arguments_size()); diff --git 
a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index 2fb09293b61..f86143fd71c 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -18,6 +18,7 @@ #include #include #include +#include namespace regex { @@ -65,6 +66,7 @@ enum class OpCodeId : ByteCodeValueType { __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) enum class CharacterCompareType : ByteCodeValueType { @@ -722,6 +724,7 @@ private: ALWAYS_INLINE static bool compare_string(MatchInput const& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match); ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched); + ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched); }; template diff --git a/Userland/Libraries/LibRegex/RegexError.h b/Userland/Libraries/LibRegex/RegexError.h index f595c7d0b6c..57cc5527f19 100644 --- a/Userland/Libraries/LibRegex/RegexError.h +++ b/Userland/Libraries/LibRegex/RegexError.h @@ -34,6 +34,7 @@ enum class Error : u8 { EmptySubExpression = __Regex_EmptySubExpression, // Sub expression has empty content. InvalidCaptureGroup = __Regex_InvalidCaptureGroup, // Content of capture group is invalid. InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid. + InvalidNameForProperty = __Regex_InvalidNameForProperty, // Name of property is invalid. 
}; inline String get_error_string(Error error) @@ -73,6 +74,8 @@ inline String get_error_string(Error error) return "Content of capture group is invalid."; case Error::InvalidNameForCaptureGroup: return "Name of capture group is invalid."; + case Error::InvalidNameForProperty: + return "Name of property is invalid."; } return "Undefined error."; } diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index afa4d52b217..a5574f11c4c 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace regex { @@ -1238,12 +1239,12 @@ bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bo if (match(TokenType::LeftBracket)) { // Character class. - return parse_character_class(stack, match_length_minimum, unicode && !m_should_use_browser_extended_grammar, named); + return parse_character_class(stack, match_length_minimum, unicode, named); } if (match(TokenType::LeftParen)) { // Non-capturing group, or a capture group. - return parse_capture_group(stack, match_length_minimum, unicode && !m_should_use_browser_extended_grammar, named); + return parse_capture_group(stack, match_length_minimum, unicode, named); } if (match(TokenType::Period)) { @@ -1541,13 +1542,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini } if (unicode) { - if (try_skip("p{")) { - // FIXME: Implement this path, Unicode property match. - TODO(); - } - if (try_skip("P{")) { - // FIXME: Implement this path, Unicode property match. 
- TODO(); + Unicode::Property property {}; + bool negated = false; + + if (parse_unicode_property_escape(property, negated)) { + if (negated) + stack.insert_bytecode_compare_values({ { CharacterCompareType::Inverse, 0 } }); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } }); + return true; } } @@ -1692,10 +1694,12 @@ struct CharClassRangeElement { union { CharClass character_class; u32 code_point { 0 }; + Unicode::Property property; }; bool is_negated { false }; bool is_character_class { false }; + bool is_property_escape { false }; }; bool ECMA262Parser::parse_nonempty_class_ranges(Vector& ranges, bool unicode) @@ -1779,11 +1783,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& if (unicode) { if (try_skip("-")) return { CharClassRangeElement { .code_point = '-', .is_character_class = false } }; - } - if (try_skip("p{") || try_skip("P{")) { - // FIXME: Implement these; unicode properties. - TODO(); + Unicode::Property property {}; + bool negated = false; + if (parse_unicode_property_escape(property, negated)) + return { CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property_escape = true } }; } if (try_skip("d")) @@ -1820,6 +1824,20 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& return read_class_atom_no_dash(); }; + auto empend_atom = [&](auto& atom) { + if (atom.is_character_class) { + if (atom.is_negated) + ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 }); + if (atom.is_property_escape) + ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) }); + else + ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class }); + } else { + VERIFY(!atom.is_negated); + ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, atom.code_point }); + } + }; + while 
(!match(TokenType::RightBracket)) { if (match(TokenType::Eof)) { set_error(Error::MismatchingBracket); @@ -1848,18 +1866,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& set_error(Error::InvalidRange); return false; } + // CharacterRangeOrUnion > !Unicode > CharClass - if (first_atom->is_character_class) - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)first_atom->character_class }); - else - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)first_atom->code_point }); - + empend_atom(*first_atom); ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)'-' }); - - if (second_atom->is_character_class) - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)second_atom->character_class }); - else - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)second_atom->code_point }); + empend_atom(*second_atom); continue; } else { set_error(Error::InvalidRange); @@ -1882,15 +1893,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& read_as_single_atom:; auto atom = first_atom.value(); - - if (atom.is_character_class) { - if (atom.is_negated) - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 }); - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)first_atom.value().character_class }); - } else { - VERIFY(!atom.is_negated); - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, first_atom.value().code_point }); - } + empend_atom(atom); } consume(TokenType::RightBracket, Error::MismatchingBracket); @@ -1898,6 +1901,32 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& return true; } +bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, bool& negated) +{ + negated = false; + + if (try_skip("p")) + negated = false; + else if (try_skip("P")) + negated = 
true; + else + return false; + + auto parsed_property = read_unicode_property_escape(); + if (!parsed_property.has_value()) { + set_error(Error::InvalidNameForProperty); + return false; + } + + if (!Unicode::is_ecma262_property(*parsed_property)) { + set_error(Error::InvalidNameForProperty); + return false; + } + + property = *parsed_property; + return true; +} + StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket) { if (take_starting_angle_bracket && !consume("<")) @@ -1919,6 +1948,24 @@ StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_ return name; } +Optional ECMA262Parser::read_unicode_property_escape() +{ + consume(TokenType::LeftCurly, Error::InvalidPattern); + + auto start_token = m_parser_state.current_token; + size_t offset = 0; + while (match(TokenType::Char)) { + if (m_parser_state.current_token.value() == "}") + break; + offset += consume().value().length(); + } + + consume(TokenType::RightCurly, Error::InvalidPattern); + + StringView property_name { start_token.value().characters_without_null_termination(), offset }; + return Unicode::property_from_string(property_name); +} + bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) { consume(TokenType::LeftParen, Error::InvalidPattern); diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index e3d6835a5bd..a48ad90c4be 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace regex { @@ -212,6 +213,7 @@ private: StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1); Optional read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1); StringView 
read_capture_group_specifier(bool take_starting_angle_bracket = false); + Optional read_unicode_property_escape(); bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named); bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named); @@ -225,6 +227,7 @@ private: bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named); Optional parse_character_class_escape(bool& out_inverse, bool expect_backslash = false); bool parse_nonempty_class_ranges(Vector&, bool unicode); + bool parse_unicode_property_escape(Unicode::Property& property, bool& negated); // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers) bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named); diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 411da0da9f8..f34cfdd3d45 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -222,4 +222,73 @@ bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] P #endif } +bool is_ecma262_property([[maybe_unused]] Property property) +{ +#if ENABLE_UNICODE_DATA + // ECMA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties + // Note: Some of the properties in the above link are not yet parsed by the LibUnicode generator. They are left + // commented out here until they are parsed and can be used.
+ switch (property) { + case Unicode::Property::ASCII: + case Unicode::Property::ASCII_Hex_Digit: + case Unicode::Property::Alphabetic: + case Unicode::Property::Any: + case Unicode::Property::Assigned: + case Unicode::Property::Bidi_Control: + // case Unicode::Property::Bidi_Mirrored: + case Unicode::Property::Case_Ignorable: + case Unicode::Property::Cased: + case Unicode::Property::Changes_When_Casefolded: + case Unicode::Property::Changes_When_Casemapped: + case Unicode::Property::Changes_When_Lowercased: + // case Unicode::Property::Changes_When_NFKC_Casefolded: + case Unicode::Property::Changes_When_Titlecased: + case Unicode::Property::Changes_When_Uppercased: + case Unicode::Property::Dash: + case Unicode::Property::Default_Ignorable_Code_Point: + case Unicode::Property::Deprecated: + case Unicode::Property::Diacritic: + // case Unicode::Property::Emoji: + // case Unicode::Property::Emoji_Component: + // case Unicode::Property::Emoji_Modifier: + // case Unicode::Property::Emoji_Modifier_Base: + // case Unicode::Property::Emoji_Presentation: + // case Unicode::Property::Extended_Pictographic: + case Unicode::Property::Extender: + case Unicode::Property::Grapheme_Base: + case Unicode::Property::Grapheme_Extend: + case Unicode::Property::Hex_Digit: + case Unicode::Property::IDS_Binary_Operator: + case Unicode::Property::IDS_Trinary_Operator: + case Unicode::Property::ID_Continue: + case Unicode::Property::ID_Start: + case Unicode::Property::Ideographic: + case Unicode::Property::Join_Control: + case Unicode::Property::Logical_Order_Exception: + case Unicode::Property::Lowercase: + case Unicode::Property::Math: + case Unicode::Property::Noncharacter_Code_Point: + case Unicode::Property::Pattern_Syntax: + case Unicode::Property::Pattern_White_Space: + case Unicode::Property::Quotation_Mark: + case Unicode::Property::Radical: + case Unicode::Property::Regional_Indicator: + case Unicode::Property::Sentence_Terminal: + case Unicode::Property::Soft_Dotted: + case
Unicode::Property::Terminal_Punctuation: + case Unicode::Property::Unified_Ideograph: + case Unicode::Property::Uppercase: + case Unicode::Property::Variation_Selector: + case Unicode::Property::White_Space: + case Unicode::Property::XID_Continue: + case Unicode::Property::XID_Start: + return true; + default: + return false; + } +#else + return false; +#endif +} + } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index eac6e79293f..46c7a1b8e4e 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -23,5 +23,6 @@ String to_unicode_uppercase_full(StringView const&); Optional property_from_string(StringView const&); bool code_point_has_property(u32 code_point, Property property); +bool is_ecma262_property(Property); }