From 598dc74a7607fd6c0cbf31ce614f1d30b7164aae Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Wed, 20 Jul 2022 23:22:07 +0430 Subject: [PATCH] LibRegex: Partially implement the ECMAScript unicodeSets proposal This skips the new string unicode properties additions, along with \q{}. --- Tests/LibRegex/Regex.cpp | 39 ++ Userland/Libraries/LibC/regex.h | 46 ++- Userland/Libraries/LibRegex/RegexByteCode.cpp | 118 +++++- Userland/Libraries/LibRegex/RegexByteCode.h | 5 +- Userland/Libraries/LibRegex/RegexError.h | 39 +- .../Libraries/LibRegex/RegexOptimizer.cpp | 40 +- Userland/Libraries/LibRegex/RegexOptions.h | 2 + Userland/Libraries/LibRegex/RegexParser.cpp | 379 +++++++++++++++++- Userland/Libraries/LibRegex/RegexParser.h | 12 + 9 files changed, 611 insertions(+), 69 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index c5d153b0aa3..daeb623c433 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -765,6 +765,45 @@ TEST_CASE(ECMA262_unicode_match) } } +TEST_CASE(ECMA262_unicode_sets_match) +{ + struct _test { + StringView pattern; + StringView subject; + bool matches { true }; + ECMAScriptFlags options {}; + }; + + constexpr _test tests[] { + { "[\\w--x]"sv, "x"sv, false }, + { "[\\w&&x]"sv, "y"sv, false }, + { "[\\w--x]"sv, "y"sv, true }, + { "[\\w&&x]"sv, "x"sv, true }, + { "[[0-9\\w]--x--6]"sv, "6"sv, false }, + { "[[0-9\\w]--x--6]"sv, "x"sv, false }, + { "[[0-9\\w]--x--6]"sv, "y"sv, true }, + { "[[0-9\\w]--x--6]"sv, "9"sv, true }, + { "[\\w&&\\d]"sv, "a"sv, false }, + { "[\\w&&\\d]"sv, "4"sv, true }, + }; + + for (auto& test : tests) { + Regex re(test.pattern, (ECMAScriptFlags)regex::AllFlags::UnicodeSets | test.options); + if constexpr (REGEX_DEBUG) { + dbgln("\n"); + RegexDebug regex_dbg(stderr); + regex_dbg.print_raw_bytecode(re); + regex_dbg.print_header(); + regex_dbg.print_bytecode(re); + dbgln("\n"); + } + + EXPECT_EQ(re.parser_result.error, regex::Error::NoError); + auto result = 
re.match(test.subject).success; + EXPECT_EQ(result, test.matches); + } +} + TEST_CASE(ECMA262_property_match) { struct _test { diff --git a/Userland/Libraries/LibC/regex.h b/Userland/Libraries/LibC/regex.h index d51c138a656..a850a43ae84 100644 --- a/Userland/Libraries/LibC/regex.h +++ b/Userland/Libraries/LibC/regex.h @@ -21,24 +21,25 @@ typedef struct { enum __Regex_Error { __Regex_NoError, - __Regex_InvalidPattern, // Invalid regular expression. - __Regex_InvalidCollationElement, // Invalid collating element referenced. - __Regex_InvalidCharacterClass, // Invalid character class type referenced. - __Regex_InvalidTrailingEscape, // Trailing \ in pattern. - __Regex_InvalidNumber, // Number in \digit invalid or in error. - __Regex_MismatchingBracket, // [ ] imbalance. - __Regex_MismatchingParen, // ( ) imbalance. - __Regex_MismatchingBrace, // { } imbalance. - __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second. - __Regex_InvalidBracketContent, // Content of [] invalid. - __Regex_InvalidRange, // Invalid endpoint in range expression. - __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression. - __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached. - __Regex_EmptySubExpression, // Sub expression has empty content. - __Regex_InvalidCaptureGroup, // Content of capture group is invalid. - __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid. - __Regex_InvalidNameForProperty, // Name of property is invalid. - __Regex_DuplicateNamedCapture, // Duplicate named capture group + __Regex_InvalidPattern, // Invalid regular expression. + __Regex_InvalidCollationElement, // Invalid collating element referenced. + __Regex_InvalidCharacterClass, // Invalid character class type referenced. + __Regex_InvalidTrailingEscape, // Trailing \ in pattern. + __Regex_InvalidNumber, // Number in \digit invalid or in error. 
+ __Regex_MismatchingBracket, // [ ] imbalance. + __Regex_MismatchingParen, // ( ) imbalance. + __Regex_MismatchingBrace, // { } imbalance. + __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second. + __Regex_InvalidBracketContent, // Content of [] invalid. + __Regex_InvalidRange, // Invalid endpoint in range expression. + __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression. + __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached. + __Regex_EmptySubExpression, // Sub expression has empty content. + __Regex_InvalidCaptureGroup, // Content of capture group is invalid. + __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid. + __Regex_InvalidNameForProperty, // Name of property is invalid. + __Regex_DuplicateNamedCapture, // Duplicate named capture group + __Regex_InvalidCharacterClassEscape, // Invalid escaped entity in character class. }; enum ReError { @@ -82,10 +83,11 @@ enum __RegexAllFlags { __Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one. __Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results. __Regex_SingleMatch = __Regex_Global << 14, // Stop after acquiring a single match. - __Regex_Internal_Stateful = __Regex_Global << 15, // Internal flag; enables stateful matches. - __Regex_Internal_BrowserExtended = __Regex_Global << 16, // Internal flag; enable browser-specific ECMA262 extensions. - __Regex_Internal_ConsiderNewline = __Regex_Global << 17, // Internal flag; allow matchers to consider newlines as line separators. - __Regex_Last = __Regex_SingleMatch + __Regex_UnicodeSets = __Regex_Global << 15, // ECMA262 Parser specific: Allow set operations in char classes. + __Regex_Internal_Stateful = __Regex_Global << 16, // Internal flag; enables stateful matches. 
+ __Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions. + __Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators. + __Regex_Last = __Regex_UnicodeSets, }; // Values for the cflags parameter to the regcomp() function: diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index f40d1c25d24..ebcefd21ff1 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -435,6 +435,20 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M bool inverse { false }; bool temporary_inverse { false }; bool reset_temp_inverse { false }; + struct DisjunctionState { + bool active { false }; + bool is_conjunction { false }; + bool fail { false }; + size_t initial_position; + size_t initial_code_unit_position; + Optional last_accepted_position {}; + Optional last_accepted_code_unit_position {}; + }; + + Vector disjunction_states; + disjunction_states.empend(); + + auto current_disjunction_state = [&]() -> DisjunctionState& { return disjunction_states.last(); }; auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; }; @@ -602,16 +616,69 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M auto script = static_cast(m_bytecode->at(offset++)); compare_script_extension(input, state, script, current_inversion_state(), inverse_matched); + } else if (compare_type == CharacterCompareType::And) { + disjunction_states.append({ + .active = true, + .is_conjunction = false, + .fail = false, + .initial_position = state.string_position, + .initial_code_unit_position = state.string_position_in_code_units, + }); + continue; + + } else if (compare_type == CharacterCompareType::Or) { + disjunction_states.append({ + .active = true, + .is_conjunction = true, + .fail = true, + 
.initial_position = state.string_position, + .initial_code_unit_position = state.string_position_in_code_units, + }); + continue; + + } else if (compare_type == CharacterCompareType::EndAndOr) { + auto disjunction_state = disjunction_states.take_last(); + if (!disjunction_state.fail) { + state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position); + state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position); + } + } else { warnln("Undefined comparison: {}", (int)compare_type); VERIFY_NOT_REACHED(); break; } - if (current_inversion_state() && !inverse && !inverse_matched) { + auto& new_disjunction_state = current_disjunction_state(); + if (current_inversion_state() && (!inverse || new_disjunction_state.active) && !inverse_matched) { advance_string_position(state, input.view); inverse_matched = true; } + + if (new_disjunction_state.active) { + auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length(); + + if (!failed) { + new_disjunction_state.last_accepted_position = state.string_position; + new_disjunction_state.last_accepted_code_unit_position = state.string_position_in_code_units; + } + + if (new_disjunction_state.is_conjunction) + new_disjunction_state.fail = failed && new_disjunction_state.fail; + else + new_disjunction_state.fail = failed || new_disjunction_state.fail; + + state.string_position = new_disjunction_state.initial_position; + state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position; + } + } + + auto& new_disjunction_state = current_disjunction_state(); + if (new_disjunction_state.active) { + if (!new_disjunction_state.fail) { + state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position); + state.string_position_in_code_units = 
new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position); + } } if (current_inversion_state() && !inverse_matched) @@ -843,6 +910,12 @@ Vector OpCode_Compare::flat_compares() const auto count = m_bytecode->at(offset++); for (size_t i = 0; i < count; ++i) result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) }); + } else if (compare_type == CharacterCompareType::GeneralCategory + || compare_type == CharacterCompareType::Property + || compare_type == CharacterCompareType::Script + || compare_type == CharacterCompareType::ScriptExtension) { + auto value = m_bytecode->at(offset++); + result.append({ compare_type, value }); } else { result.append({ compare_type, 0 }); } @@ -867,39 +940,39 @@ Vector OpCode_Compare::variable_arguments_to_string(Optional auto ch = m_bytecode->at(offset++); auto is_ascii = is_ascii_printable(ch); if (is_ascii) - result.empend(String::formatted("value='{:c}'", static_cast(ch))); + result.empend(String::formatted(" value='{:c}'", static_cast(ch))); else - result.empend(String::formatted("value={:x}", ch)); + result.empend(String::formatted(" value={:x}", ch)); if (!view.is_null() && view.length() > string_start_offset) { if (is_ascii) { result.empend(String::formatted( - "compare against: '{}'", + " compare against: '{}'", view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string())); } else { auto str = view.substring_view(string_start_offset, string_start_offset > view.length() ? 
0 : 1).to_string(); u8 buf[8] { 0 }; __builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf))); - result.empend(String::formatted("compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}", + result.empend(String::formatted(" compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7])); } } } else if (compare_type == CharacterCompareType::Reference) { auto ref = m_bytecode->at(offset++); - result.empend(String::formatted("number={}", ref)); + result.empend(String::formatted(" number={}", ref)); if (input.has_value()) { if (state().capture_group_matches.size() > input->match_index) { auto& match = state().capture_group_matches[input->match_index]; if (match.size() > ref) { auto& group = match[ref]; - result.empend(String::formatted("left={}", group.left_column)); - result.empend(String::formatted("right={}", group.left_column + group.view.length_in_code_units())); - result.empend(String::formatted("contents='{}'", group.view)); + result.empend(String::formatted(" left={}", group.left_column)); + result.empend(String::formatted(" right={}", group.left_column + group.view.length_in_code_units())); + result.empend(String::formatted(" contents='{}'", group.view)); } else { - result.empend(String::formatted("(invalid ref, max={})", match.size() - 1)); + result.empend(String::formatted(" (invalid ref, max={})", match.size() - 1)); } } else { - result.empend(String::formatted("(invalid index {}, max={})", input->match_index, state().capture_group_matches.size() - 1)); + result.empend(String::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches.size() - 1)); } } } else if (compare_type == CharacterCompareType::String) { @@ -907,35 +980,42 @@ Vector OpCode_Compare::variable_arguments_to_string(Optional StringBuilder str_builder; for (size_t i = 0; i < length; ++i) str_builder.append(m_bytecode->at(offset++)); - result.empend(String::formatted("value=\"{}\"", 
str_builder.string_view().substring_view(0, length))); + result.empend(String::formatted(" value=\"{}\"", str_builder.string_view().substring_view(0, length))); if (!view.is_null() && view.length() > state().string_position) result.empend(String::formatted( - "compare against: \"{}\"", + " compare against: \"{}\"", input.value().view.substring_view(string_start_offset, string_start_offset + length > view.length() ? 0 : length).to_string())); } else if (compare_type == CharacterCompareType::CharClass) { auto character_class = (CharClass)m_bytecode->at(offset++); - result.empend(String::formatted("ch_class={} [{}]", (size_t)character_class, character_class_name(character_class))); + result.empend(String::formatted(" ch_class={} [{}]", (size_t)character_class, character_class_name(character_class))); if (!view.is_null() && view.length() > state().string_position) result.empend(String::formatted( - "compare against: '{}'", + " compare against: '{}'", input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string())); } else if (compare_type == CharacterCompareType::CharRange) { auto value = (CharRange)m_bytecode->at(offset++); - result.empend(String::formatted("ch_range={:x}-{:x}", value.from, value.to)); + result.empend(String::formatted(" ch_range={:x}-{:x}", value.from, value.to)); if (!view.is_null() && view.length() > state().string_position) result.empend(String::formatted( - "compare against: '{}'", + " compare against: '{}'", input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 
0 : 1).to_string())); } else if (compare_type == CharacterCompareType::LookupTable) { auto count = m_bytecode->at(offset++); for (size_t j = 0; j < count; ++j) { auto range = (CharRange)m_bytecode->at(offset++); - result.append(String::formatted("{:x}-{:x}", range.from, range.to)); + result.append(String::formatted(" {:x}-{:x}", range.from, range.to)); } if (!view.is_null() && view.length() > state().string_position) result.empend(String::formatted( - "compare against: '{}'", + " compare against: '{}'", input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string())); + } else if (compare_type == CharacterCompareType::GeneralCategory + || compare_type == CharacterCompareType::Property + || compare_type == CharacterCompareType::Script + || compare_type == CharacterCompareType::ScriptExtension) { + + auto value = m_bytecode->at(offset++); + result.empend(String::formatted(" value={}", value)); } } return result; diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index 8b5b81ca3d6..2c4ab3aa8a4 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -76,7 +76,10 @@ enum class OpCodeId : ByteCodeValueType { __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) + __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(And) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Or) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr) enum class CharacterCompareType : ByteCodeValueType { #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x, diff --git a/Userland/Libraries/LibRegex/RegexError.h b/Userland/Libraries/LibRegex/RegexError.h index 33dc11ae9fb..040f4b71679 100644 --- a/Userland/Libraries/LibRegex/RegexError.h +++ 
b/Userland/Libraries/LibRegex/RegexError.h @@ -18,24 +18,25 @@ namespace regex { enum class Error : u8 { NoError = __Regex_NoError, - InvalidPattern = __Regex_InvalidPattern, // Invalid regular expression. - InvalidCollationElement = __Regex_InvalidCollationElement, // Invalid collating element referenced. - InvalidCharacterClass = __Regex_InvalidCharacterClass, // Invalid character class type referenced. - InvalidTrailingEscape = __Regex_InvalidTrailingEscape, // Trailing \ in pattern. - InvalidNumber = __Regex_InvalidNumber, // Number in \digit invalid or in error. - MismatchingBracket = __Regex_MismatchingBracket, // [ ] imbalance. - MismatchingParen = __Regex_MismatchingParen, // ( ) imbalance. - MismatchingBrace = __Regex_MismatchingBrace, // { } imbalance. - InvalidBraceContent = __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second. - InvalidBracketContent = __Regex_InvalidBracketContent, // Content of [] invalid. - InvalidRange = __Regex_InvalidRange, // Invalid endpoint in range expression. - InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression. - ReachedMaxRecursion = __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached. - EmptySubExpression = __Regex_EmptySubExpression, // Sub expression has empty content. - InvalidCaptureGroup = __Regex_InvalidCaptureGroup, // Content of capture group is invalid. - InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid. - InvalidNameForProperty = __Regex_InvalidNameForProperty, // Name of property is invalid. - DuplicateNamedCapture = __Regex_DuplicateNamedCapture, // Name of property is invalid. + InvalidPattern = __Regex_InvalidPattern, // Invalid regular expression. + InvalidCollationElement = __Regex_InvalidCollationElement, // Invalid collating element referenced. 
+ InvalidCharacterClass = __Regex_InvalidCharacterClass, // Invalid character class type referenced. + InvalidTrailingEscape = __Regex_InvalidTrailingEscape, // Trailing \ in pattern. + InvalidNumber = __Regex_InvalidNumber, // Number in \digit invalid or in error. + MismatchingBracket = __Regex_MismatchingBracket, // [ ] imbalance. + MismatchingParen = __Regex_MismatchingParen, // ( ) imbalance. + MismatchingBrace = __Regex_MismatchingBrace, // { } imbalance. + InvalidBraceContent = __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second. + InvalidBracketContent = __Regex_InvalidBracketContent, // Content of [] invalid. + InvalidRange = __Regex_InvalidRange, // Invalid endpoint in range expression. + InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression. + ReachedMaxRecursion = __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached. + EmptySubExpression = __Regex_EmptySubExpression, // Sub expression has empty content. + InvalidCaptureGroup = __Regex_InvalidCaptureGroup, // Content of capture group is invalid. + InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid. + InvalidNameForProperty = __Regex_InvalidNameForProperty, // Name of property is invalid. + DuplicateNamedCapture = __Regex_DuplicateNamedCapture, // Duplicate named capture group. + InvalidCharacterClassEscape = __Regex_InvalidCharacterClassEscape, // Invalid escaped entity in character class. 
}; inline String get_error_string(Error error) @@ -79,6 +80,8 @@ inline String get_error_string(Error error) return "Name of property is invalid."; case Error::DuplicateNamedCapture: return "Duplicate capture group name"; + case Error::InvalidCharacterClassEscape: + return "Invalid escaped entity in character class."; } return "Undefined error."; } diff --git a/Userland/Libraries/LibRegex/RegexOptimizer.cpp b/Userland/Libraries/LibRegex/RegexOptimizer.cpp index 5fa27a242e1..0eb46ed600d 100644 --- a/Userland/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Userland/Libraries/LibRegex/RegexOptimizer.cpp @@ -205,6 +205,9 @@ static bool has_overlap(Vector const& lhs, Vector const& lhs, Vector& table, CompareTypeAndValuePair pair) @@ -806,11 +814,16 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree 0) { reset(); - return parse_pattern(stack, match_length_minimum, { .unicode = false, .named = true }); + return parse_pattern(stack, match_length_minimum, { .unicode = false, .named = true, .unicode_sets = false }); } if (!res) @@ -1136,7 +1137,7 @@ bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, Parse size_t match_length_minimum = 0; if (try_skip("="sv)) { - if (!parse_inner_disjunction(assertion_stack, match_length_minimum, { .unicode = false, .named = flags.named })) + if (!parse_inner_disjunction(assertion_stack, match_length_minimum, { .unicode = false, .named = flags.named, .unicode_sets = false })) return false; stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead); @@ -1149,7 +1150,7 @@ bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, Parse exit_capture_group_scope(); } }; - if (!parse_inner_disjunction(assertion_stack, match_length_minimum, { .unicode = false, .named = flags.named })) + if (!parse_inner_disjunction(assertion_stack, match_length_minimum, { .unicode = false, .named = flags.named, .unicode_sets = false })) return false; 
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead); @@ -1756,6 +1757,7 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_ compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); } + // ClassContents :: [empty] if (match(TokenType::RightBracket)) { consume(); // Should only have at most an 'Inverse' @@ -1764,7 +1766,12 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_ return true; } - if (!parse_nonempty_class_ranges(compares, flags)) + // ClassContents :: [~UnicodeSetsMode] NonemptyClassRanges[?UnicodeMode] + if (!flags.unicode_sets && !parse_nonempty_class_ranges(compares, flags)) + return false; + + // ClassContents :: [+UnicodeSetsMode] ClassSetExpression + if (flags.unicode_sets && !parse_class_set_expression(compares)) return false; match_length_minimum += 1; @@ -2029,6 +2036,364 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& return true; } +bool ECMA262Parser::parse_class_set_expression(Vector& compares) +{ + auto start_position = tell(); + + // ClassSetExpression :: ClassUnion | ClassIntersection | ClassSubtraction + if (parse_class_subtraction(compares)) { + consume(TokenType::RightBracket, Error::MismatchingBracket); + return true; + } + if (has_error()) + return false; + + back(tell() - start_position + 1); + if (parse_class_intersection(compares)) { + consume(TokenType::RightBracket, Error::MismatchingBracket); + return true; + } + if (has_error()) + return false; + + back(tell() - start_position + 1); + if (parse_class_union(compares)) { + consume(TokenType::RightBracket, Error::MismatchingBracket); + return true; + } + + return false; +} + +bool ECMA262Parser::parse_class_union(Vector& compares) +{ + auto start_position = tell(); + ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } }; + + auto first = true; + + // ClassUnion :: ClassSetRange ClassUnion[opt] | ClassSetOperand 
ClassUnion[opt] + for (;;) { + if (!parse_class_set_range(compares)) { + if (has_error() || match(TokenType::RightBracket)) + break; + + if (!parse_class_set_operand(compares)) { + if (first || has_error()) + return false; + break; + } + } + first = false; + } + + restore_position.disarm(); + return !has_error(); +} + +bool ECMA262Parser::parse_class_intersection(Vector& compares) +{ + // ClassIntersection :: ClassSetOperand "&&" [lookahead != "&"] ClassSetOperand + // | ClassIntersection "&&" [lookahead != "&"] ClassSetOperand + Vector lhs; + Vector rhs; + + auto start_position = tell(); + ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } }; + + if (!parse_class_set_operand(lhs)) + return false; + + if (!try_skip("&&"sv)) + return false; + + compares.append({ CharacterCompareType::And, 0 }); + compares.extend(move(lhs)); + + do { + rhs.clear_with_capacity(); + if (!parse_class_set_operand(rhs)) + return false; + + compares.extend(rhs); + + if (try_skip("&&&"sv)) + return false; + } while (!has_error() && try_skip("&&"sv)); + + compares.append({ CharacterCompareType::EndAndOr, 0 }); + + restore_position.disarm(); + return true; +} + +bool ECMA262Parser::parse_class_subtraction(Vector& compares) +{ + // ClassSubtraction :: ClassSetOperand "--" ClassSetOperand | ClassSubtraction "--" ClassSetOperand + Vector lhs; + Vector rhs; + + auto start_position = tell(); + ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } }; + + if (!parse_class_set_operand(lhs)) + return false; + + if (!try_skip("--"sv)) + return false; + + compares.append({ CharacterCompareType::And, 0 }); + compares.extend(move(lhs)); + + do { + rhs.clear_with_capacity(); + if (!parse_class_set_operand(rhs)) + return false; + + compares.append({ CharacterCompareType::TemporaryInverse, 0 }); + compares.extend(rhs); + } while (!has_error() && try_skip("--"sv)); + + compares.append({ CharacterCompareType::EndAndOr, 0 }); + + restore_position.disarm(); + 
return true; +} + +bool ECMA262Parser::parse_class_set_range(Vector& compares) +{ + // ClassSetRange :: ClassSetCharacter "-" ClassSetCharacter + auto start_position = tell(); + ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } }; + + auto lhs = parse_class_set_character(); + if (!lhs.has_value()) + return false; + + if (!match(TokenType::HyphenMinus)) + return false; + consume(); + + auto rhs = parse_class_set_character(); + if (!rhs.has_value()) + return false; + + compares.append({ + CharacterCompareType::CharRange, + CharRange { lhs.value(), rhs.value() }, + }); + restore_position.disarm(); + return true; +} + +Optional ECMA262Parser::parse_class_set_character() +{ + // ClassSetCharacter :: [lookahead ∉ ClassSetReservedDoublePunctuator] SourceCharacter but not ClassSetSyntaxCharacter + // | "\" CharacterEscape[+UnicodeMode] + // | "\" ClassSetReservedPunctuator + // | "\" b + // ClassSetReservedDoublePunctuator :: one of "&&" "!!" "##" "$$" "%%" "**" "++" ",," ".." "::" ";;" "<<" "==" ">>" "??" "@@" "^^" "``" "~~" + // ClassSetSyntaxCharacter :: one of "(" ")" "{" "}" "[" "]" "/" "-" "\" "|" + // ClassSetReservedPunctuator :: one of "&" "-" "!" 
"#" "%" "," ":" ";" "<" "=" ">" "@" "`" "~" + + constexpr auto class_set_reserved_double_punctuator = Array { + "&&"sv, "!!"sv, "##"sv, "$$"sv, "%%"sv, "**"sv, "++"sv, ",,"sv, ".."sv, "::"sv, ";;"sv, "<<"sv, "=="sv, ">>"sv, "??"sv, "@@"sv, "^^"sv, "``"sv, "~~"sv + }; + + auto start_position = tell(); + ArmedScopeGuard restore { [&] { back(tell() - start_position + 1); } }; + + if (try_skip("\\"sv)) { + if (done()) { + set_error(Error::InvalidTrailingEscape); + return {}; + } + + // "\" ClassSetReservedPunctuator + for (auto const& reserved : class_set_reserved_double_punctuator) { + if (try_skip(reserved)) { + // "\" ClassSetReservedPunctuator (ClassSetReservedPunctuator) + back(); + + restore.disarm(); + return reserved[0]; + } + } + // "\" b + if (try_skip("b"sv)) { + restore.disarm(); + return '\b'; + } + + // "\" CharacterEscape[+UnicodeMode] + Vector compares; + size_t minimum_length = 0; + if (parse_character_escape(compares, minimum_length, { .unicode = true })) { + VERIFY(compares.size() == 1); + auto& compare = compares.first(); + VERIFY(compare.type == CharacterCompareType::Char); + restore.disarm(); + return compare.value; + } + + return {}; + } + + // [lookahead ∉ ClassSetReservedDoublePunctuator] SourceCharacter but not ClassSetSyntaxCharacter + auto lookahead_matches = any_of(class_set_reserved_double_punctuator, [this](auto& reserved) { + return try_skip(reserved); + }); + + if (lookahead_matches) + return {}; + + for (auto character : { "("sv, ")"sv, "{"sv, "}"sv, "["sv, "]"sv, "/"sv, "-"sv, "\\"sv, "|"sv }) { + if (try_skip(character)) + return {}; + } + + restore.disarm(); + return skip(); +} + +bool ECMA262Parser::parse_class_set_operand(Vector& compares) +{ + auto start_position = tell(); + + // ClassSetOperand :: ClassSetCharacter | ClassStringDisjunction | NestedClass + if (auto character = parse_class_set_character(); character.has_value()) { + compares.append({ CharacterCompareType::Char, character.value() }); + return true; + } + + // 
NestedClass :: "[" [lookahead != "^"] ClassContents[+UnicodeMode +UnicodeSetsMode] "]" + // | "[" "^" ClassContents[+UnicodeMode +UnicodeSetsMode] "]" + // | "\" CharacterClassEscape[+UnicodeMode] + if (parse_nested_class(compares)) + return true; + + if (has_error()) + return false; + + auto negated = false; + if (auto ch = parse_character_class_escape(negated, true); ch.has_value()) { + if (negated) + compares.append({ CharacterCompareType::TemporaryInverse, 1 }); + compares.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch.value() }); + return true; + } + + PropertyEscape property {}; + if (parse_unicode_property_escape(property, negated)) { + if (negated) + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); + property.visit( + [&](Unicode::Property property) { + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property }); + }, + [&](Unicode::GeneralCategory general_category) { + compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category }); + }, + [&](Script script) { + if (script.is_extension) + compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script }); + else + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script }); + }, + [](Empty&) { VERIFY_NOT_REACHED(); }); + return true; + } + + if (has_error()) + return false; + + // ClassStringDisjunction :: "\q{" ClassStringDisjunctionContents "}" + // ClassStringDisjunctionContents :: ClassString | ClassString "|" ClassStringDisjunctionContents + // ClassString :: [empty] | NonEmptyClassString + // NonEmptyClassString :: ClassCharacter NonEmptyClassString[opt] + if (try_skip("\\q{"sv)) { + // FIXME: Implement this :P + return set_error(Error::InvalidCharacterClass); + } + + back(tell() - start_position + 1); + return false; +} + +bool 
ECMA262Parser::parse_nested_class(Vector& compares) +{ + auto start_position = tell(); + + // NestedClass :: "[" [lookahead ≠ ^ ] ClassContents [+UnicodeMode, +UnicodeSetsMode] "]" + // | "[" "^" ClassContents[+UnicodeMode, +UnicodeSetsMode] "]" + // | "\" CharacterClassEscape[+UnicodeMode] + + if (match(TokenType::LeftBracket)) { + consume(); + + compares.append(CompareTypeAndValuePair { CharacterCompareType::Or, 0 }); + + if (match(TokenType::Circumflex)) { + // Negated charclass + consume(); + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); + } + + // ClassContents :: [empty] + if (match(TokenType::RightBracket)) { + consume(); + // Should only have at most an 'Inverse' (after an 'Or') + VERIFY(compares.size() <= 2); + compares.append(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 }); + return true; + } + + // ClassContents :: [+UnicodeSetsMode] ClassSetExpression + if (!parse_class_set_expression(compares)) + return false; + + compares.append(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 }); + return true; + } + + if (try_skip("\\"sv)) { + auto negated = false; + if (auto char_class = parse_character_class_escape(negated); char_class.has_value()) { + if (negated) + compares.append({ CharacterCompareType::TemporaryInverse, 1 }); + compares.append({ CharacterCompareType::CharClass, (ByteCodeValueType)char_class.value() }); + return true; + } + + PropertyEscape property {}; + if (parse_unicode_property_escape(property, negated)) { + if (negated) + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); + property.visit( + [&](Unicode::Property property) { + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property }); + }, + [&](Unicode::GeneralCategory general_category) { + compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category }); + }, + [&](Script script) { + if 
(script.is_extension) + compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script }); + else + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script }); + }, + [](Empty&) { VERIFY_NOT_REACHED(); }); + return true; + } + + if (has_error()) + return false; + } + + back(tell() - start_position + 1); + return false; +} + bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated) { negated = false; diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 3dbe5438d34..a6b9948ea96 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -92,6 +92,8 @@ protected: ALWAYS_INLINE bool done() const; ALWAYS_INLINE bool set_error(Error error); + size_t tell() const { return m_parser_state.current_token.position(); } + struct NamedCaptureGroup { size_t group_index { 0 }; size_t minimum_length { 0 }; @@ -223,6 +225,7 @@ private: struct ParseFlags { bool unicode { false }; bool named { false }; + bool unicode_sets { false }; }; enum class ReadDigitsInitialZeroState { @@ -257,6 +260,15 @@ private: bool parse_character_escape(Vector&, size_t&, ParseFlags); + bool parse_class_set_expression(Vector&); + bool parse_class_union(Vector&); + bool parse_class_intersection(Vector&); + bool parse_class_subtraction(Vector&); + bool parse_class_set_range(Vector&); + bool parse_class_set_operand(Vector&); + bool parse_nested_class(Vector&); + Optional parse_class_set_character(); + // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers) bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags); bool parse_extended_atom(ByteCode&, size_t&, ParseFlags);