LibJS: Allow Unicode escape sequences in identifiers

For example, "property.br\u{64}wn" should resolve to "property.brown".

To support this behavior, this commit changes the Token class to hold
both the evaluated identifier name and a view into the original source
for the unevaluated name. There are some contexts in which identifiers
are not allowed to contain Unicode escape sequences; for example, export
statements of the form "export {} from foo.js" forbid escapes in the
identifier "from".

The test file is added to .prettierignore because prettier will replace
all escaped Unicode sequences with their unescaped value.
This commit is contained in:
Timothy Flynn 2021-08-18 16:34:25 -04:00 committed by Andreas Kling
parent c5b5c779ff
commit 1259dc3623
Notes: sideshowbarker 2024-07-18 05:28:09 +09:00
7 changed files with 163 additions and 54 deletions

View file

@ -1,3 +1,3 @@
Base/home/anon/Source/js
Userland/Libraries/LibJS/Tests/eval-aliasing.js
Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js

View file

@ -8,6 +8,7 @@
#include "Lexer.h"
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/GenericLexer.h>
#include <AK/HashMap.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
@ -350,6 +351,8 @@ u32 Lexer::current_code_point() const
if (m_position == 0)
return REPLACEMENT_CHARACTER;
Utf8View utf_8_view { m_source.substring_view(m_position - 1) };
if (utf_8_view.is_empty())
return REPLACEMENT_CHARACTER;
return *utf_8_view.begin();
}
@ -369,30 +372,60 @@ bool Lexer::is_whitespace() const
return false;
}
bool Lexer::is_identifier_start() const
Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
{
if (!is_unicode_character())
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
auto code_point = current_code_point();
GenericLexer lexer(source().substring_view(m_position - 1));
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
if (id_start_category.has_value())
return Unicode::code_point_has_property(code_point, *id_start_category);
return false;
if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
identifier_length = lexer.tell();
return code_point_or_error.value();
}
return {};
}
bool Lexer::is_identifier_middle() const
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{
if (!is_unicode_character())
return is_identifier_start() || is_ascii_digit(m_current_char);
auto code_point = current_code_point();
if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER)
return true;
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
}
if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$')
return code_point;
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category))
return code_point;
return {};
}
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
}
if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER))
return code_point;
static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
if (id_continue_category.has_value())
return Unicode::code_point_has_property(code_point, *id_continue_category);
return false;
if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
return code_point;
return {};
}
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
@ -494,6 +527,9 @@ Token Lexer::next()
// bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
String token_message;
Optional<FlyString> identifier;
size_t identifier_length = 0;
if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
token_type = TokenType::RegexFlags;
while (!is_eof() && is_ascii_alpha(m_current_char))
@ -537,19 +573,26 @@ Token Lexer::next()
else
token_type = TokenType::TemplateLiteralString;
}
} else if (is_identifier_start()) {
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
// identifier or keyword
StringBuilder builder;
do {
consume();
} while (is_identifier_middle());
builder.append_code_point(*code_point);
for (size_t i = 0; i < identifier_length; ++i)
consume();
StringView value = m_source.substring_view(value_start - 1, m_position - value_start);
auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; });
if (it == s_keywords.end()) {
code_point = is_identifier_middle(identifier_length);
} while (code_point.has_value());
identifier = builder.build();
if (!m_parsed_identifiers.contains_slow(*identifier))
m_parsed_identifiers.append(*identifier);
auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
if (it == s_keywords.end())
token_type = TokenType::Identifier;
} else {
else
token_type = it->value;
}
} else if (is_numeric_literal_start()) {
token_type = TokenType::NumericLiteral;
bool is_invalid_numeric_literal = false;
@ -708,15 +751,28 @@ Token Lexer::next()
}
}
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
if (identifier.has_value()) {
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
identifier.release_value(),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
} else {
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
}
if constexpr (LEXER_DEBUG) {
dbgln("------------------------------");

View file

@ -41,8 +41,9 @@ private:
bool is_eof() const;
bool is_line_terminator() const;
bool is_whitespace() const;
bool is_identifier_start() const;
bool is_identifier_middle() const;
Optional<u32> is_unicode_escape(size_t& identifier_length) const;
Optional<u32> is_identifier_start(size_t& identifier_length) const;
Optional<u32> is_identifier_middle(size_t& identifier_length) const;
bool is_line_comment_start(bool line_has_token_yet) const;
bool is_block_comment_start() const;
bool is_block_comment_end() const;
@ -80,6 +81,10 @@ private:
static HashMap<String, TokenType> s_three_char_tokens;
static HashMap<String, TokenType> s_two_char_tokens;
static HashMap<char, TokenType> s_single_char_tokens;
// Resolved identifiers must be kept alive for the duration of the parsing stage, otherwise
// the only references to these strings are deleted by the Token destructor.
Vector<FlyString> m_parsed_identifiers;
};
}

View file

@ -210,7 +210,6 @@ constexpr OperatorPrecedenceTable g_operator_precedence;
Parser::ParserState::ParserState(Lexer l, Program::Type program_type)
: lexer(move(l))
, current_token(TokenType::Invalid, {}, {}, {}, {}, 0, 0, 0)
{
if (program_type == Program::Type::Module)
lexer.disallow_html_comments();
@ -680,7 +679,7 @@ NonnullRefPtr<ClassExpression> Parser::parse_class_expression(bool expect_class_
if (match_property_key()) {
StringView name;
if (!is_generator && m_state.current_token.value() == "static"sv) {
if (!is_generator && m_state.current_token.original_value() == "static"sv) {
if (match(TokenType::Identifier)) {
consume();
is_static = true;
@ -2524,7 +2523,7 @@ NonnullRefPtr<Statement> Parser::parse_for_statement()
{
auto rule_start = push_start();
auto match_for_in_of = [&]() {
return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.value() == "of");
return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.original_value() == "of");
};
consume(TokenType::For);
@ -3019,7 +3018,7 @@ NonnullRefPtr<ImportStatement> Parser::parse_import_statement(Program& program)
};
auto match_as = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv;
return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
};
bool continue_parsing = true;
@ -3134,11 +3133,15 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
syntax_error("Cannot use export statement outside a module");
auto match_as = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv;
return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
};
auto match_from = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "from"sv;
return match(TokenType::Identifier) && m_state.current_token.original_value() == "from"sv;
};
auto match_default = [&] {
return match(TokenType::Default) && m_state.current_token.original_value() == "default"sv;
};
consume(TokenType::Export);
@ -3158,7 +3161,7 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
RefPtr<ASTNode> expression = {};
if (match(TokenType::Default)) {
if (match_default()) {
auto default_position = position();
consume(TokenType::Default);

View file

@ -0,0 +1,19 @@
// NOTE: Per the commit message, this file is excluded from prettier, which
// would otherwise replace the escape sequences below with their unescaped
// values and defeat the purpose of these tests.
test("basic escapes", () => {
var foo = {};
foo.brown = 12389;
expect(foo.brown).toBe(12389);
// \u006f and \u{6f} both evaluate to 'o', so each identifier resolves to "brown".
expect(foo.br\u006fwn).toBe(12389);
expect(foo.br\u{6f}wn).toBe(12389);
// Every code point escaped: \u{62}\u{72}\u{6f}\u{77}\u{6e} spells "brown".
expect(foo.\u{62}\u{72}\u{6f}\u{77}\u{6e}).toBe(12389);
});
test("non-ascii escapes", () => {
var foo = {};
foo.𝓑𝓻𝓸𝔀𝓷 = 12389;
expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
// \ud835\udcf8 is the surrogate-pair encoding of U+1D4F8 ('𝓸').
expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
// The fully-escaped form of the identifier "𝓑𝓻𝓸𝔀𝓷".
expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
});

View file

@ -56,7 +56,7 @@ double Token::double_value() const
StringBuilder builder;
for (auto ch : m_value) {
for (auto ch : value()) {
if (ch == '_')
continue;
builder.append(ch);
@ -75,7 +75,7 @@ double Token::double_value() const
return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 2));
} else if (is_ascii_digit(value_string[1])) {
// also octal, but syntax error in strict mode
if (!m_value.contains('8') && !m_value.contains('9'))
if (!value().contains('8') && !value().contains('9'))
return static_cast<double>(strtoul(value_string.characters() + 1, nullptr, 8));
}
}
@ -95,7 +95,7 @@ String Token::string_value(StringValueStatus& status) const
VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);
auto is_template = type() == TokenType::TemplateLiteralString;
GenericLexer lexer(is_template ? m_value : m_value.substring_view(1, m_value.length() - 2));
GenericLexer lexer(is_template ? value() : value().substring_view(1, value().length() - 2));
auto encoding_failure = [&status](StringValueStatus parse_status) -> String {
status = parse_status;
@ -195,7 +195,7 @@ String Token::string_value(StringValueStatus& status) const
bool Token::bool_value() const
{
VERIFY(type() == TokenType::BoolLiteral);
return m_value == "true";
return value() == "true";
}
bool Token::is_identifier_name() const

View file

@ -6,8 +6,10 @@
#pragma once
#include <AK/FlyString.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <AK/Variant.h>
namespace JS {
@ -172,10 +174,13 @@ enum class TokenCategory {
class Token {
public:
Token() = default;
Token(TokenType type, String message, StringView trivia, StringView value, StringView filename, size_t line_number, size_t line_column, size_t offset)
: m_type(type)
, m_message(message)
, m_trivia(trivia)
, m_original_value(value)
, m_value(value)
, m_filename(filename)
, m_line_number(line_number)
@ -184,6 +189,19 @@ public:
{
}
// Constructor for tokens whose evaluated value differs from the source text,
// i.e. identifiers containing Unicode escape sequences (e.g. "br\u{64}wn"):
// 'original_value' is a view into the unevaluated source, while 'value' is the
// evaluated (unescaped) identifier name, stored as an owning FlyString.
Token(TokenType type, String message, StringView trivia, StringView original_value, FlyString value, StringView filename, size_t line_number, size_t line_column, size_t offset)
: m_type(type)
, m_message(message)
, m_trivia(trivia)
, m_original_value(original_value)
, m_value(move(value))
, m_filename(filename)
, m_line_number(line_number)
, m_line_column(line_column)
, m_offset(offset)
{
}
TokenType type() const { return m_type; }
TokenCategory category() const;
static TokenCategory category(TokenType);
@ -192,7 +210,14 @@ public:
const String& message() const { return m_message; }
const StringView& trivia() const { return m_trivia; }
const StringView& value() const { return m_value; }
const StringView& original_value() const { return m_original_value; }
// Returns the token's (evaluated) value: either a view into the source text,
// or a view of the resolved identifier name when the identifier contained
// Unicode escape sequences. Calling this on a default-constructed token
// (Empty alternative) is a bug, hence VERIFY_NOT_REACHED.
StringView value() const
{
return m_value.visit(
[](StringView const& view) { return view; },
[](FlyString const& identifier) { return identifier.view(); },
[](Empty) -> StringView { VERIFY_NOT_REACHED(); });
}
const StringView& filename() const { return m_filename; }
size_t line_number() const { return m_line_number; }
size_t line_column() const { return m_line_column; }
@ -213,14 +238,15 @@ public:
bool trivia_contains_line_terminator() const;
private:
TokenType m_type;
TokenType m_type { TokenType::Invalid };
String m_message;
StringView m_trivia;
StringView m_value;
StringView m_original_value;
Variant<Empty, StringView, FlyString> m_value { Empty {} };
StringView m_filename;
size_t m_line_number;
size_t m_line_column;
size_t m_offset;
size_t m_line_number { 0 };
size_t m_line_column { 0 };
size_t m_offset { 0 };
};
}