AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.
This commit is contained in:
Max Wipfli 2021-06-03 12:43:08 +02:00 committed by Andreas Kling
parent 97425c7dfb
commit 2164d8aae8
Notes: sideshowbarker 2024-07-18 16:52:00 +09:00

View file

@ -16,6 +16,9 @@
namespace AK {
// NOTE: This is similar to the LibC macro EOF = -1.
constexpr u32 end_of_file = 0xFFFFFFFF;
constexpr bool is_url_code_point(u32 code_point)
{
// FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
@ -221,14 +224,13 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
// NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
// ++iterator : "increase pointer by 1"
// continue : "decrease pointer by 1"
// NOTE: The NULL code point is used as the "EOF code point".
for (;;) {
u32 code_point = 0;
u32 code_point = end_of_file;
if (!iterator.done())
code_point = *iterator;
if constexpr (URL_PARSER_DEBUG) {
if (!code_point)
if (code_point == end_of_file)
dbgln("URLParser::parse: {} state with EOF.", state_name(state));
else if (is_ascii_printable(code_point))
dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
@ -335,7 +337,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
} else if (code_point == '#') {
url.m_fragment = "";
state = State::Fragment;
} else if (code_point != 0) {
} else if (code_point != end_of_file) {
url.m_query = {};
if (url.m_paths.size())
url.m_paths.remove(url.m_paths.size() - 1);
@ -408,7 +410,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
}
}
buffer.clear();
} else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
if (at_sign_seen && buffer.is_empty()) {
report_validation_error();
return {};
@ -434,7 +436,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
url.m_host = host.release_value();
buffer.clear();
state = State::Port;
} else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
if (url.is_special() && buffer.is_empty()) {
report_validation_error();
return {};
@ -457,7 +459,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
case State::Port:
if (is_ascii_digit(code_point)) {
buffer.append_code_point(code_point);
} else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
if (!buffer.is_empty()) {
auto port = buffer.to_string().to_uint();
if (!port.has_value() || port.value() > 65535) {
@ -494,7 +496,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
} else if (code_point == '#') {
url.m_fragment = "";
state = State::Fragment;
} else if (code_point != 0) {
} else if (code_point != end_of_file) {
url.m_query = {};
auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
if (!starts_with_windows_drive_letter(substring_from_pointer)) {
@ -524,7 +526,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
}
break;
case State::FileHost:
if (code_point == 0 || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
if (is_windows_drive_letter(buffer.to_string())) {
report_validation_error();
state = State::Path;
@ -559,14 +561,14 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
} else if (code_point == '#') {
url.m_fragment = "";
state = State::Fragment;
} else if (code_point != 0) {
} else if (code_point != end_of_file) {
state = State::Path;
if (code_point != '/')
continue;
}
break;
case State::Path:
if (code_point == 0 || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
if (code_point == end_of_file || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
if (url.is_special() && code_point == '\\')
report_validation_error();
if (is_double_dot_path_segment(buffer.to_string())) {
@ -616,10 +618,10 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
url.m_fragment = "";
state = State::Fragment;
} else {
if (code_point != 0 && !is_url_code_point(code_point) && code_point != '%')
if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if (code_point != 0) {
if (code_point != end_of_file) {
URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
} else {
// NOTE: This needs to be percent decoded since the member variables contain decoded data.
@ -628,7 +630,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
}
break;
case State::Query:
if (code_point == '#' || code_point == 0) {
if (code_point == end_of_file || code_point == '#') {
VERIFY(url.m_query == "");
auto query_percent_encode_set = url.is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
// NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
@ -638,7 +640,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
url.m_fragment = "";
state = State::Fragment;
}
} else if (code_point != 0) {
} else if (code_point != end_of_file) {
if (!is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
@ -647,7 +649,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
break;
case State::Fragment:
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
if (code_point) {
if (code_point != end_of_file) {
if (!is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.