From 77718c0a6643cee2d2a186cce7f410965ce27b85 Mon Sep 17 00:00:00 2001
From: Andrew Kaster
Date: Wed, 28 Aug 2024 21:25:42 -0600
Subject: [PATCH] LibWeb: Implement the Data state for the Swift tokenizer

And add tests!

This implementation closely follows the current C++ implementation,
replacing macros and gotos with a slightly more complex state machine.

It's very possible that an async version that yields tokens on "emit"
would be even simpler, but let's get this one working first :).
---
 Tests/LibWeb/TestHTMLTokenizerSwift.swift     | 101 +++++++-
 .../LibWeb/HTML/Parser/HTMLToken.swift        |  16 +-
 .../LibWeb/HTML/Parser/HTMLTokenizer.swift    | 235 +++++++++++++++++-
 3 files changed, 331 insertions(+), 21 deletions(-)

diff --git a/Tests/LibWeb/TestHTMLTokenizerSwift.swift b/Tests/LibWeb/TestHTMLTokenizerSwift.swift
index e7e92bae417..b6be94539de 100644
--- a/Tests/LibWeb/TestHTMLTokenizerSwift.swift
+++ b/Tests/LibWeb/TestHTMLTokenizerSwift.swift
@@ -5,8 +5,8 @@
  */
 
 import AK
-import Web
 import Testing
+import Web
 
 @Suite
 struct TestHTMLTokenizerSwift {
@@ -30,4 +30,103 @@ struct TestHTMLTokenizerSwift {
             #expect(!token.isParserWhitespace())
         }
     }
+
+    @Test func dataStateNoInput() {
+        let tokenizer = HTMLTokenizer()
+        #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
+
+        let token = tokenizer.nextToken()
+        #expect(token?.type == .EndOfFile)
+
+        let token2 = tokenizer.nextToken()
+        #expect(token2 == nil)
+        #expect(tokenizer.state == HTMLTokenizer.State.Data)
+    }
+
+    @Test func dataStateSingleChar() {
+        guard let tokenizer = HTMLTokenizer(input: "X") else {
+            Issue.record("Failed to create tokenizer for 'X'")
+            return
+        }
+        #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
+
+        let token = tokenizer.nextToken()
+        #expect(token?.type == .Character(codePoint: "X"))
+
+        let token2 = tokenizer.nextToken()
+        #expect(token2?.type == .EndOfFile)
+
+        let token3 = tokenizer.nextToken()
+        #expect(token3 == nil)
+        #expect(tokenizer.state == HTMLTokenizer.State.Data)
+    }
+
+    @Test func dataStateAmpersand() {
+        guard let tokenizer = HTMLTokenizer(input: "&") else {
+            Issue.record("Failed to create tokenizer for '&'")
+            return
+        }
+        #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
+
+        let token = tokenizer.nextToken()
+        #expect(token?.type == .EndOfFile)
+        #expect(tokenizer.state == HTMLTokenizer.State.CharacterReference)
+
+        let token2 = tokenizer.nextToken()
+        #expect(token2 == nil)
+    }
+
+    @Test func dataStateTagOpen() {
+        guard let tokenizer = HTMLTokenizer(input: "<") else {
+            Issue.record("Failed to create tokenizer for '<'")
+            return
+        }
+        #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
+
+        let token = tokenizer.nextToken()
+        #expect(token?.type == .EndOfFile)
+        #expect(tokenizer.state == HTMLTokenizer.State.TagOpen)
+
+        let token2 = tokenizer.nextToken()
+        #expect(token2 == nil)
+    }
+
+    @Test func dataStateNulChar() {
+        guard let tokenizer = HTMLTokenizer(input: "H\0I") else {
+            Issue.record("Failed to create tokenizer for 'H\\0I'")
+            return
+        }
+        #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
+
+        let token = tokenizer.nextToken()
+        #expect(token?.type == .Character(codePoint: "H"))
+
+        let token2 = tokenizer.nextToken()
+        #expect(token2?.type == .Character(codePoint: "\u{FFFD}"))
+
+        let token3 = tokenizer.nextToken()
+        #expect(token3?.type == .Character(codePoint: "I"))
+
+        let token4 = tokenizer.nextToken()
+        #expect(token4?.type == .EndOfFile)
+
+        #expect(tokenizer.state == HTMLTokenizer.State.Data)
+    }
+
+    @Test func dataStateNewlineNormalization() {
+        guard let tokenizer = HTMLTokenizer(input: "A\r\nB\rC") else {
+            Issue.record("Failed to create tokenizer for 'A\\r\\nB\\rC'")
+            return
+        }
+
+        // Both the CR LF pair and the lone CR must come out as a single LF.
+        let expected: [Character] = ["A", "\n", "B", "\n", "C"]
+        for character in expected {
+            let token = tokenizer.nextToken()
+            #expect(token?.type == .Character(codePoint: character))
+        }
+
+        let eof = tokenizer.nextToken()
+        #expect(eof?.type == .EndOfFile)
+    }
 }
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
index fe0045f2a28..c5920d13cc6 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
@@ -7,13 +7,13 @@
 @_exported import WebCxx
 
 public class HTMLToken {
-    public struct Position {
+    public struct Position: Equatable {
         var line = UInt()
         var column = UInt()
         var byteOffset = UInt()
     }
 
-    public struct Attribute {
+    public struct Attribute: Equatable {
         var prefix: Swift.String?
         var localName: Swift.String
         var namespace_: Swift.String?
@@ -24,7 +24,7 @@ public class HTMLToken {
         var valueEndPosition: Position
     }
 
-    public enum TokenType {
+    public enum TokenType: Equatable {
         case Invalid
         case DOCTYPE(
             name: Swift.String?,
@@ -79,7 +79,7 @@ public class HTMLToken {
     }
 }
 
-extension HTMLToken.Position: Equatable, CustomStringConvertible {
+extension HTMLToken.Position: CustomStringConvertible {
     public var description: Swift.String {
         return "\(self.line):\(self.column)"
     }
@@ -109,13 +109,11 @@ extension HTMLToken.TokenType: CustomStringConvertible {
 
 extension HTMLToken: CustomStringConvertible {
     public var description: Swift.String {
-        if (self.startPosition == Position()) {
+        if self.startPosition == Position() {
             return "HTMLToken(type: \(self.type))"
-        }
-        else if (self.endPosition == Position()) {
+        } else if self.endPosition == Position() {
             return "HTMLToken(type: \(self.type))@\(self.startPosition)"
-        }
-        else {
+        } else {
             return "HTMLToken(type: \(self.type))@\(self.startPosition)-\(self.endPosition)"
         }
     }
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
index 8d1aebc5c45..79bce616082 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
@@ -20,9 +20,9 @@ extension Swift.String {
     }
 }
 
-class HTMLTokenizer {
+public class HTMLTokenizer {
 
-    enum State {
+    public enum State {
         case Data
         case RCDATA
         case RAWTEXT
@@ -105,29 +105,242 @@ class HTMLTokenizer {
         case NumericCharacterReferenceEnd
     }
 
-    var input = Swift.String()
-    var state = State.Data
-    var returnState = State.Data
+    private var input = Swift.String()
+    private var cursor: Swift.String.Index
+    private var previousCursor: Swift.String.Index
 
-    var currentToken = HTMLToken()
-    var queuedTokens = Deque<HTMLToken>()
+    public private(set) var state = State.Data
+    private var returnState = State.Data
 
-    public init() {}
+    private var currentToken = HTMLToken()
+    private var queuedTokens = Deque<HTMLToken>()
+
+    private var aborted = false
+    private var hasEmittedEOF = false
+
+    /// Creates a tokenizer over empty input.
+    public init() {
+        self.cursor = self.input.startIndex
+        self.previousCursor = self.input.startIndex
+    }
+
+    /// Creates a tokenizer over `input` decoded as `encoding`; fails when the input cannot be decoded.
     public init?(input: AK.StringView, encoding: AK.StringView) {
         if let string = Swift.String(decoding: input, as: encoding) {
             self.input = string
         } else {
             return nil
         }
+        self.cursor = self.input.startIndex
+        self.previousCursor = self.input.startIndex
+    }
+
+    /// Creates a tokenizer over UTF-8 encoded `input`.
+    public convenience init?(input: AK.StringView) {
+        self.init(input: input, encoding: "UTF-8")
+    }
+
+    /// Stops tokenization: once the token queue is drained, `nextToken()` returns `nil`.
+    public func abort() {
+        self.aborted = true
+    }
+
+    /// Advances the cursor by up to `count` characters, clamping at the end of input.
+    /// `previousCursor` is set to where the cursor was before the skip, so that
+    /// `restoreCursorToPrevious()` undoes the whole skip.
+    func skip(_ count: Int) {
+        // Record the old position first: `index(before:)` on the new cursor would trap when the
+        // cursor has not moved off `startIndex`.
+        self.previousCursor = self.cursor
+        self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) ?? self.input.endIndex
+    }
+
+    /// Returns the character `offset` positions past the cursor without consuming it,
+    /// or `nil` when that position lies at or beyond the end of input.
+    func peekCodePoint(_ offset: Int = 0) -> Character? {
+        // Limiting by `endIndex` (rather than `index(before: endIndex)`) keeps this safe for empty
+        // input; the explicit bounds check rejects `endIndex` itself, which is not subscriptable.
+        guard let index = self.input.index(self.cursor, offsetBy: offset, limitedBy: self.input.endIndex),
+            index < self.input.endIndex
+        else {
+            return nil
+        }
+        return self.input[index]
+    }
+
+    /// Consumes and returns the next input character with newlines normalized,
+    /// or `nil` once the input is exhausted.
+    func nextCodePoint() -> Character? {
+        guard self.cursor < self.input.endIndex else {
+            return nil
+        }
+
+        let character = self.input[self.cursor]
+        skip(1)
+
+        // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
+        // https://infra.spec.whatwg.org/#normalize-newlines
+        switch character {
+        case "\r\n", "\r":
+            // Swift.String groups U+000D CR U+000A LF into a single Character (one grapheme
+            // cluster), so the pair has to be matched as "\r\n"; peeking for "\r" followed by
+            // "\n" can never succeed. Either form becomes a single U+000A LF code point.
+            return "\n"
+        default:
+            return character
+        }
+    }
+
+    func restoreCursorToPrevious() {
+        self.cursor = self.previousCursor
+    }
+
+    func createNewToken(_ token: HTMLToken) {
+        self.currentToken = token
+        // FIXME: Assign Position
+    }
+
+    enum NextTokenState {
+        case Emit(token: HTMLToken?)
+        case SwitchTo
+        case Reconsume(inputCharacter: Character?)
+        case ReprocessQueue
     }
 
+    /// Returns the next token, or `nil` after end-of-file has been emitted or the tokenizer was aborted.
     public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
 
-        while !queuedTokens.isEmpty {
-            return queuedTokens.popFirst()
+        let processQueue = { () -> HTMLToken?? in
+            if let token = self.queuedTokens.popFirst() {
+                return token
+            }
+            // Outer nil: nothing queued, keep tokenizing. Optional(nil): aborted, hand nil to the caller.
+            return self.aborted ? Optional(nil) : nil
         }
 
-        return nil
+        if let maybeToken = processQueue() {
+            return maybeToken
+        }
+
+        var nextInputCharacter: Character? = nil
+        while true {
+            // FIXME: Handle insertion point
+            switch nextTokenImpl(nextInputCharacter) {
+            case .Emit(let token):
+                return token
+            case .SwitchTo:
+                nextInputCharacter = nil
+                break
+            case .Reconsume(let character):
+                nextInputCharacter = character
+                break
+            case .ReprocessQueue:
+                if let maybeToken = processQueue() {
+                    return maybeToken
+                }
+                nextInputCharacter = nil
+                break
+            }
+        }
     }
 
+    func switchTo(_ state: State) -> NextTokenState {
+        self.state = state
+        return .SwitchTo
+    }
+
+    func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
+        self.state = state
+        return .Reconsume(inputCharacter: character)
+    }
+
+    func switchToReturnState() -> NextTokenState {
+        self.state = self.returnState
+        return .ReprocessQueue
+    }
+
+    func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
+        self.state = self.returnState
+        if character != nil {
+            restoreCursorToPrevious()
+        }
+        return .ReprocessQueue
+    }
+
+    func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
+        self.state = state
+        self.queuedTokens.append(self.currentToken)
+        self.currentToken = HTMLToken()
+        return .Emit(token: self.queuedTokens.popFirst()!)
+    }
+
+    func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
+        self.state = state
+        return emitCharacter(character)
+    }
+
+    func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
+        self.queuedTokens.append(HTMLToken(type: .Character(codePoint: character)))
+        self.state = `in`
+        return .Reconsume(inputCharacter: currentInputCharacter)
+    }
+
+    func emitEOF() -> NextTokenState {
+        if self.hasEmittedEOF {
+            return .Emit(token: nil)
+        }
+        self.hasEmittedEOF = true
+        createNewToken(HTMLToken(type: .EndOfFile))
+        self.queuedTokens.append(self.currentToken)
+        self.currentToken = HTMLToken()
+        return .Emit(token: self.queuedTokens.popFirst()!)
+    }
+
+    func emitCurrentTokenFollowedByEOF() -> NextTokenState {
+        precondition(!self.hasEmittedEOF)
+        self.queuedTokens.append(self.currentToken)
+        self.currentToken = HTMLToken()
+        return emitEOF()
+    }
+
+    func emitCharacter(_ character: Character) -> NextTokenState {
+        createNewToken(HTMLToken(type: .Character(codePoint: character)))
+        self.queuedTokens.append(self.currentToken)
+        self.currentToken = HTMLToken()
+        return .Emit(token: self.queuedTokens.popFirst()!)
+    }
+
+    func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
+        let dontConsumeNextInputCharacter = {
+            self.restoreCursorToPrevious()
+        }
+        let _ = dontConsumeNextInputCharacter
+
+        // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder
+
+        // Handle reconsume by passing the character around in the state enum
+        let currentInputCharacter = nextInputCharacter ?? nextCodePoint()
+
+        switch self.state {
+        // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
+        case .Data:
+            switch currentInputCharacter {
+            case "&":
+                self.returnState = .Data
+                return switchTo(.CharacterReference)
+            case "<":
+                return switchTo(.TagOpen)
+            case "\0":
+                // FIXME: log_parse_error()
+                return emitCharacter("\u{FFFD}")
+            case nil:
+                return emitEOF()
+            case let character?:
+                return emitCharacter(character)
+            }
+        default:
+            print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
+            return emitEOF()
+        }
+    }
 }