LibWeb: Implement the Data state for the Swift tokenizer

And add tests! This implementation closely follows the current C++
implementation, replacing macros and gotos with a slightly more
complex state machine. It's very possible that an async version that
yields tokens on "emit" would be even simpler, but let's get this
one working first :).
This commit is contained in:
Andrew Kaster 2024-08-28 21:25:42 -06:00 committed by Andreas Kling
parent 01c4625a42
commit 77718c0a66
Notes: github-actions[bot] 2024-08-29 04:32:14 +00:00
3 changed files with 296 additions and 21 deletions

View file

@ -5,8 +5,8 @@
*/
import AK
import Web
import Testing
import Web
@Suite
struct TestHTMLTokenizerSwift {
@ -30,4 +30,86 @@ struct TestHTMLTokenizerSwift {
#expect(!token.isParserWhitespace())
}
}
// A tokenizer with no input emits exactly one EndOfFile token, then nil,
// and never leaves the Data state.
@Test func dataStateNoInput() {
    let tokenizer = HTMLTokenizer()
    #expect(tokenizer.state == .Data)  // initial state

    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)

    // Asking again after EndOfFile yields nothing further.
    let afterEOF = tokenizer.nextToken()
    #expect(afterEOF == nil)
    #expect(tokenizer.state == .Data)
}
// A single ordinary character is emitted as a Character token, followed by
// EndOfFile and then nil; the tokenizer stays in the Data state throughout.
@Test func dataStateSingleChar() {
    guard let tokenizer = HTMLTokenizer(input: "X") else {
        Issue.record("Failed to create tokenizer for 'X'")
        return
    }
    #expect(tokenizer.state == .Data)  // initial state

    let characterToken = tokenizer.nextToken()
    #expect(characterToken?.type == .Character(codePoint: "X"))

    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)

    let afterEOF = tokenizer.nextToken()
    #expect(afterEOF == nil)
    #expect(tokenizer.state == .Data)
}
// "&" moves the tokenizer from Data to CharacterReference. That state is not
// implemented yet, so the next token produced is EndOfFile, followed by nil.
@Test func dataStateAmpersand() {
    guard let tokenizer = HTMLTokenizer(input: "&") else {
        Issue.record("Failed to create tokenizer for '&'")
        return
    }
    #expect(tokenizer.state == .Data)  // initial state

    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)
    #expect(tokenizer.state == .CharacterReference)

    let afterEOF = tokenizer.nextToken()
    #expect(afterEOF == nil)
}
// "<" moves the tokenizer from Data to TagOpen. That state is not implemented
// yet, so the next token produced is EndOfFile, followed by nil.
@Test func dataStateTagOpen() {
    guard let tokenizer = HTMLTokenizer(input: "<") else {
        Issue.record("Failed to create tokenizer for '<'")
        return
    }
    #expect(tokenizer.state == .Data)  // initial state

    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)
    #expect(tokenizer.state == .TagOpen)

    let afterEOF = tokenizer.nextToken()
    #expect(afterEOF == nil)
}
// A NUL character in the Data state is replaced by U+FFFD REPLACEMENT CHARACTER;
// the characters around it are emitted unchanged, followed by EndOfFile and then nil.
@Test func dataStateNulChar() {
    guard let tokenizer = HTMLTokenizer(input: "H\0I") else {
        Issue.record("Failed to create tokenizer for 'H\\0I'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data)  // initial state

    let token = tokenizer.nextToken()
    #expect(token?.type == .Character(codePoint: "H"))

    let token2 = tokenizer.nextToken()
    #expect(token2?.type == .Character(codePoint: "\u{FFFD}"))

    let token3 = tokenizer.nextToken()
    #expect(token3?.type == .Character(codePoint: "I"))

    let token4 = tokenizer.nextToken()
    #expect(token4?.type == .EndOfFile)

    // Like the other Data-state tests, verify that EndOfFile is emitted only once.
    let token5 = tokenizer.nextToken()
    #expect(token5 == nil)
    #expect(tokenizer.state == HTMLTokenizer.State.Data)
}
}

View file

@ -7,13 +7,13 @@
@_exported import WebCxx
public class HTMLToken {
public struct Position {
public struct Position: Equatable {
var line = UInt()
var column = UInt()
var byteOffset = UInt()
}
public struct Attribute {
public struct Attribute: Equatable {
var prefix: Swift.String?
var localName: Swift.String
var namespace_: Swift.String?
@ -24,7 +24,7 @@ public class HTMLToken {
var valueEndPosition: Position
}
public enum TokenType {
public enum TokenType: Equatable {
case Invalid
case DOCTYPE(
name: Swift.String?,
@ -79,7 +79,7 @@ public class HTMLToken {
}
}
extension HTMLToken.Position: Equatable, CustomStringConvertible {
extension HTMLToken.Position: CustomStringConvertible {
public var description: Swift.String {
return "\(self.line):\(self.column)"
}
@ -109,13 +109,11 @@ extension HTMLToken.TokenType: CustomStringConvertible {
extension HTMLToken: CustomStringConvertible {
public var description: Swift.String {
if (self.startPosition == Position()) {
if self.startPosition == Position() {
return "HTMLToken(type: \(self.type))"
}
else if (self.endPosition == Position()) {
} else if self.endPosition == Position() {
return "HTMLToken(type: \(self.type))@\(self.startPosition)"
}
else {
} else {
return "HTMLToken(type: \(self.type))@\(self.startPosition)-\(self.endPosition)"
}
}

View file

@ -20,9 +20,9 @@ extension Swift.String {
}
}
class HTMLTokenizer {
public class HTMLTokenizer {
enum State {
public enum State {
case Data
case RCDATA
case RAWTEXT
@ -105,29 +105,224 @@ class HTMLTokenizer {
case NumericCharacterReferenceEnd
}
var input = Swift.String()
var state = State.Data
var returnState = State.Data
private var input = Swift.String()
private var cursor: Swift.String.Index
private var previousCursor: Swift.String.Index
var currentToken = HTMLToken()
var queuedTokens = Deque<HTMLToken>()
public private(set) var state = State.Data
private var returnState = State.Data
public init() {}
private var currentToken = HTMLToken()
private var queuedTokens = Deque<HTMLToken>()
private var aborted = false
private var hasEmittedEOF = false
/// Creates a tokenizer over the default (empty) input, with both the cursor and
/// the previous-cursor positioned at the start of that input.
public init() {
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
}
/// Creates a tokenizer for `input`, decoded using `encoding`.
///
/// Initialization fails (returns `nil`) when `Swift.String(decoding:as:)` cannot
/// produce a string from `input`. On success both cursors start at the beginning
/// of the decoded text.
public init?(input: AK.StringView, encoding: AK.StringView) {
    guard let decoded = Swift.String(decoding: input, as: encoding) else {
        return nil
    }
    self.input = decoded

    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
}
/// Creates a tokenizer for `input`, delegating to `init(input:encoding:)` with
/// the encoding name "UTF-8". Fails (returns `nil`) whenever that initializer does.
public convenience init?(input: AK.StringView) {
self.init(input: input, encoding: "UTF-8")
}
/// Marks the tokenizer as aborted by setting the `aborted` flag.
/// The flag is read by the queue-draining step of `nextToken`.
public func abort() {
self.aborted = true
}
/// Advances the cursor by `count` characters, clamping at the end of the input.
///
/// After the move, `previousCursor` points at the character immediately before
/// the new cursor position (i.e. the last character that was skipped).
///
/// - Parameter count: Number of characters to advance by.
func skip(_ count: Int) {
    self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) ?? self.input.endIndex

    // `index(before:)` traps at `startIndex`. That happens when nothing was actually
    // skipped from the very beginning (empty input, or a zero count at the start),
    // in which case there is no "previous" character and previousCursor is left alone.
    if self.cursor > self.input.startIndex {
        self.previousCursor = self.input.index(before: self.cursor)
    }
}
/// Returns the character `offset` positions away from the cursor without consuming
/// anything, or `nil` if that position lies outside the input.
///
/// - Parameter offset: Distance from the cursor, in characters. Defaults to 0,
///   i.e. the character the cursor currently points at.
/// - Returns: The character at that position, or `nil` when the position is at or
///   past the end of the input (or before its start).
func peekCodePoint(_ offset: Int = 0) -> Character? {
    // Limit the walk in the direction of travel. Using `endIndex` (rather than the
    // index before it) as the forward limit avoids calling `index(before:)` on an
    // empty string, which traps.
    let limit = offset >= 0 ? self.input.endIndex : self.input.startIndex
    guard let index = self.input.index(self.cursor, offsetBy: offset, limitedBy: limit),
          index < self.input.endIndex
    else {
        // `endIndex` itself is not subscriptable, so treat it as "nothing to peek".
        return nil
    }
    return self.input[index]
}
/// Consumes the character at the cursor and returns it, or returns `nil` when the
/// cursor is already at the end of the input.
///
/// Newlines are normalized while consuming: both a CR LF pair and a lone CR are
/// returned as a single LF.
func nextCodePoint() -> Character? {
    guard self.cursor < self.input.endIndex else {
        return nil
    }

    // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
    // https://infra.spec.whatwg.org/#normalize-newlines
    let consumed = self.input[self.cursor]
    skip(1)

    // Swift indexes a String by Character (extended grapheme cluster), and
    // U+000D CR followed by U+000A LF forms the single Character "\r\n". A CR LF
    // pair therefore never shows up as a "\r" Character followed by a "\n" Character;
    // it has to be matched as one unit.
    switch consumed {
    case "\r\n":
        // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
        return "\n"
    case "\r":
        // replace every remaining U+000D CR code point with a U+000A LF code point.
        return "\n"
    default:
        return consumed
    }
}
/// Moves the cursor back to `previousCursor`, un-consuming the most recently
/// consumed character so that it will be read again.
func restoreCursorToPrevious() {
self.cursor = self.previousCursor
}
/// Replaces the token currently being built with `token`.
/// Source positions are not recorded yet (see FIXME below).
func createNewToken(_ token: HTMLToken) {
self.currentToken = token
// FIXME: Assign Position
}
/// Result of a single `nextTokenImpl` step, telling the `nextToken` loop what to do next.
enum NextTokenState {
// Stop looping and hand `token` (possibly nil) back to the caller of `nextToken`.
case Emit(token: HTMLToken?)
// The state changed; run another step, consuming a fresh input character.
case SwitchTo
// Run another step, feeding it `inputCharacter` instead of consuming new input.
case Reconsume(inputCharacter: Character?)
// Drain the token queue first; if nothing is returned from it, run another step
// consuming a fresh input character.
case ReprocessQueue
}
/// Returns the next token, or `nil` when there is nothing (more) to return.
///
/// Tokens already waiting in `queuedTokens` are handed out first. Otherwise the
/// state machine (`nextTokenImpl`) is stepped until one of its steps asks to emit.
///
/// - Parameter stopAtInsertionPoint: Not consulted yet (see the FIXME in the loop).
public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
// Hand back anything that is already queued before running the state machine.
while !queuedTokens.isEmpty {
return queuedTokens.popFirst()
// Queue-draining helper with a double-optional result:
//  - a token            -> return that token to the caller
//  - Optional(nil)      -> the tokenizer was aborted: return nil to the caller
//  - nil                -> nothing decided here, keep tokenizing
let processQueue = { () -> HTMLToken?? in
if let token = self.queuedTokens.popFirst() {
return token
}
return self.aborted ? Optional(nil) : nil
}
return nil
if let maybeToken = processQueue() {
return maybeToken
}
// Character to feed into the next step when a state asked to reconsume;
// nil means the next step reads a fresh character from the input.
var nextInputCharacter: Character? = nil
while true {
// FIXME: Handle insertion point
switch nextTokenImpl(nextInputCharacter) {
case .Emit(let token):
return token
case .SwitchTo:
nextInputCharacter = nil
break
case .Reconsume(let character):
nextInputCharacter = character
break
case .ReprocessQueue:
// Tokens may have been queued during the step; return those first.
if let maybeToken = processQueue() {
return maybeToken
}
nextInputCharacter = nil
break
}
}
}
/// Sets the tokenizer state to `state` and reports `.SwitchTo`.
func switchTo(_ state: State) -> NextTokenState {
self.state = state
return .SwitchTo
}
/// Sets the tokenizer state to `state` and reports `.Reconsume` carrying `character`,
/// so that the same character is handed to the next step.
func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
self.state = state
return .Reconsume(inputCharacter: character)
}
/// Sets the tokenizer state to the saved `returnState` and reports `.ReprocessQueue`.
func switchToReturnState() -> NextTokenState {
self.state = self.returnState
return .ReprocessQueue
}
/// Switches to the saved `returnState` and arranges for the current input character
/// to be read again there.
///
/// - Parameter character: The current input character, or `nil` at end of input.
///   Only its presence matters: when non-nil, the cursor is moved back one step.
/// - Returns: `.ReprocessQueue`.
func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
    self.state = self.returnState

    // At end of input nothing was consumed, so there is nothing to put back.
    guard character != nil else {
        return .ReprocessQueue
    }

    restoreCursorToPrevious()
    return .ReprocessQueue
}
/// Switches to `state`, queues the token being built, starts a fresh current token,
/// and emits the token at the front of the queue.
func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
    let finishedToken = self.currentToken
    self.currentToken = HTMLToken()
    self.queuedTokens.append(finishedToken)
    self.state = state

    // The queue holds at least the token appended above, so popping cannot fail.
    let frontToken = self.queuedTokens.popFirst()!
    return .Emit(token: frontToken)
}
/// Sets the tokenizer state to `state`, then emits `character` via `emitCharacter`.
func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
self.state = state
return emitCharacter(character)
}
/// Queues a Character token for `character`, switches to the given state, and asks
/// for `currentInputCharacter` to be processed again in that state.
///
/// The token is only queued here, not returned; it is handed out by a later emit
/// or queue-draining step.
func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
    let characterToken = HTMLToken(type: .Character(codePoint: character))
    self.state = `in`
    self.queuedTokens.append(characterToken)
    return .Reconsume(inputCharacter: currentInputCharacter)
}
/// Emits the EndOfFile token the first time it is called; every later call emits `nil`.
///
/// The EndOfFile token is appended to the queue and the token at the front of the
/// queue is what gets emitted.
func emitEOF() -> NextTokenState {
    guard !self.hasEmittedEOF else {
        return .Emit(token: nil)
    }
    self.hasEmittedEOF = true

    createNewToken(HTMLToken(type: .EndOfFile))
    let eofToken = self.currentToken
    self.currentToken = HTMLToken()
    self.queuedTokens.append(eofToken)

    // The queue holds at least the token appended above, so popping cannot fail.
    let frontToken = self.queuedTokens.popFirst()!
    return .Emit(token: frontToken)
}
/// Queues the token being built and then emits via `emitEOF`, which appends the
/// EndOfFile token behind it and emits the front of the queue.
///
/// Must not be called once EndOfFile has already been emitted (checked by precondition).
func emitCurrentTokenFollowedByEOF() -> NextTokenState {
    precondition(!self.hasEmittedEOF)

    let finishedToken = self.currentToken
    self.currentToken = HTMLToken()
    self.queuedTokens.append(finishedToken)

    return emitEOF()
}
/// Creates a Character token for `character`, appends it to the queue, and emits
/// the token at the front of the queue.
func emitCharacter(_ character: Character) -> NextTokenState {
    createNewToken(HTMLToken(type: .Character(codePoint: character)))

    let characterToken = self.currentToken
    self.currentToken = HTMLToken()
    self.queuedTokens.append(characterToken)

    // The queue holds at least the token appended above, so popping cannot fail.
    let frontToken = self.queuedTokens.popFirst()!
    return .Emit(token: frontToken)
}
/// Runs one step of the tokenizer state machine.
///
/// - Parameter nextInputCharacter: A character to reconsume. When `nil`, the next
///   character is consumed from the input instead.
/// - Returns: What the `nextToken` loop should do next (emit, switch, reconsume, ...).
func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
    // Not used by any implemented state yet; the `let _` keeps it referenced.
    let dontConsumeNextInputCharacter = {
        self.restoreCursorToPrevious()
    }
    let _ = dontConsumeNextInputCharacter

    // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder

    // Handle reconsume by passing the character around in the state enum
    let currentInputCharacter = nextInputCharacter ?? nextCodePoint()

    switch self.state {
    // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
    case .Data:
        // End of input: nothing left to classify.
        guard let character = currentInputCharacter else {
            return emitEOF()
        }

        switch character {
        case "&":
            self.returnState = .Data
            return switchTo(.CharacterReference)
        case "<":
            return switchTo(.TagOpen)
        case "\0":
            // FIXME: log_parse_error()
            return emitCharacter("\u{FFFD}")
        default:
            return emitCharacter(character)
        }
    default:
        print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
        return emitEOF()
    }
}
}