mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-09-29 08:11:13 +00:00
LibWeb: Implement the Data state for the Swift tokenizer
And add tests! This implementation closely follows the current C++ implementation, replacing macros and gotos with a slightly more complex state machine. It's very possible that an async version that yields tokens on "emit" would be even simpler, but let's get this one working first :).
This commit is contained in:
parent
01c4625a42
commit
77718c0a66
Notes:
github-actions[bot]
2024-08-29 04:32:14 +00:00
Author: https://github.com/ADKaster Commit: https://github.com/LadybirdBrowser/ladybird/commit/77718c0a664 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1220
|
@ -5,8 +5,8 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import AK
|
import AK
|
||||||
import Web
|
|
||||||
import Testing
|
import Testing
|
||||||
|
import Web
|
||||||
|
|
||||||
@Suite
|
@Suite
|
||||||
struct TestHTMLTokenizerSwift {
|
struct TestHTMLTokenizerSwift {
|
||||||
|
@ -30,4 +30,86 @@ struct TestHTMLTokenizerSwift {
|
||||||
#expect(!token.isParserWhitespace())
|
#expect(!token.isParserWhitespace())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A tokenizer created without input yields one EndOfFile token, then `nil`,
/// and is still in the Data state afterwards.
@Test func dataStateNoInput() {
    let tokenizer = HTMLTokenizer()
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    #expect(tokenizer.nextToken()?.type == .EndOfFile)

    // After EndOfFile the token stream is exhausted.
    #expect(tokenizer.nextToken() == nil)
    #expect(tokenizer.state == HTMLTokenizer.State.Data)
}
|
||||||
|
|
||||||
|
/// A single ordinary character is emitted as a Character token, followed by
/// EndOfFile and then `nil`; the tokenizer stays in the Data state.
@Test func dataStateSingleChar() {
    guard let tokenizer = HTMLTokenizer(input: "X") else {
        Issue.record("Failed to create tokenizer for 'X'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    #expect(tokenizer.nextToken()?.type == .Character(codePoint: "X"))
    #expect(tokenizer.nextToken()?.type == .EndOfFile)

    // Nothing further is produced once EndOfFile has been emitted.
    #expect(tokenizer.nextToken() == nil)
    #expect(tokenizer.state == HTMLTokenizer.State.Data)
}
|
||||||
|
|
||||||
|
/// An ampersand moves the tokenizer from the Data state into the
/// CharacterReference state; the first token handed back is EndOfFile and the
/// stream is empty afterwards.
@Test func dataStateAmpersand() {
    guard let tokenizer = HTMLTokenizer(input: "&") else {
        Issue.record("Failed to create tokenizer for '&'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    #expect(tokenizer.nextToken()?.type == .EndOfFile)
    #expect(tokenizer.state == HTMLTokenizer.State.CharacterReference)

    #expect(tokenizer.nextToken() == nil)
}
|
||||||
|
|
||||||
|
/// A less-than sign moves the tokenizer from the Data state into the TagOpen
/// state; the first token handed back is EndOfFile and the stream is empty
/// afterwards.
@Test func dataStateTagOpen() {
    guard let tokenizer = HTMLTokenizer(input: "<") else {
        Issue.record("Failed to create tokenizer for '<'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    #expect(tokenizer.nextToken()?.type == .EndOfFile)
    #expect(tokenizer.state == HTMLTokenizer.State.TagOpen)

    #expect(tokenizer.nextToken() == nil)
}
|
||||||
|
|
||||||
|
/// A NUL character in the Data state is replaced by U+FFFD REPLACEMENT
/// CHARACTER while the surrounding characters pass through unchanged.
///
/// Like the other Data-state tests, this also checks that nothing is produced
/// after EndOfFile.
@Test func dataStateNulChar() {
    guard let tokenizer = HTMLTokenizer(input: "H\0I") else {
        Issue.record("Failed to create tokenizer for 'H\\0I'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    let token = tokenizer.nextToken()
    #expect(token?.type == .Character(codePoint: "H"))

    // The NUL is reported as the replacement character.
    let token2 = tokenizer.nextToken()
    #expect(token2?.type == .Character(codePoint: "\u{FFFD}"))

    let token3 = tokenizer.nextToken()
    #expect(token3?.type == .Character(codePoint: "I"))

    let token4 = tokenizer.nextToken()
    #expect(token4?.type == .EndOfFile)

    // The stream must be exhausted once EndOfFile has been emitted.
    let token5 = tokenizer.nextToken()
    #expect(token5 == nil)

    #expect(tokenizer.state == HTMLTokenizer.State.Data)
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,13 +7,13 @@
|
||||||
@_exported import WebCxx
|
@_exported import WebCxx
|
||||||
|
|
||||||
public class HTMLToken {
|
public class HTMLToken {
|
||||||
public struct Position {
|
public struct Position: Equatable {
|
||||||
var line = UInt()
|
var line = UInt()
|
||||||
var column = UInt()
|
var column = UInt()
|
||||||
var byteOffset = UInt()
|
var byteOffset = UInt()
|
||||||
}
|
}
|
||||||
|
|
||||||
public struct Attribute {
|
public struct Attribute: Equatable {
|
||||||
var prefix: Swift.String?
|
var prefix: Swift.String?
|
||||||
var localName: Swift.String
|
var localName: Swift.String
|
||||||
var namespace_: Swift.String?
|
var namespace_: Swift.String?
|
||||||
|
@ -24,7 +24,7 @@ public class HTMLToken {
|
||||||
var valueEndPosition: Position
|
var valueEndPosition: Position
|
||||||
}
|
}
|
||||||
|
|
||||||
public enum TokenType {
|
public enum TokenType: Equatable {
|
||||||
case Invalid
|
case Invalid
|
||||||
case DOCTYPE(
|
case DOCTYPE(
|
||||||
name: Swift.String?,
|
name: Swift.String?,
|
||||||
|
@ -79,7 +79,7 @@ public class HTMLToken {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extension HTMLToken.Position: Equatable, CustomStringConvertible {
|
extension HTMLToken.Position: CustomStringConvertible {
|
||||||
public var description: Swift.String {
|
public var description: Swift.String {
|
||||||
return "\(self.line):\(self.column)"
|
return "\(self.line):\(self.column)"
|
||||||
}
|
}
|
||||||
|
@ -109,13 +109,11 @@ extension HTMLToken.TokenType: CustomStringConvertible {
|
||||||
|
|
||||||
extension HTMLToken: CustomStringConvertible {
|
extension HTMLToken: CustomStringConvertible {
|
||||||
public var description: Swift.String {
|
public var description: Swift.String {
|
||||||
if (self.startPosition == Position()) {
|
if self.startPosition == Position() {
|
||||||
return "HTMLToken(type: \(self.type))"
|
return "HTMLToken(type: \(self.type))"
|
||||||
}
|
} else if self.endPosition == Position() {
|
||||||
else if (self.endPosition == Position()) {
|
|
||||||
return "HTMLToken(type: \(self.type))@\(self.startPosition)"
|
return "HTMLToken(type: \(self.type))@\(self.startPosition)"
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
return "HTMLToken(type: \(self.type))@\(self.startPosition)-\(self.endPosition)"
|
return "HTMLToken(type: \(self.type))@\(self.startPosition)-\(self.endPosition)"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,9 +20,9 @@ extension Swift.String {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class HTMLTokenizer {
|
public class HTMLTokenizer {
|
||||||
|
|
||||||
enum State {
|
public enum State {
|
||||||
case Data
|
case Data
|
||||||
case RCDATA
|
case RCDATA
|
||||||
case RAWTEXT
|
case RAWTEXT
|
||||||
|
@ -105,29 +105,224 @@ class HTMLTokenizer {
|
||||||
case NumericCharacterReferenceEnd
|
case NumericCharacterReferenceEnd
|
||||||
}
|
}
|
||||||
|
|
||||||
var input = Swift.String()
|
private var input = Swift.String()
|
||||||
var state = State.Data
|
private var cursor: Swift.String.Index
|
||||||
var returnState = State.Data
|
private var previousCursor: Swift.String.Index
|
||||||
|
|
||||||
var currentToken = HTMLToken()
|
public private(set) var state = State.Data
|
||||||
var queuedTokens = Deque<HTMLToken>()
|
private var returnState = State.Data
|
||||||
|
|
||||||
public init() {}
|
private var currentToken = HTMLToken()
|
||||||
|
private var queuedTokens = Deque<HTMLToken>()
|
||||||
|
|
||||||
|
private var aborted = false
|
||||||
|
private var hasEmittedEOF = false
|
||||||
|
|
||||||
|
/// Creates a tokenizer over the default (empty) input, with both cursors
/// placed at the start of that input.
public init() {
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
}
|
||||||
public init?(input: AK.StringView, encoding: AK.StringView) {
|
public init?(input: AK.StringView, encoding: AK.StringView) {
|
||||||
if let string = Swift.String(decoding: input, as: encoding) {
|
if let string = Swift.String(decoding: input, as: encoding) {
|
||||||
self.input = string
|
self.input = string
|
||||||
} else {
|
} else {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
self.cursor = self.input.startIndex
|
||||||
|
self.previousCursor = self.input.startIndex
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a tokenizer for `input`, delegating to `init(input:encoding:)`
/// with the encoding name "UTF-8".
///
/// - Parameter input: The text to tokenize.
/// - Returns: `nil` when the delegated initializer fails.
public convenience init?(input: AK.StringView) {
    let utf8: AK.StringView = "UTF-8"
    self.init(input: input, encoding: utf8)
}
|
||||||
|
|
||||||
|
/// Flags the tokenizer as aborted by setting `aborted`.
public func abort() {
    aborted = true
}
|
||||||
|
|
||||||
|
/// Advances the cursor by `count` characters, clamping at the end of the
/// input, and records the position just before the new cursor in
/// `previousCursor`.
///
/// - Parameter count: Number of characters to move forward.
func skip(_ count: Int) {
    let end = input.endIndex
    cursor = input.index(cursor, offsetBy: count, limitedBy: end) ?? end

    // `index(before:)` traps at `startIndex`, which is where the cursor still
    // is when the input is empty or nothing was skipped. In that case there is
    // no earlier position, so both cursors coincide.
    if cursor > input.startIndex {
        previousCursor = input.index(before: cursor)
    } else {
        previousCursor = cursor
    }
}
|
||||||
|
|
||||||
|
/// Returns the character `offset` positions away from the cursor without
/// moving the cursor.
///
/// - Parameter offset: Distance from the cursor, in characters (0 is the
///   character at the cursor).
/// - Returns: The character at that position, or `nil` when the position lies
///   outside the input (including when the input is empty or the cursor is
///   already at the end).
func peekCodePoint(_ offset: Int = 0) -> Character? {
    // Bound the walk in the direction of travel. A limit on the far side of
    // the starting index is ignored by `index(_:offsetBy:limitedBy:)`, and
    // `index(before: endIndex)` would trap on empty input, so the limit is
    // chosen per direction and `endIndex` is rejected explicitly below.
    let limit = offset < 0 ? input.startIndex : input.endIndex
    guard let index = input.index(cursor, offsetBy: offset, limitedBy: limit),
          index < input.endIndex
    else {
        return nil
    }
    return input[index]
}
|
||||||
|
|
||||||
|
/// Consumes and returns the next input character, applying newline
/// normalization, or returns `nil` when the input is exhausted.
///
/// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
/// https://infra.spec.whatwg.org/#normalize-newlines
func nextCodePoint() -> Character? {
    guard cursor < input.endIndex else {
        return nil
    }

    let consumed = input[cursor]
    skip(1)

    switch consumed {
    case "\r\n":
        // A Swift `Character` is an extended grapheme cluster and CR LF forms
        // a single cluster, so the pair arrives here as ONE character; it can
        // never be observed as "\r" followed by "\n".
        // Replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
        return "\n"
    case "\r":
        // replace every remaining U+000D CR code point with a U+000A LF code point.
        return "\n"
    default:
        // NOTE(review): other multi-scalar grapheme clusters are still handed
        // out as one `Character`; confirm whether callers need per-code-point
        // granularity.
        return consumed
    }
}
|
||||||
|
|
||||||
|
/// Moves the cursor back to the position stored in `previousCursor`.
func restoreCursorToPrevious() {
    cursor = previousCursor
}
|
||||||
|
|
||||||
|
/// Makes `token` the token currently being built.
///
/// - Parameter token: The token to store in `currentToken`.
func createNewToken(_ token: HTMLToken) {
    // FIXME: Assign Position
    currentToken = token
}
|
||||||
|
|
||||||
|
enum NextTokenState {
|
||||||
|
case Emit(token: HTMLToken?)
|
||||||
|
case SwitchTo
|
||||||
|
case Reconsume(inputCharacter: Character?)
|
||||||
|
case ReprocessQueue
|
||||||
}
|
}
|
||||||
|
|
||||||
public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
|
public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
|
||||||
|
|
||||||
while !queuedTokens.isEmpty {
|
let processQueue = { () -> HTMLToken?? in
|
||||||
return queuedTokens.popFirst()
|
if let token = self.queuedTokens.popFirst() {
|
||||||
|
return token
|
||||||
|
}
|
||||||
|
return self.aborted ? Optional(nil) : nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
if let maybeToken = processQueue() {
|
||||||
|
return maybeToken
|
||||||
|
}
|
||||||
|
|
||||||
|
var nextInputCharacter: Character? = nil
|
||||||
|
while true {
|
||||||
|
// FIXME: Handle insertion point
|
||||||
|
switch nextTokenImpl(nextInputCharacter) {
|
||||||
|
case .Emit(let token):
|
||||||
|
return token
|
||||||
|
case .SwitchTo:
|
||||||
|
nextInputCharacter = nil
|
||||||
|
break
|
||||||
|
case .Reconsume(let character):
|
||||||
|
nextInputCharacter = character
|
||||||
|
break
|
||||||
|
case .ReprocessQueue:
|
||||||
|
if let maybeToken = processQueue() {
|
||||||
|
return maybeToken
|
||||||
|
}
|
||||||
|
nextInputCharacter = nil
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Changes the tokenizer state and reports a plain state switch.
///
/// - Parameter state: The state to enter.
/// - Returns: `.SwitchTo`.
func switchTo(_ state: State) -> NextTokenState {
    let outcome = NextTokenState.SwitchTo
    self.state = state
    return outcome
}
|
||||||
|
|
||||||
|
/// Changes the tokenizer state and asks for `character` to be processed again
/// in that state.
///
/// - Parameters:
///   - character: The character to hand to the next step.
///   - state: The state to enter.
/// - Returns: `.Reconsume` carrying `character`.
func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
    let outcome = NextTokenState.Reconsume(inputCharacter: character)
    self.state = state
    return outcome
}
|
||||||
|
|
||||||
|
/// Re-enters the state saved in `returnState` and asks for the token queue to
/// be processed again.
///
/// - Returns: `.ReprocessQueue`.
func switchToReturnState() -> NextTokenState {
    state = returnState
    return .ReprocessQueue
}
|
||||||
|
|
||||||
|
/// Re-enters the state saved in `returnState`, un-consuming the current
/// character if there is one, and asks for the token queue to be processed
/// again.
///
/// - Parameter character: The current input character; `nil` leaves the
///   cursor where it is.
/// - Returns: `.ReprocessQueue`.
func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
    state = returnState
    guard character != nil else {
        return .ReprocessQueue
    }
    restoreCursorToPrevious()
    return .ReprocessQueue
}
|
||||||
|
|
||||||
|
/// Changes the tokenizer state, queues the token being built, resets
/// `currentToken` to a fresh token, and emits the token at the front of the
/// queue.
///
/// - Parameter state: The state to enter.
/// - Returns: `.Emit` carrying the first queued token.
func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
    self.state = state

    let finished = currentToken
    currentToken = HTMLToken()
    queuedTokens.append(finished)

    // The queue holds at least `finished`, so popping cannot yield nil.
    return .Emit(token: queuedTokens.popFirst()!)
}
|
||||||
|
|
||||||
|
/// Changes the tokenizer state and then emits `character` via
/// `emitCharacter(_:)`.
///
/// - Parameters:
///   - state: The state to enter.
///   - character: The character to emit.
/// - Returns: Whatever `emitCharacter(_:)` returns.
func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
    self.state = state
    let outcome = emitCharacter(character)
    return outcome
}
|
||||||
|
|
||||||
|
/// Queues a Character token for `character`, changes the tokenizer state, and
/// asks for the current input character to be processed again in that state.
///
/// - Parameters:
///   - character: The character to queue as a token.
///   - in: The state to enter.
///   - currentInputCharacter: The character to hand to the next step.
/// - Returns: `.Reconsume` carrying `currentInputCharacter`.
func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
    let characterToken = HTMLToken(type: .Character(codePoint: character))
    queuedTokens.append(characterToken)
    state = `in`
    return .Reconsume(inputCharacter: currentInputCharacter)
}
|
||||||
|
|
||||||
|
/// Queues an EndOfFile token the first time it is called and emits the token
/// at the front of the queue; every later call emits `nil`.
///
/// - Returns: `.Emit` carrying the first queued token, or `.Emit(token: nil)`
///   once EndOfFile has already been produced.
func emitEOF() -> NextTokenState {
    guard !hasEmittedEOF else {
        return .Emit(token: nil)
    }
    hasEmittedEOF = true

    createNewToken(HTMLToken(type: .EndOfFile))
    let eofToken = currentToken
    currentToken = HTMLToken()
    queuedTokens.append(eofToken)

    // The queue holds at least the EndOfFile token, so popping cannot yield nil.
    return .Emit(token: queuedTokens.popFirst()!)
}
|
||||||
|
|
||||||
|
/// Queues the token being built, resets `currentToken` to a fresh token, and
/// then hands over to `emitEOF()`.
///
/// - Precondition: EndOfFile has not been emitted yet.
/// - Returns: Whatever `emitEOF()` returns.
func emitCurrentTokenFollowedByEOF() -> NextTokenState {
    precondition(!hasEmittedEOF)

    let pending = currentToken
    currentToken = HTMLToken()
    queuedTokens.append(pending)

    return emitEOF()
}
|
||||||
|
|
||||||
|
/// Builds a Character token for `character`, queues it, resets `currentToken`
/// to a fresh token, and emits the token at the front of the queue.
///
/// - Parameter character: The character to wrap in a token.
/// - Returns: `.Emit` carrying the first queued token.
func emitCharacter(_ character: Character) -> NextTokenState {
    createNewToken(HTMLToken(type: .Character(codePoint: character)))

    let characterToken = currentToken
    currentToken = HTMLToken()
    queuedTokens.append(characterToken)

    // The queue holds at least `characterToken`, so popping cannot yield nil.
    return .Emit(token: queuedTokens.popFirst()!)
}
|
||||||
|
|
||||||
|
/// Runs one step of the tokenizer state machine.
///
/// - Parameter nextInputCharacter: A character to process instead of reading
///   from the input (used for reconsuming); when `nil`, the next character is
///   read with `nextCodePoint()`.
/// - Returns: What the caller should do next: emit a token, continue in the
///   new state, reconsume a character, or reprocess the token queue.
func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
    // Not used by the Data state; bound and discarded so the helper exists
    // without triggering an unused-variable warning.
    let dontConsumeNextInputCharacter = {
        self.restoreCursorToPrevious()
    }
    let _ = dontConsumeNextInputCharacter

    // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder

    // Handle reconsume by passing the character around in the state enum
    let currentInputCharacter = nextInputCharacter ?? nextCodePoint()

    switch state {
    // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
    case .Data:
        // A missing character means the end of the input was reached.
        guard let character = currentInputCharacter else {
            return emitEOF()
        }
        switch character {
        case "&":
            returnState = .Data
            return switchTo(.CharacterReference)
        case "<":
            return switchTo(.TagOpen)
        case "\0":
            // FIXME: log_parse_error()
            return emitCharacter("\u{FFFD}")
        default:
            return emitCharacter(character)
        }
    default:
        // States that are not implemented yet report themselves and end the stream.
        print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
        return emitEOF()
    }
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue