
Robin Lindén parent a077363f cfa554eb
html2/test: Add an example spec-test from html5lib-tests

There are more parts of the test to check, and more tests to include, but this is a start.
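
For reference, the html5lib-tests tokenizer files that this test consumes are JSON documents with a top-level "tests" array; each entry has a description, an input string, the expected token output, and sometimes an "initialStates" list (entries with initial states are skipped for now, see the TODO in main()). An illustrative entry (made up for this note, not copied from tokenizer/test1.test) looks roughly like:

    {
      "tests": [
        {
          "description": "Simple start tag",
          "input": "<h1 id=a>Hi</h1>",
          "output": [
            ["StartTag", "h1", {"id": "a"}],
            ["Character", "Hi"],
            ["EndTag", "h1"]
          ]
        }
      ]
    }

The "output" entries are what to_html2_tokens() below turns into html2::Token values; a "Character" entry holds a whole string and gets expanded into one html2::CharacterToken per character.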

html2/BUILD
@@ -41,9 +41,35 @@ extra_deps = {
     ),
 ) for src in glob(
     include = ["*_test.cpp"],
-    exclude = ["*_fuzz_test.cpp"],
+    exclude = [
+        "html5lib_test.cpp",
+        "*_fuzz_test.cpp",
+    ],
 )]
 
+cc_test(
+    name = "html5lib_test",
+    size = "small",
+    srcs = ["html5lib_test.cpp"],
+    copts = HASTUR_COPTS + select({
+        # simdjson leaks a bunch of warnings into our code.
+        "@platforms//os:windows": [
+            "/wd4100",
+            "/wd4706",
+        ],
+        "//conditions:default": [],
+    }),
+    data = ["@html5lib-tests//:tokenizer/test1.test"],
+    # simdjson seems to blow up qemu when we run our aarch64 cross-compiled
+    # tests.
+    tags = ["no-cross"],
+    deps = [
+        ":html2",
+        "//etest",
+        "@simdjson",
+    ],
+)
+
 [cc_fuzz_test(
     name = src[:-4],
     size = "small",
 
html2/html5lib_test.cpp (new file, 152 lines added)
@@ -0,0 +1,152 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "html2/token.h"
#include "html2/tokenizer.h"
 
#include "etest/etest2.h"
 
#include <simdjson.h> // IWYU pragma: keep
 
#include <cstdlib>
#include <iostream>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <variant>
#include <vector>
 
namespace {
struct Error {
    html2::ParseError error{};
    html2::SourceLocation location{};
};
 
std::pair<std::vector<html2::Token>, std::vector<Error>> tokenize(std::string_view input) {
    std::vector<html2::Token> tokens;
    std::vector<Error> errors;
    html2::Tokenizer tokenizer{input,
            [&](html2::Tokenizer &t, html2::Token token) {
                // The expected token output doesn't contain eof tokens.
                if (std::holds_alternative<html2::EndOfFileToken>(token)) {
                    return;
                }

                if (auto const *start_tag = std::get_if<html2::StartTagToken>(&token);
                        start_tag != nullptr && start_tag->tag_name == "script") {
                    t.set_state(html2::State::ScriptData);
                }

                tokens.push_back(std::move(token));
            },
            [&](html2::Tokenizer &t, html2::ParseError error) {
                errors.push_back({error, t.current_source_location()});
            }};
    tokenizer.run();

    return {std::move(tokens), std::move(errors)};
}
 
// NOLINTBEGIN(misc-include-cleaner): What you're meant to include from
// simdjson depends on things like the architecture you're compiling for.
// This is handled automagically with detection macros inside simdjson.
std::vector<html2::Token> to_html2_tokens(simdjson::ondemand::array tokens) {
    constexpr auto kGetOptionalStr = [](simdjson::ondemand::value v) -> std::optional<std::string> {
        if (v.is_null()) {
            return std::nullopt;
        }
        return std::string{v.get_string().value()};
    };

    std::vector<html2::Token> result;
    for (auto token : tokens) {
        auto it = token.begin().value();
        auto kind = (*it).get_string().value();
        if (kind == "DOCTYPE") {
            auto name = std::string{(*++it).value().get_string().value()};
            auto public_id = kGetOptionalStr((*++it).value());
            auto system_id = kGetOptionalStr((*++it).value());
            // The json has "correctness" instead of "force quirks", so we negate it.
            auto force_quirks = !(*++it).value().get_bool().value();
            result.push_back(html2::DoctypeToken{
                    std::move(name),
                    std::move(public_id),
                    std::move(system_id),
                    force_quirks,
            });
            continue;
        }

        if (kind == "Comment") {
            result.push_back(html2::CommentToken{std::string{(*++it).value().get_string().value()}});
            continue;
        }

        if (kind == "StartTag") {
            html2::StartTagToken start{std::string{(*++it).value().get_string().value()}};
            auto attrs = (*++it).value().get_object().value();
            for (auto attr : attrs) {
                start.attributes.push_back({
                        std::string{attr.unescaped_key().value()},
                        std::string{attr.value().get_string().value()},
                });
            }
            result.push_back(std::move(start));
            continue;
        }

        if (kind == "EndTag") {
            result.push_back(html2::EndTagToken{std::string{(*++it).value().get_string().value()}});
            continue;
        }

        if (kind == "Character") {
            auto characters = (*++it).value().get_string().value();
            for (auto c : characters) {
                result.push_back(html2::CharacterToken{c});
            }
            continue;
        }

        std::cerr << "Unknown token kind: " << kind << '\n';
        std::abort();
    }

    return result;
}
} // namespace
 
int main() {
    // TODO(robinlinden): Run all tests.
    auto json = simdjson::padded_string::load("../html5lib-tests/tokenizer/test1.test");
    if (json.error() != simdjson::SUCCESS) {
        std::cerr << "Error loading test file: " << json.error() << '\n';
        return 1;
    }

    etest::Suite s;

    simdjson::ondemand::parser parser;
    simdjson::ondemand::document doc = parser.iterate(json);
    auto tests = doc.find_field("tests").get_array().value();
    for (auto test : tests) {
        auto name = test["description"].get_string().value();
        // TODO(robinlinden): Don't skip these.
        if (test["initialStates"].error() == simdjson::SUCCESS) {
            continue;
        }
        auto in = test["input"].get_string().value();
        auto out_tokens = to_html2_tokens(test["output"].get_array().value());

        s.add_test(std::string{name}, [input = std::string{in}, expected = std::move(out_tokens)](etest::IActions &a) {
            auto [tokens, errors] = tokenize(input);
            a.expect_eq(tokens, expected);
            // TODO(robinlinden): Check that errors match.
        });
    }

    return s.run();
}
// NOLINTEND(misc-include-cleaner)