srctree

Robin Linden parent e8f85798 a5220779
Replace pugixml with a custom parser

inlinesplit
.bazelrc added: 451, removed: 79, total 372
@@ -1,2 +1,3 @@
build:gnulike --cxxopt='-std=c++2a'
build:msvc --cxxopt='-std:c++latest'
test --test_output=errors
 
WORKSPACE added: 451, removed: 79, total 372
@@ -15,3 +15,10 @@ http_archive(
strip_prefix = "pugixml-1.11.4",
urls = ["https://github.com/zeux/pugixml/releases/download/v1.11.4/pugixml-1.11.4.tar.gz"],
)
 
# Catch2 v2.13.4 test framework, downloaded from GitHub and pinned by sha256.
# The parser_test target links against its catch2_with_main target.
http_archive(
name = "catch2",
sha256 = "e7eb70b3d0ac2ed7dcf14563ad808740c29e628edde99e973adad373a2b5e4df",
strip_prefix = "Catch2-2.13.4",
urls = ["https://github.com/catchorg/Catch2/archive/v2.13.4.tar.gz"],
)
 
ev/null added: 451, removed: 79, total 372
@@ -1,9 +0,0 @@
load("@rules_cc//cc:defs.bzl", "cc_library")
 
# Builds every .cpp/.hpp under src/ as one publicly visible library.
# strip_include_prefix drops the "src" directory from include paths, which is
# what allows `#include <pugixml.hpp>` in dependents.
cc_library(
name = "pugixml",
srcs = glob(["src/*.cpp"]),
hdrs = glob(["src/*.hpp"]),
strip_include_prefix = "src",
visibility = ["//visibility:public"],
)
 
bin/browser/BUILD added: 451, removed: 79, total 372
@@ -4,11 +4,11 @@ cc_binary(
name = "browser",
srcs = ["main.cpp"],
linkopts = select({
"@bazel_tools//platforms:windows": [],
"@bazel_tools//platforms:linux": ["-lpthread"],
"@bazel_tools//platforms:windows": [],
}),
deps = [
"//parser",
"@asio",
"@pugixml",
],
)
 
bin/browser/main.cpp added: 451, removed: 79, total 372
@@ -1,5 +1,6 @@
#include "parser/parser.h"
 
#include <asio.hpp>
#include <pugixml.hpp>
 
#include <cassert>
#include <iostream>
@@ -8,6 +9,23 @@
 
using namespace std::string_literals;
 
namespace {
 
// Merges several callables into a single object whose call operator is the
// overload set of all of them; used below as the visitor passed to std::visit.
template<class... Visitors>
struct overloaded : Visitors... {
    using Visitors::operator()...;
};
 
// Recursively prints `node` and its whole subtree to std::cout.
//
// Each node is preceded by `depth` spaces and followed by a newline:
// "doctype: …", "tag: …" or "value: …" depending on what the node holds.
// A node holding std::monostate prints only its indentation.
//
// node:  root of the subtree to print. Taken by const reference so that the
//        recursion does not deep-copy every subtree once per nesting level.
// depth: nesting level of `node`, i.e. the number of leading spaces.
void print_node(dom::Node const &node, uint8_t depth = 0) {
    // Plain int counter: an int8_t counter can never reach a depth above 127.
    for (int i = 0; i < depth; ++i) { std::cout << " "; }

    std::visit(overloaded {
        [](std::monostate) {},
        [](dom::Doctype const &doctype) { std::cout << "doctype: " << doctype.doctype << '\n'; },
        [](dom::Element const &element) { std::cout << "tag: " << element.name << '\n'; },
        [](dom::Text const &text) { std::cout << "value: " << text.text << '\n'; },
    }, node.data);

    // Saturate at the largest uint8_t value instead of wrapping back to 0, so
    // extremely deep trees keep their maximum indentation.
    uint8_t const child_depth = depth < 0xFF ? static_cast<uint8_t>(depth + 1) : depth;
    for (auto const &child : node.children) { print_node(child, child_depth); }
}
 
std::string drop_http_headers(std::string html) {
const auto delim = "\r\n\r\n"s;
auto it = html.find(delim);
@@ -15,44 +33,7 @@ std::string drop_http_headers(std::string html) {
return html;
}
 
// Removes the first `<head>…</head>` section from `html` and returns the result.
//
// Both tags are matched literally (lowercase, without attributes). When the
// opening tag is missing, or no closing tag follows it, the input is returned
// unchanged rather than throwing std::out_of_range or erasing through the end
// of the document.
std::string drop_head(std::string html) {
    const std::string tag_start{"<head>"};
    const std::string tag_end{"</head>"};

    const auto head = html.find(tag_start);
    if (head == std::string::npos) { return html; }

    // Only a closing tag located after the opening tag can end the section.
    const auto head_end = html.find(tag_end, head + tag_start.size());
    if (head_end == std::string::npos) { return html; }

    html.erase(head, head_end - head + tag_end.size());
    return html;
}
 
// Strips a leading `<!doctype html>` declaration from `html`.
//
// The declaration is matched ASCII case-insensitively and must be the very
// first thing in the string. Any other input is returned unchanged, instead of
// blindly losing its first 15 characters.
std::string drop_doctype(std::string html) {
    const std::string doctype{"<!doctype html>"};
    if (html.size() < doctype.size()) { return html; }

    for (std::string::size_type i = 0; i < doctype.size(); ++i) {
        char c = html[i];
        // ASCII-only lowercasing is sufficient: the expected text is pure ASCII.
        if (c >= 'A' && c <= 'Z') { c = static_cast<char>(c - 'A' + 'a'); }
        if (c != doctype[i]) { return html; }
    }

    html.erase(0, doctype.size());
    return html;
}
 
// Flat record describing one visited XML node; TreeSaver fills one of these
// per node from the walker's depth and the node's type, name and value.
struct Node {
    int32_t depth = 0;    // nesting level reported by the walker
    int8_t type = 0;      // numeric node type as returned by xml_node::type()
    std::string name{};   // node name
    std::string value{};  // node value
};
 
// Flat list of the Node records collected during one traversal, in the order
// they were appended.
struct Tree {
    std::vector<Node> nodes{};
};
 
// pugixml tree walker that records every node it is handed into `tree`.
struct TreeSaver : pugi::xml_tree_walker {
// Receives one Node per for_each() call, appended in call order.
Tree tree;

// Walker callback: snapshots the walker's current depth together with the
// node's type, name and value.
bool for_each(pugi::xml_node &xml) override {
tree.nodes.push_back(Node{
.depth = depth(),
.type = xml.type(),
.name = xml.name(),
.value = xml.value(),
});

// NOTE(review): presumably `true` tells pugixml to keep traversing; confirm
// against the xml_tree_walker documentation.
return true;
}
};
} // namespace
 
int main(int argc, char **argv) {
asio::ip::tcp::iostream stream("www.example.com", "http");
@@ -67,24 +48,7 @@ int main(int argc, char **argv) {
auto buffer = ss.str();
 
buffer = drop_http_headers(buffer);
buffer = drop_head(buffer);
buffer = drop_doctype(buffer);
 
pugi::xml_document doc;
if (auto res = doc.load_string(buffer.c_str()); !res) {
std::cerr << res.offset << ": " << res.description() << '\n';
std::cerr << buffer.c_str() + res.offset;
return 1;
}
 
auto walker = TreeSaver{};
doc.traverse(walker);
 
for (const auto &node : walker.tree.nodes) {
for (int8_t i = 0; i < node.depth; ++i) {
std::cout << " ";
}
 
std::cout << "name=" << node.name << ", value=" << node.value << '\n';
}
auto nodes = parser::Parser{buffer}.parse_nodes();
for (auto const &node : nodes) { print_node(node); }
}
 
filename was Deleted added: 451, removed: 79, total 372
@@ -0,0 +1,7 @@
load("@rules_cc//cc:defs.bzl", "cc_library")
 
# Header-only library exposing dom.h to every package.
cc_library(
name = "dom",
hdrs = ["dom.h"],
visibility = ["//visibility:public"],
)
 
filename was Deleted added: 451, removed: 79, total 372
@@ -0,0 +1,43 @@
#ifndef DOM_H_
#define DOM_H_
 
#include <map>
#include <string>
#include <string_view>
#include <utility>
#include <variant>
#include <vector>
 
// Minimal DOM model: a tree of nodes, each holding a doctype, a text run or an
// element, plus factory helpers to build them.
namespace dom {

// Attribute name -> attribute value, ordered by name.
using AttrMap = std::map<std::string, std::string>;

// Payload of a doctype node; `doctype` is the text given to create_doctype_node().
struct Doctype { std::string doctype; };

// Payload of a text node.
struct Text { std::string text; };

// Payload of an element node: tag name plus its attributes.
struct Element {
    std::string name;
    AttrMap attributes;
};

// One tree node. `data` holds std::monostate for a default-constructed node.
struct Node {
    std::vector<Node> children;
    std::variant<std::monostate, Doctype, Text, Element> data;
};

// The factories below are `inline` because this header defines them. Without
// it, every translation unit that includes dom.h emits its own external
// definition, and linking two such units violates the one-definition rule.

// Returns a childless node holding Doctype{doctype}.
inline Node create_doctype_node(std::string_view doctype) {
    return {{}, Doctype{std::string{doctype}}};
}

// Returns a childless node holding Text{data}.
inline Node create_text_node(std::string_view data) {
    return {{}, Text{std::string{data}}};
}

// Returns an element node named `name` that takes ownership of `attrs` and `children`.
inline Node create_element_node(std::string_view name, AttrMap attrs, std::vector<Node> children) {
    return {std::move(children), Element{std::string{name}, std::move(attrs)}};
}

} // namespace dom
 
#endif
 
filename was Deleted added: 451, removed: 79, total 372
@@ -0,0 +1,18 @@
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
 
# Header-only HTML parser; builds on the DOM types from //dom.
cc_library(
    name = "parser",
    hdrs = ["parser.h"],
    visibility = ["//visibility:public"],
    deps = ["//dom"],
)
 
# Catch2 unit tests for the parser; the test entry point comes from catch2_with_main.
cc_test(
    name = "parser_test",
    size = "small",
    srcs = ["test_parser.cpp"],
    deps = [
        ":parser",
        "@catch2//:catch2_with_main",
    ],
)
 
filename was Deleted added: 451, removed: 79, total 372
@@ -0,0 +1,168 @@
#ifndef PARSER_H_
#define PARSER_H_
 
#include "dom/dom.h"
 
#include <array>
#include <cctype>
#include <cstddef>
#include <functional>
#include <string_view>
#include <utility>
#include <vector>
 
namespace parser {
 
// Inspired by
// https://github.com/servo/rust-cssparser/blob/02129220f848246ce8899f45a50d4b15068ebd79/src/tokenizer.rs
struct Parser {
// https://html.spec.whatwg.org/multipage/syntax.html#void-elements
static constexpr auto void_elements = std::to_array({
"area", "base", "br", "col", "embed",
"hr", "img", "input", "link", "meta",
"param", "source", "track", "wbr"});
 
constexpr bool is_void_element(std::string_view tag) {
return find(begin(void_elements), end(void_elements), tag) != end(void_elements);
}
 
constexpr char peek() const {
return input[pos];
}
 
constexpr std::string_view peek(std::size_t chars) const {
return input.substr(pos, chars);
}
 
constexpr bool starts_with(std::string_view prefix) const {
return peek(prefix.size()) == prefix;
}
 
constexpr bool is_eof() const {
return pos >= input.size();
}
 
constexpr char consume_char() {
return input[pos++];
}
 
constexpr void advance(std::size_t n) {
pos += n;
}
 
std::string_view consume_while(std::function<bool(char)> const &pred) {
std::size_t start = pos;
while (pred(input[pos])) { ++pos; }
return input.substr(start, pos - start);
}
 
constexpr void skip_whitespace() {
while (!is_eof() && std::isspace(peek())) { advance(1); }
}
 
std::string_view parse_tag_name() {
return consume_while([](char c) {
return (c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| (c >= '0' && c <= '9');
});
}
 
std::string_view parse_attr_value() {
auto open_quote = consume_char(); // ' or "
auto value = consume_while([=](char c) { return c != open_quote; });
consume_char(); // same as open_quote
return value;
}
 
std::pair<std::string, std::string> parse_attr() {
auto name = parse_tag_name();
consume_char(); // =
auto value = parse_attr_value();
return {std::string{name}, std::string{value}};
}
 
dom::AttrMap parse_attributes() {
dom::AttrMap attrs;
while (true) {
skip_whitespace();
if (peek() == '>' || starts_with("/>")) { break; }
attrs.insert(parse_attr());
}
 
return attrs;
}
 
dom::Node parse_doctype() {
consume_while([](char c) { return c != ' '; }); // <!doctype
skip_whitespace();
auto doctype = dom::create_doctype_node(consume_while([](char c) { return c != '>'; }));
consume_char(); // >
return doctype;
}
 
dom::Node parse_text() {
return dom::create_text_node(consume_while([](char c) { return c != '<'; }));
}
 
dom::Node parse_element() {
consume_char(); // <
auto name = parse_tag_name();
auto attrs = parse_attributes();
if (is_void_element(name)) {
if (consume_char() == '/') { // optional / or >
consume_char(); // >
}
return dom::create_element_node(name, std::move(attrs), {});
} else {
consume_char(); // >
}
 
auto children = parse_nodes();
consume_char(); // <
consume_char(); // /
parse_tag_name();
consume_char(); // >
return dom::create_element_node(name, std::move(attrs), std::move(children));
}
 
static constexpr auto no_case_compare = [](std::string_view a, std::string_view b) {
if (a.size() != b.size()) { return false; }
for (size_t i = 0; i < a.size(); ++i) {
if (std::tolower(a[i]) != std::tolower(b[i])) {
return false;
}
}
 
return true;
};
 
dom::Node parse_node() {
using namespace std::string_view_literals;
 
constexpr auto doctype_prefix = "<!doctype"sv;
auto peeked = peek(doctype_prefix.size());
if (no_case_compare(doctype_prefix, peeked)) { return parse_doctype(); }
if (peek() == '<') { return parse_element(); }
return parse_text();
}
 
std::vector<dom::Node> parse_nodes() {
using namespace std::string_view_literals;
 
std::vector<dom::Node> nodes;
while (!is_eof()) {
skip_whitespace();
if (is_eof() || starts_with("</"sv)) { break; }
nodes.push_back(parse_node());
}
return nodes;
}
 
std::string_view input;
std::size_t pos{0};
};
 
} // namespace parser
 
#endif
 
filename was Deleted added: 451, removed: 79, total 372
@@ -0,0 +1,173 @@
#include "parser/parser.h"
 
#include <catch2/catch.hpp>
 
using namespace std::literals;
 
namespace {
 
// Exercises parser::Parser::parse_nodes() on small HTML snippets and checks
// the shape of the resulting dom::Node trees.
TEST_CASE("parser", "[parser]") {
using parser::Parser;

// A lowercase doctype yields one childless Doctype node holding "html".
SECTION("doctype") {
Parser parser{"<!doctype html>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto doctype = nodes[0];
REQUIRE(doctype.children.size() == 0);
REQUIRE(std::get<dom::Doctype>(doctype.data).doctype == "html"s);
}

// The doctype keyword is recognised regardless of letter case.
SECTION("weirdly capitalized doctype") {
Parser parser{"<!docTYpe html>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto doctype = nodes[0];
REQUIRE(doctype.children.size() == 0);
REQUIRE(std::get<dom::Doctype>(doctype.data).doctype == "html"s);
}

// An empty element with start and end tag has no children and no attributes.
SECTION("single element") {
Parser parser{"<html></html>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto html = nodes[0];
REQUIRE(html.children.size() == 0);
REQUIRE(std::get<dom::Element>(html.data).name == "html"s);
REQUIRE(std::get<dom::Element>(html.data).attributes.size() == 0);
}

// A void element written without a trailing slash needs no end tag.
SECTION("self-closing single element") {
Parser parser{"<br>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto br = nodes[0];
REQUIRE(br.children.size() == 0);
REQUIRE(std::get<dom::Element>(br.data).name == "br"s);
REQUIRE(std::get<dom::Element>(br.data).attributes.size() == 0);
}

// A void element written with a trailing slash parses the same way.
SECTION("self-closing single element with slash") {
Parser parser{"<img/>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto img = nodes[0];
REQUIRE(img.children.size() == 0);
REQUIRE(std::get<dom::Element>(img.data).name == "img"s);
REQUIRE(std::get<dom::Element>(img.data).attributes.size() == 0);
}

// Two top-level siblings come back as two nodes, in document order.
SECTION("multiple elements") {
Parser parser{"<span></span><div></div>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 2);

auto span = nodes[0];
REQUIRE(span.children.size() == 0);
REQUIRE(std::get<dom::Element>(span.data).name == "span"s);
REQUIRE(std::get<dom::Element>(span.data).attributes.size() == 0);

auto div = nodes[1];
REQUIRE(div.children.size() == 0);
REQUIRE(std::get<dom::Element>(div.data).name == "div"s);
REQUIRE(std::get<dom::Element>(div.data).attributes.size() == 0);
}

// A nested element becomes a child of its enclosing element.
SECTION("nested elements") {
Parser parser{"<html><body></body></html>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto html = nodes[0];
REQUIRE(html.children.size() == 1);
REQUIRE(std::get<dom::Element>(html.data).name == "html"s);
REQUIRE(std::get<dom::Element>(html.data).attributes.size() == 0);

auto body = html.children[0];
REQUIRE(std::get<dom::Element>(body.data).name == "body"s);
REQUIRE(std::get<dom::Element>(body.data).attributes.size() == 0);
}

// Attribute values may be delimited by single quotes.
SECTION("single-quoted attribute") {
Parser parser{"<meta charset='utf-8'/>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto meta = nodes[0];
REQUIRE(meta.children.size() == 0);

auto meta_data = std::get<dom::Element>(meta.data);
REQUIRE(meta_data.name == "meta"s);
REQUIRE(meta_data.attributes.size() == 1);
REQUIRE(meta_data.attributes.at("charset") == "utf-8"s);
}

// Attribute values may be delimited by double quotes.
SECTION("double-quoted attribute") {
Parser parser{"<meta charset=\"utf-8\"/>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto meta = nodes[0];
REQUIRE(meta.children.size() == 0);

auto meta_data = std::get<dom::Element>(meta.data);
REQUIRE(meta_data.name == "meta"s);
REQUIRE(meta_data.attributes.size() == 1);
REQUIRE(meta_data.attributes.at("charset"s) == "utf-8"s);
}

// Several attributes on one tag; a quoted value may itself contain '=',
// ',' and spaces.
SECTION("multiple attributes") {
Parser parser{"<meta name=\"viewport\" content=\"width=100em, initial-scale=1\"/>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto meta = nodes[0];
REQUIRE(meta.children.size() == 0);

auto meta_data = std::get<dom::Element>(meta.data);
REQUIRE(meta_data.name == "meta"s);
REQUIRE(meta_data.attributes.size() == 2);
REQUIRE(meta_data.attributes.at("name"s) == "viewport"s);
REQUIRE(meta_data.attributes.at("content"s) == "width=100em, initial-scale=1"s);
}

// Attributes stay attached to their own element when elements are nested.
SECTION("multiple nodes with attributes") {
Parser parser{"<html bonus='hello'><body style='fancy'></body></html>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto html = nodes[0];
REQUIRE(html.children.size() == 1);
auto html_data = std::get<dom::Element>(html.data);
REQUIRE(html_data.name == "html"s);
REQUIRE(html_data.attributes.size() == 1);
REQUIRE(html_data.attributes.at("bonus"s) == "hello"s);

auto body = html.children[0];
auto body_data = std::get<dom::Element>(body.data);
REQUIRE(body_data.name == "body"s);
REQUIRE(body_data.attributes.size() == 1);
REQUIRE(body_data.attributes.at("style"s) == "fancy"s);
}

// Character data inside an element becomes a single Text child.
SECTION("text node") {
Parser parser{"<html>fantastic, the future is now</html>"sv};
auto nodes = parser.parse_nodes();
REQUIRE(nodes.size() == 1);

auto html = nodes[0];
REQUIRE(html.children.size() == 1);
REQUIRE(std::get<dom::Element>(html.data).name == "html"s);
REQUIRE(std::get<dom::Element>(html.data).attributes.size() == 0);

auto text = html.children[0];
REQUIRE(std::get<dom::Text>(text.data).text == "fantastic, the future is now"s);
}
}
 
} // namespace