测试
cpp
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>

#include "../src/gumbo.h"
// Extracts the plain-text content of a parse-tree node.
//
// - A text node yields a copy of its text.
// - An element other than <script> or <style> yields the text of its
//   children, concatenated in order. A single space is inserted in front of
//   every non-empty child result, except for the result of the first child.
// - Every other node yields an empty string.
static std::string cleantext(GumboNode* node) {
  if (node->type == GUMBO_NODE_TEXT) {
    return std::string(node->v.text.text);
  }

  // The tag is only inspected once the node is known to be an element.
  const bool skipped = node->type != GUMBO_NODE_ELEMENT ||
                       node->v.element.tag == GUMBO_TAG_SCRIPT ||
                       node->v.element.tag == GUMBO_TAG_STYLE;
  if (skipped) {
    return std::string();
  }

  std::string joined;
  const GumboVector& kids = node->v.element.children;
  for (unsigned int idx = 0; idx < kids.length; ++idx) {
    const std::string piece =
        cleantext(static_cast<GumboNode*>(kids.data[idx]));
    if (idx > 0 && !piece.empty()) {
      joined += ' ';
    }
    joined += piece;
  }
  return joined;
}
// Prints the href value of every <a> element in the subtree rooted at
// `node`, one per line on standard output. A node is reported before its
// descendants. Non-element nodes are ignored.
static void search_for_links(GumboNode* node) {
  if (node->type != GUMBO_NODE_ELEMENT) {
    return;
  }

  // The attribute lookup only happens for anchor elements.
  if (node->v.element.tag == GUMBO_TAG_A) {
    GumboAttribute* link =
        gumbo_get_attribute(&node->v.element.attributes, "href");
    if (link) {
      std::cout << link->value << std::endl;
    }
  }

  const GumboVector& kids = node->v.element.children;
  for (unsigned int idx = 0; idx != kids.length; ++idx) {
    search_for_links(static_cast<GumboNode*>(kids.data[idx]));
  }
}
// Find the document title
#include <assert.h>
// Returns the text of the document's <title> element.
//
// `root` must be an element node with at least two children (asserted); the
// first child element tagged <head> is searched for a <title> child.
//
// Returns:
//   - the text of the title's single child node,
//   - "<empty title>" when the <title> element does not have exactly one
//     child,
//   - "<no title found>" when <head> has no <title> child (or, in builds
//     with assertions disabled, when no <head> element exists).
//
// The returned pointer is either a string literal or the text pointer stored
// in the title's child node; presumably the latter is only valid while the
// parse tree is alive -- confirm against the Gumbo documentation.
static const char* find_title(const GumboNode* root) {
  assert(root->type == GUMBO_NODE_ELEMENT);
  assert(root->v.element.children.length >= 2);

  // Locate the <head> element among the root's direct children.
  // GumboVector::length is unsigned, so the index is unsigned as well.
  const GumboVector* root_children = &root->v.element.children;
  const GumboNode* head = NULL;
  for (unsigned int i = 0; i < root_children->length; ++i) {
    const GumboNode* child =
        static_cast<const GumboNode*>(root_children->data[i]);
    if (child->type == GUMBO_NODE_ELEMENT &&
        child->v.element.tag == GUMBO_TAG_HEAD) {
      head = child;
      break;
    }
  }
  assert(head != NULL);
  // The assert above vanishes under NDEBUG; fall back to the "not found"
  // result instead of dereferencing a null pointer in such builds.
  if (head == NULL) {
    return "<no title found>";
  }

  const GumboVector* head_children = &head->v.element.children;
  for (unsigned int i = 0; i < head_children->length; ++i) {
    const GumboNode* child =
        static_cast<const GumboNode*>(head_children->data[i]);
    if (child->type == GUMBO_NODE_ELEMENT &&
        child->v.element.tag == GUMBO_TAG_TITLE) {
      if (child->v.element.children.length != 1) {
        return "<empty title>";
      }
      const GumboNode* title_text =
          static_cast<const GumboNode*>(child->v.element.children.data[0]);
      assert(title_text->type == GUMBO_NODE_TEXT ||
             title_text->type == GUMBO_NODE_WHITESPACE);
      return title_text->v.text.text;
    }
  }
  return "<no title found>";
}
// Returns an excerpt of the source line on which `attr`'s value starts.
//
// `attr.original_value.data` is expected to point into `original_text`; its
// offset from the start of the text is used as the anchor. The excerpt is the
// line containing the anchor (without the terminating '\n'), limited to at
// most 40 characters before and 40 characters after the anchor.
//
// Returns an empty string when the anchor does not fall inside
// `original_text` or when the resulting range is empty.
static std::string find_line(
    const std::string& original_text, const GumboAttribute& attr) {
  const size_t kContext = 40;

  const size_t attr_index =
      static_cast<size_t>(attr.original_value.data - original_text.data());
  // A pointer from a different buffer produces an offset outside the text
  // (a negative difference wraps to a huge value); refuse it rather than
  // letting substr() throw.
  if (attr_index >= original_text.size()) {
    return std::string();
  }

  // Start of the line: one past the previous newline, or the start of text.
  const size_t prev_newline = original_text.rfind('\n', attr_index);
  size_t begin = (prev_newline == std::string::npos) ? 0 : prev_newline + 1;

  // End of the line (exclusive): the next newline, or the end of text.
  // Using an exclusive end keeps the last character of the line in the
  // excerpt, since substr() takes a count, not an inclusive end index.
  size_t end = original_text.find('\n', attr_index);
  if (end == std::string::npos) {
    end = original_text.size();
  }

  // Clamp to the context window. The subtraction is only performed when it
  // cannot wrap around, i.e. when the anchor is more than kContext bytes in.
  if (attr_index > kContext) {
    begin = std::max(begin, attr_index - kContext);
  }
  end = std::min(end, attr_index + kContext);

  // The anchor itself may sit on a newline, which leaves end before begin.
  if (end <= begin) {
    return std::string();
  }
  return original_text.substr(begin, end - begin);
}
// Reports every element in the subtree rooted at `node` whose class
// attribute value contains `cls_name`.
//
// For each match one line is written to standard output in the form
//   <line>:<column> - <source excerpt>
// where line/column come from the attribute's value_start position and the
// excerpt is produced by find_line() from `original_text`.
//
// NOTE(review): matching uses strstr(), i.e. a substring test on the whole
// attribute value, so "article" also matches class="article-footer".
// Confirm whether whole-token matching is wanted.
//
// Non-element nodes are ignored. A null `cls_name` matches nothing.
static void search_for_class(
    GumboNode* node, const std::string& original_text, const char* cls_name) {
  // strstr() must not be given a null needle, so bail out early.
  if (node->type != GUMBO_NODE_ELEMENT || cls_name == NULL) {
    return;
  }

  GumboAttribute* cls_attr =
      gumbo_get_attribute(&node->v.element.attributes, "class");
  if (cls_attr != NULL && strstr(cls_attr->value, cls_name) != NULL) {
    std::cout << cls_attr->value_start.line << ":"
              << cls_attr->value_start.column << " - "
              << find_line(original_text, *cls_attr) << std::endl;
  }

  // GumboVector::length is unsigned, so the index is unsigned as well.
  GumboVector* children = &node->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    search_for_class(
        static_cast<GumboNode*>(children->data[i]), original_text, cls_name);
  }
}
// Entry point: reads the HTML file named by the single command-line
// argument, parses it with Gumbo's default options and prints the document's
// plain text (see cleantext) to standard output.
//
// Returns EXIT_FAILURE, after printing a diagnostic to standard error, when
// the argument count is wrong or the file cannot be opened, sized or read.
int main(int argc, char** argv) {
  if (argc != 2) {
    std::cerr << "Usage: clean_text <html filename>\n";
    return EXIT_FAILURE;
  }
  const char* filename = argv[1];

  std::ifstream in(filename, std::ios::in | std::ios::binary);
  if (!in) {
    std::cerr << "File " << filename << " not found!\n";
    return EXIT_FAILURE;
  }

  // Determine the file size by seeking to the end. tellg() reports -1 on
  // failure, which must not be passed on to resize().
  in.seekg(0, std::ios::end);
  const std::streamoff file_size = in.tellg();
  if (file_size < 0) {
    std::cerr << "Could not determine the size of " << filename << "\n";
    return EXIT_FAILURE;
  }

  // Read the whole file into memory in one go.
  std::string contents;
  contents.resize(static_cast<size_t>(file_size));
  in.seekg(0, std::ios::beg);
  if (!contents.empty()) {
    in.read(&contents[0], static_cast<std::streamsize>(contents.size()));
  }
  if (!in) {
    std::cerr << "Failed to read " << filename << "\n";
    return EXIT_FAILURE;
  }
  in.close();

  GumboOutput* output = gumbo_parse_with_options(
      &kGumboDefaultOptions, contents.data(), contents.length());
  std::cout << cleantext(output->root) << std::endl;
  // Alternative demos; uncomment to run them on the same parse tree.
  //search_for_links(output->root);
  //find_title(output->root);
  //const char* cls = "article";
  //search_for_class(output->root, contents, cls);
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  return EXIT_SUCCESS;
}
效果
参考
GitHub - google/gumbo-parser: An HTML5 parsing library in pure C99