There are many kinds of code you can use for data scraping. In the Java world, a library such as Jsoup makes it easy to fetch and parse web pages, while the Python ecosystem offers powerful frameworks like Scrapy. In this post I will use Java and the Jsoup library to build a simple, general-purpose crawler template that is designed to be extensible and easy to modify.

Below is a simple, general-purpose, and extensible crawler implemented in Java with Jsoup. It supports multi-level crawling, custom parsing rules, pluggable result storage, and concurrency control:
```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;

public class SimpleCrawler {

    // Crawler configuration (fluent builder style)
    public static class CrawlerConfig {
        private String startUrl;
        private int maxDepth = 1;
        private int timeoutMillis = 5000;
        private int maxPages = 100;
        private int maxConcurrency = 10;
        private List<DataExtractor> extractors = new ArrayList<>();
        private Function<String, Boolean> urlFilter = url -> true;

        public CrawlerConfig startUrl(String startUrl) {
            this.startUrl = startUrl;
            return this;
        }

        public CrawlerConfig maxDepth(int maxDepth) {
            this.maxDepth = maxDepth;
            return this;
        }

        public CrawlerConfig timeoutMillis(int timeoutMillis) {
            this.timeoutMillis = timeoutMillis;
            return this;
        }

        public CrawlerConfig maxPages(int maxPages) {
            this.maxPages = maxPages;
            return this;
        }

        public CrawlerConfig maxConcurrency(int maxConcurrency) {
            this.maxConcurrency = maxConcurrency;
            return this;
        }

        public CrawlerConfig addExtractor(DataExtractor extractor) {
            this.extractors.add(extractor);
            return this;
        }

        public CrawlerConfig urlFilter(Function<String, Boolean> urlFilter) {
            this.urlFilter = urlFilter;
            return this;
        }
    }

    // Data extraction interface
    public interface DataExtractor {
        String getName();
        void extract(Document doc, Map<String, Object> result);
        List<String> getLinks(Document doc);
    }

    // Result handling interface
    public interface ResultHandler {
        void handle(String url, Map<String, Object> data);
    }

    // Core crawler engine
    public static class CrawlerEngine {
        private final CrawlerConfig config;
        private final Set<String> visitedUrls = ConcurrentHashMap.newKeySet();
        private final Queue<PageTask> taskQueue = new ConcurrentLinkedQueue<>();
        // Tasks that are queued or currently being processed; workers stop when this drops to zero.
        private final AtomicInteger pendingTasks = new AtomicInteger(0);
        private final ExecutorService executor;
        private final ResultHandler resultHandler;

        public CrawlerEngine(CrawlerConfig config, ResultHandler resultHandler) {
            this.config = config;
            this.resultHandler = resultHandler;
            this.executor = Executors.newFixedThreadPool(config.maxConcurrency);
        }

        public void start() {
            taskQueue.add(new PageTask(config.startUrl, 0));
            pendingTasks.incrementAndGet();
            visitedUrls.add(config.startUrl);

            List<Future<?>> futures = new ArrayList<>();
            for (int i = 0; i < config.maxConcurrency; i++) {
                futures.add(executor.submit(this::processTasks));
            }

            // Wait for all workers to finish
            for (Future<?> future : futures) {
                try {
                    future.get();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                } catch (ExecutionException e) {
                    System.err.println("Worker failed: " + e.getCause());
                }
            }
            executor.shutdown();
        }

        private void processTasks() {
            while (pendingTasks.get() > 0 && visitedUrls.size() < config.maxPages) {
                PageTask task = taskQueue.poll();
                if (task == null) {
                    // Queue is momentarily empty, but other workers may still discover links; wait briefly.
                    try {
                        Thread.sleep(50);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        return;
                    }
                    continue;
                }
                try {
                    Document doc = Jsoup.connect(task.url)
                            .timeout(config.timeoutMillis)
                            .userAgent("Mozilla/5.0 (compatible; SimpleCrawler/1.0)")
                            .get();

                    // Extract data from the page
                    Map<String, Object> pageData = new HashMap<>();
                    for (DataExtractor extractor : config.extractors) {
                        extractor.extract(doc, pageData);
                    }
                    resultHandler.handle(task.url, pageData);

                    // Follow links to the next depth level
                    if (task.depth < config.maxDepth) {
                        for (DataExtractor extractor : config.extractors) {
                            for (String link : extractor.getLinks(doc)) {
                                String absUrl = makeAbsoluteUrl(task.url, link);
                                // add() returns false if another worker already claimed this URL
                                if (shouldVisit(absUrl) && visitedUrls.add(absUrl)) {
                                    taskQueue.add(new PageTask(absUrl, task.depth + 1));
                                    pendingTasks.incrementAndGet();
                                }
                            }
                        }
                    }
                } catch (Exception e) {
                    System.err.println("Error processing: " + task.url + " - " + e.getMessage());
                } finally {
                    pendingTasks.decrementAndGet();
                }
            }
        }

        private boolean shouldVisit(String url) {
            return url != null &&
                    !visitedUrls.contains(url) &&
                    config.urlFilter.apply(url) &&
                    visitedUrls.size() < config.maxPages;
        }

        private String makeAbsoluteUrl(String baseUrl, String relativeUrl) {
            try {
                return new java.net.URL(new java.net.URL(baseUrl), relativeUrl).toString();
            } catch (Exception e) {
                return null;
            }
        }

        private static class PageTask {
            final String url;
            final int depth;

            PageTask(String url, int depth) {
                this.url = url;
                this.depth = depth;
            }
        }
    }

    // Example usage
    public static void main(String[] args) {
        // 1. Build the configuration
        CrawlerConfig config = new CrawlerConfig()
                .startUrl("https://example.com")
                .maxDepth(2)
                .maxPages(50)
                .maxConcurrency(5)
                .urlFilter(url -> url.startsWith("https://example.com"))
                .addExtractor(new TitleExtractor())
                .addExtractor(new LinkExtractor("a[href]", "href"))
                .addExtractor(new ContentExtractor("div.content"));

        // 2. Create a result handler
        ResultHandler consoleHandler = (url, data) -> {
            System.out.println("\nURL: " + url);
            data.forEach((key, value) -> System.out.println(key + ": " + value));
        };

        // 3. Start the crawler
        new CrawlerEngine(config, consoleHandler).start();
    }

    // Example extractor: page title
    static class TitleExtractor implements DataExtractor {
        @Override
        public String getName() { return "title"; }

        @Override
        public void extract(Document doc, Map<String, Object> result) {
            String title = doc.title();
            if (title != null && !title.isEmpty()) {
                result.put(getName(), title);
            }
        }

        @Override
        public List<String> getLinks(Document doc) {
            return Collections.emptyList(); // this extractor does not contribute links
        }
    }

    // Example extractor: links
    static class LinkExtractor implements DataExtractor {
        private final String selector;
        private final String attr;

        LinkExtractor(String selector, String attr) {
            this.selector = selector;
            this.attr = attr;
        }

        @Override
        public String getName() { return "links"; }

        @Override
        public void extract(Document doc, Map<String, Object> result) {
            // Extracted links are usually not stored in the result
        }

        @Override
        public List<String> getLinks(Document doc) {
            List<String> links = new ArrayList<>();
            Elements elements = doc.select(selector);
            for (Element el : elements) {
                String link = el.attr("abs:" + attr);
                if (!link.isEmpty()) links.add(link);
            }
            return links;
        }
    }

    // Example extractor: main content
    static class ContentExtractor implements DataExtractor {
        private final String selector;

        ContentExtractor(String selector) {
            this.selector = selector;
        }

        @Override
        public String getName() { return "content"; }

        @Override
        public void extract(Document doc, Map<String, Object> result) {
            Elements elements = doc.select(selector);
            if (!elements.isEmpty()) {
                result.put(getName(), elements.first().text());
            }
        }

        @Override
        public List<String> getLinks(Document doc) {
            return Collections.emptyList();
        }
    }
}
```
Core design features:
- Modular design:
  - `CrawlerConfig`: centralizes all crawler settings
  - `DataExtractor`: extensible data-extraction interface
  - `ResultHandler`: result-handling interface
  - `CrawlerEngine`: core crawling logic
- Extensibility:
  - Add new parsing rules by implementing `DataExtractor`
  - Support different output targets (files, databases, etc.) by implementing `ResultHandler` (see the sketch after this list)
  - Customize URL filtering through the `urlFilter` function
- Concurrency control:
  - A thread pool manages concurrent requests
  - A `ConcurrentHashMap`-backed set keeps the visited-URL set thread-safe
  - A `ConcurrentLinkedQueue` serves as the task queue
- Robustness:
  - Connection timeouts
  - URL normalization
  - Exception handling
  - Maximum page limit
- Configuration options:
  - Crawl depth
  - Maximum page count
  - Number of concurrent threads
  - Request timeout
  - Custom URL filtering
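As an example of the `ResultHandler` extension point, here is a minimal sketch of a handler that appends one line per crawled page to a local file. The class name, file path, and tab-separated line format are choices made for this sketch rather than part of the framework above, and it relies on `Files.writeString`, which requires Java 11+.

```java
// Hypothetical file-based handler (another static nested class inside SimpleCrawler).
// Requires: import java.io.IOException; import java.nio.file.*;
static class FileResultHandler implements ResultHandler {
    private final Path outputFile;

    FileResultHandler(String fileName) {
        this.outputFile = Paths.get(fileName);
    }

    @Override
    public synchronized void handle(String url, Map<String, Object> data) {
        // One tab-separated line per page; synchronized because several workers call this concurrently.
        String line = url + "\t" + data + System.lineSeparator();
        try {
            Files.writeString(outputFile, line,
                    StandardOpenOption.CREATE, StandardOpenOption.APPEND);
        } catch (IOException e) {
            System.err.println("Failed to write result for " + url + ": " + e.getMessage());
        }
    }
}
```

It can then be passed to the engine in place of `consoleHandler`, e.g. `new CrawlerEngine(config, new FileResultHandler("results.tsv")).start();`.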
Usage example:
```java
public static void main(String[] args) {
    // Build the crawler configuration
    CrawlerConfig config = new CrawlerConfig()
            .startUrl("https://news.example.com")
            .maxDepth(3)
            .maxPages(100)
            .urlFilter(url -> url.contains("/articles/"))
            .addExtractor(new TitleExtractor())
            .addExtractor(new LinkExtractor("a.article-link", "href"))
            .addExtractor(new AuthorExtractor("span.author"))   // custom extractor
            .addExtractor(new DateExtractor("time.published")); // custom extractor

    // Create a result handler (can be swapped for database storage)
    ResultHandler dbHandler = (url, data) -> {
        // Implement the database storage logic here
        System.out.println("Saving to DB: " + url);
    };

    // Start the crawler
    new CrawlerEngine(config, dbHandler).start();
}
```
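The `dbHandler` above leaves the actual storage step as a placeholder. One possible way to fill it in is plain JDBC, as sketched below; the table name `pages`, its columns, and the idea of opening a connection per page are assumptions made for the sketch (a connection pool would be preferable in practice).

```java
// Hypothetical JDBC-backed handler. Assumes a table pages(url, title, content) already exists;
// adapt the SQL and the JDBC URL to your own schema and driver.
// Requires: import java.sql.*;
static class JdbcResultHandler implements ResultHandler {
    private final String jdbcUrl;

    JdbcResultHandler(String jdbcUrl) {
        this.jdbcUrl = jdbcUrl;
    }

    @Override
    public void handle(String url, Map<String, Object> data) {
        String sql = "INSERT INTO pages (url, title, content) VALUES (?, ?, ?)";
        // Opens a connection per page for simplicity; use a connection pool in real code.
        try (Connection conn = DriverManager.getConnection(jdbcUrl);
             PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setString(2, String.valueOf(data.get("title")));
            ps.setString(3, String.valueOf(data.get("content")));
            ps.executeUpdate();
        } catch (SQLException e) {
            System.err.println("DB insert failed for " + url + ": " + e.getMessage());
        }
    }
}
```

It is passed to the engine exactly like the console handler, with a driver-specific JDBC connection URL.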
Custom extractor example:
```java
// Author extractor
static class AuthorExtractor implements DataExtractor {
    private final String selector;

    AuthorExtractor(String selector) {
        this.selector = selector;
    }

    @Override
    public String getName() { return "author"; }

    @Override
    public void extract(Document doc, Map<String, Object> result) {
        Element author = doc.selectFirst(selector);
        if (author != null) {
            result.put(getName(), author.text());
        }
    }

    @Override
    public List<String> getLinks(Document doc) {
        return Collections.emptyList();
    }
}
```
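The usage example also registers a `DateExtractor`, which is not shown above. Following the same pattern as `AuthorExtractor`, a minimal version might look like the sketch below; the result key `publishedDate` and the preference for the `datetime` attribute of `<time>` elements are choices made for this sketch.

```java
// Hypothetical publication-date extractor, mirroring AuthorExtractor.
// It stores the raw string; parsing it into a LocalDate is left to the caller.
static class DateExtractor implements DataExtractor {
    private final String selector;

    DateExtractor(String selector) {
        this.selector = selector;
    }

    @Override
    public String getName() { return "publishedDate"; }

    @Override
    public void extract(Document doc, Map<String, Object> result) {
        Element dateEl = doc.selectFirst(selector);
        if (dateEl != null) {
            // Prefer the datetime attribute of <time> elements when present.
            String value = dateEl.hasAttr("datetime") ? dateEl.attr("datetime") : dateEl.text();
            result.put(getName(), value);
        }
    }

    @Override
    public List<String> getLinks(Document doc) {
        return Collections.emptyList();
    }
}
```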
Best-practice recommendations:
- Respect robots.txt: add robots.txt parsing before using this on real sites
- Rate limiting: add delays between requests to avoid being blocked (see the sketch after this list)
- Error handling: strengthen network exception handling and add a retry mechanism
- Proxy support: add proxy rotation
- Deduplication: use a Bloom filter to make URL deduplication more memory-efficient
- Distributed scaling: for large-scale crawling, evolve the design into a distributed architecture
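As a rough illustration of the rate-limiting and retry suggestions, the Jsoup call in `processTasks` could be replaced by a helper along these lines. The 1-second delay and the limit of 3 attempts are arbitrary example values, and a per-host rate limiter would be more polite than a fixed global delay.

```java
// Sketch of a polite fetch helper: fixed delay between requests plus simple retries.
private Document fetchWithPoliteness(String url, int timeoutMillis) throws InterruptedException {
    int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        Thread.sleep(1000); // crude per-request delay; a per-host rate limiter would be better
        try {
            return Jsoup.connect(url)
                    .timeout(timeoutMillis)
                    .userAgent("Mozilla/5.0 (compatible; SimpleCrawler/1.0)")
                    .get();
        } catch (java.io.IOException e) {
            System.err.println("Attempt " + attempt + " failed for " + url + ": " + e.getMessage());
        }
    }
    return null; // caller should skip the page when all attempts fail
}
```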
This crawler skeleton provides a solid base structure. In real projects it can be extended with whatever else the task requires, for example JavaScript rendering support (using Selenium or HtmlUnit), automatic pagination, CAPTCHA handling, or more sophisticated scheduling algorithms. Always make sure your crawling complies with the target site's policies.
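As one possible starting point for the JavaScript-rendering idea, the sketch below lets HtmlUnit execute a page's scripts and then hands the rendered HTML to Jsoup, so the existing `DataExtractor` implementations keep working. It assumes the HtmlUnit 2.x artifact (package `com.gargoylesoftware.htmlunit`) is on the classpath; in HtmlUnit 3.x the package is `org.htmlunit` instead.

```java
// Sketch only: render a JavaScript-heavy page with HtmlUnit, then parse the result with Jsoup.
// Requires: import com.gargoylesoftware.htmlunit.WebClient;
//           import com.gargoylesoftware.htmlunit.html.HtmlPage;
static Document fetchRendered(String url) throws Exception {
    try (WebClient webClient = new WebClient()) {
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        HtmlPage page = webClient.getPage(url);
        // Parse the rendered DOM with Jsoup so the existing extractors can be reused unchanged.
        return Jsoup.parse(page.asXml(), url);
    }
}
```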