大学的时候选修过 Python 课，课程设计就是让我们用爬虫爬取数据、写入文件，然后再做数据分析（词云图、地图分类等）。Python 已经记不清了，现在用 Java 尝试一下爬取数据。
爬虫分为三个步骤：1. 获取自己电脑访问网站时的请求头；2. 确定目标网站的 URL；3. 对爬取到的网页返回值进行解析，切分出有用的部分。
java
package com.example.concurrent;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
public class BookLibrary {
// URL of the ranking list to crawl. Columns noted by the original author:
// rank, author, title, genre, progress, word count, work score.
private static final String TARGET_URL = "https://www.jjwxc.net/topten.php?orderstr=7&t=1";
/**
 * Program entry point: crawls the ranking list and exports the result to an .xlsx file.
 *
 * @param args optional; when {@code args[0]} is present and non-blank it is used as the
 *             output path, otherwise the original default path is used
 */
public static void main(String[] args) {
    // Default kept unchanged for backward compatibility; it is specific to the
    // original author's machine, hence the command-line override below.
    String outputPath = "/Users/xuejiangjing/Documents/novels.xlsx";
    if (args != null && args.length > 0 && args[0] != null && !args[0].trim().isEmpty()) {
        outputPath = args[0].trim();
    }

    List<Novel> novels = crawlRankList();
    writeNovelsToExcel(novels, outputPath);
}
/**
 * Writes the given novels to a single-sheet .xlsx workbook: one header row followed by
 * one row per novel, with every value stored as text.
 *
 * <p>A {@code null} list is treated as empty (a header-only file is produced) and
 * {@code null} elements are skipped. Text longer than the Excel per-cell limit is truncated,
 * because POI rejects over-long strings with an {@link IllegalArgumentException}.
 *
 * @param novelList  novels to export; may be {@code null} or empty
 * @param outputPath path of the .xlsx file to create or overwrite
 * @throws RuntimeException if the workbook cannot be written; the cause is the
 *                          underlying {@link IOException}
 */
public static void writeNovelsToExcel(List<Novel> novelList, String outputPath) {
    try (Workbook workbook = new XSSFWorkbook()) {
        Sheet sheet = workbook.createSheet("小说列表");

        // NOTE(review): other comments in this file describe the 7th column as the work's
        // score (作品积分) rather than a vote count — confirm the "投票数" label.
        String[] headers = {"排名", "作者", "书名", "类型", "进度", "总字数", "投票数", "更新时间", "简介"};
        Row headerRow = sheet.createRow(0);
        for (int i = 0; i < headers.length; i++) {
            Cell cell = headerRow.createCell(i);
            cell.setCellValue(headers[i]);
        }

        int rowNum = 1;
        if (novelList != null) {
            for (Novel novel : novelList) {
                if (novel == null) {
                    continue; // nothing to write for a missing entry
                }
                // Order must match the header array above.
                String[] values = {
                    novel.getRank(),
                    novel.getAuthor(),
                    novel.getTitle(),
                    novel.getType(),
                    novel.getProgress(),
                    novel.getTotalNUm(),
                    novel.getVotes(),
                    novel.getTime(),
                    novel.getJianjie()
                };
                Row row = sheet.createRow(rowNum++);
                for (int col = 0; col < values.length; col++) {
                    row.createCell(col).setCellValue(clampCellText(values[col]));
                }
            }
        }

        // Optional: fit each column to its content.
        for (int i = 0; i < headers.length; i++) {
            sheet.autoSizeColumn(i);
        }

        try (FileOutputStream outputStream = new FileOutputStream(outputPath)) {
            workbook.write(outputStream);
        }
        // Report success only after the stream has been flushed and closed.
        System.out.println("Excel文件已生成:" + outputPath);
    } catch (IOException e) {
        throw new RuntimeException("Failed to write Excel file: " + outputPath, e);
    }
}

/**
 * Truncates {@code text} to the Excel limit of 32,767 characters per cell.
 * {@code null} is passed through unchanged.
 */
private static String clampCellText(String text) {
    final int maxCellChars = 32_767;
    if (text == null || text.length() <= maxCellChars) {
        return text;
    }
    int end = maxCellChars;
    // Do not cut a surrogate pair in half.
    if (Character.isHighSurrogate(text.charAt(end - 1))) {
        end--;
    }
    return text.substring(0, end);
}
/**
 * Fetches the ranking page at {@code TARGET_URL} and converts every data row of the
 * ranking table into a {@link Novel}.
 *
 * <p>This is best-effort: an I/O failure is reported on stderr and the (empty) list is
 * returned instead of throwing. The method never returns {@code null}.
 *
 * @return the parsed novels in page order; empty if the page could not be fetched or the
 *         ranking table was not found
 */
public static List<Novel> crawlRankList() {
    List<Novel> novelList = new ArrayList<>();
    try {
        // 1. Send browser-like headers; replace the user agent with your own if needed.
        Document doc = Jsoup.connect(TARGET_URL)
                .userAgent("Mozilla/5.0 Version/17.4 Safari")
                .header("Accept-Language", "zh-CN,zh;q=0.9")
                .timeout(10_000)
                .get();

        // 2. Locate the ranking table.
        Element targetTable = findRankTable(doc);
        if (targetTable == null) {
            return novelList;
        }

        // 3. Parse the rows. The first matched row is expected to be the header, so start at 1.
        Elements rows = targetTable.select("tr:has(td)");
        for (int i = 1; i < rows.size(); i++) {
            Elements cols = rows.get(i).select("td");
            if (cols.size() < 8) {
                continue; // not a full data row
            }
            novelList.add(parseNovelRow(cols));
        }
    } catch (IOException e) {
        System.err.println("Failed to fetch ranking page: " + TARGET_URL);
        e.printStackTrace();
        return novelList;
    }

    // 4. Politeness delay kept from the original.
    // NOTE(review): only one request is issued here, so the delay matters only when this
    // method is called repeatedly — consider moving it to the caller.
    try {
        Thread.sleep(8000);
    } catch (InterruptedException e) {
        // Restore the flag so callers can observe the interruption; the data is still returned.
        Thread.currentThread().interrupt();
    }
    return novelList;
}

/**
 * Returns the first candidate table whose first cell reads "序号", or {@code null} if no
 * such table exists in the document.
 */
private static Element findRankTable(Document doc) {
    Elements tables = doc.select("table[width=984][border=0][align=center][cellpadding=0][cellspacing=1][bgcolor=#009900]");
    for (Element table : tables) {
        Element firstTd = table.selectFirst("td:eq(0)");
        if (firstTd != null && "序号".equals(firstTd.text().trim())) {
            return table;
        }
    }
    return null;
}

/**
 * Builds a {@link Novel} from the cells of one table row. The caller guarantees at least
 * eight cells. Cell order: rank, author, title, genre, progress, word count, score, time.
 */
private static Novel parseNovelRow(Elements cols) {
    String rank = cols.get(0).text();

    String author = "";
    Element authorLink = cols.get(1).selectFirst("a");
    if (authorLink != null) {
        author = authorLink.text().trim();
    } else {
        System.out.println("未找到<a>标签");
    }

    String bookName = "";
    String synopsis = "";
    Element titleLink = cols.get(2).selectFirst("a");
    if (titleLink != null) {
        bookName = titleLink.text().trim();
        // The synopsis is read from the link's rel attribute; turn every <br> variant
        // (<br>, <br/>, <br />, any letter case) into a line break.
        synopsis = titleLink.attr("rel").replaceAll("(?i)<br\\s*/?>", "\n");
    } else {
        System.out.println("未找到<a>标签");
    }

    String type = cols.get(3).text();
    String progress = cols.get(4).text();
    String totalNUm = cols.get(5).text();
    String votes = cols.get(6).text();
    String time = cols.get(7).text();
    return new Novel(rank, author, bookName, type, progress, totalNUm, votes, time, synopsis);
}
/**
 * Mutable value holder for one row of the ranking list. Every field is kept as the raw
 * text taken from the page: rank, author, title, genre, progress, word count, score,
 * time and synopsis.
 */
static class Novel {
    /** Rank number. */
    private String rank;
    /** Author name. */
    private String author;
    /** Title of the work. */
    private String title;
    /** Genre. */
    private String type;
    /** Progress. */
    private String progress;
    /** Word count. */
    private String totalNUm;
    /** Score. */
    private String votes;
    /** Time. */
    private String time;
    /** Synopsis. */
    private String jianjie;

    /** Creates a novel with all nine fields set, in the same order as the fields above. */
    public Novel(String rank, String author, String title, String type, String progress, String totalNUm, String votes, String time, String jianjie) {
        this.rank = rank;
        this.author = author;
        this.title = title;
        this.type = type;
        this.progress = progress;
        this.totalNUm = totalNUm;
        this.votes = votes;
        this.time = time;
        this.jianjie = jianjie;
    }

    // ---- accessors, in field order ----

    public String getRank() {
        return this.rank;
    }

    public void setRank(String rank) {
        this.rank = rank;
    }

    public String getAuthor() {
        return this.author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getProgress() {
        return this.progress;
    }

    public void setProgress(String progress) {
        this.progress = progress;
    }

    public String getTotalNUm() {
        return this.totalNUm;
    }

    public void setTotalNUm(String totalNUm) {
        this.totalNUm = totalNUm;
    }

    public String getVotes() {
        return this.votes;
    }

    public void setVotes(String votes) {
        this.votes = votes;
    }

    public String getTime() {
        return this.time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getJianjie() {
        return this.jianjie;
    }

    public void setJianjie(String jianjie) {
        this.jianjie = jianjie;
    }

    /** Renders all fields as {@code Novel{name='value', ...}}; null fields print as "null". */
    @Override
    public String toString() {
        return String.format(
                "Novel{rank='%s', author='%s', title='%s', type='%s', progress='%s', totalNUm='%s', votes='%s', time='%s', jianjie='%s'}",
                rank, author, title, type, progress, totalNUm, votes, time, jianjie);
    }
}
}