Java爬虫|爬取晋江(jj)榜单数据并写入 Excel

大学的时候选过 Python 课,课设就是让我们用爬虫爬取数据、写入文件,然后再做数据分析(词云图、地图分类等)。Python 已经记不清了,现在用 Java 尝试一下爬取数据。

爬虫分为三个步骤:1. 获取自己电脑访问网站时的请求头;2. 确定目标网站的 URL;3. 从爬取到的网页返回内容中切分出有用的部分。

java 复制代码
package com.example.concurrent;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class BookLibrary {
    // URL of the target ranking page. Columns: index, author, title, type, progress, word count, score.
    private static final String TARGET_URL = "https://www.jjwxc.net/topten.php?orderstr=7&t=1";

    /**
     * Entry point: crawls the ranking list and exports it to an Excel workbook.
     *
     * @param args optional; a non-blank {@code args[0]} overrides the default output path
     */
    public static void main(String[] args) {
        // Allow the destination to be chosen on the command line; fall back to the
        // original hard-coded location so existing invocations behave as before.
        String outputPath = "/Users/xuejiangjing/Documents/novels.xlsx";
        if (args != null && args.length > 0 && !args[0].trim().isEmpty()) {
            outputPath = args[0];
        }

        List<Novel> novels = crawlRankList();
        writeNovelsToExcel(novels, outputPath);
    }



    /**
     * Writes the given novels to an {@code .xlsx} workbook at {@code outputPath}.
     *
     * <p>The workbook has a single sheet named "小说列表": row 0 holds the column headers and
     * each following row holds one novel, in list order. Missing parent directories of
     * {@code outputPath} are created before the file is opened.
     *
     * @param novelList  novels to export; must not be {@code null} (an empty list yields a
     *                   header-only sheet)
     * @param outputPath destination file path; must not be {@code null}
     * @throws NullPointerException if {@code novelList} or {@code outputPath} is {@code null}
     * @throws RuntimeException     wrapping the underlying {@link IOException} when the
     *                              directory or file cannot be created or written
     */
    public static void writeNovelsToExcel(List<Novel> novelList, String outputPath) {
        // Fail fast with a clear message instead of a bare NPE halfway through the export.
        if (novelList == null) {
            throw new NullPointerException("novelList must not be null");
        }
        if (outputPath == null) {
            throw new NullPointerException("outputPath must not be null");
        }

        // Column 7 is labelled "作品积分" (score): that is what Novel.votes holds according to
        // the field's own comment and the crawler's column list.
        // NOTE(review): the crawler's comment calls column 8 "截止时间"; confirm the "更新时间" label.
        String[] headers = {"排名", "作者", "书名", "类型", "进度", "总字数", "作品积分", "更新时间", "简介"};

        try (Workbook workbook = new XSSFWorkbook()) { // .xlsx workbook
            Sheet sheet = workbook.createSheet("小说列表");

            // Header row.
            Row headerRow = sheet.createRow(0);
            for (int i = 0; i < headers.length; i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(headers[i]);
            }

            // Data rows; the value order must match the header order above.
            int rowNum = 1;
            for (Novel novel : novelList) {
                String[] values = {
                        novel.getRank(),
                        novel.getAuthor(),
                        novel.getTitle(),
                        novel.getType(),
                        novel.getProgress(),
                        novel.getTotalNUm(),
                        novel.getVotes(),
                        novel.getTime(),
                        novel.getJianjie()
                };
                Row row = sheet.createRow(rowNum++);
                for (int i = 0; i < values.length; i++) {
                    row.createCell(i).setCellValue(values[i]);
                }
            }

            // Fit each column to its content (optional cosmetic step).
            for (int i = 0; i < headers.length; i++) {
                sheet.autoSizeColumn(i);
            }

            // FileOutputStream does not create missing directories, so do it explicitly.
            // mkdirs() returns false when the directory already exists, hence the second check.
            File parentDir = new File(outputPath).getAbsoluteFile().getParentFile();
            if (parentDir != null && !parentDir.mkdirs() && !parentDir.isDirectory()) {
                throw new IOException("Cannot create directory: " + parentDir);
            }

            try (FileOutputStream outputStream = new FileOutputStream(outputPath)) {
                workbook.write(outputStream);
            }
            // Report success only after the stream has been flushed and closed.
            System.out.println("Excel文件已生成:" + outputPath);
        } catch (IOException e) {
            throw new RuntimeException("Failed to write Excel file: " + outputPath, e);
        }
    }

    /**
     * Fetches the ranking page at {@code TARGET_URL} and parses its rows into {@link Novel}s.
     *
     * <p>This is best-effort: if the request fails, the stack trace is printed and the
     * novels parsed so far (possibly none) are returned. If no ranking table is found the
     * result is empty. After a successful parse the method sleeps for 8 seconds as a crude
     * rate limit; if the thread is interrupted during that sleep, the interrupt flag is
     * restored and the parsed novels are still returned.
     *
     * @return the parsed novels in page order; never {@code null}
     */
    public static List<Novel> crawlRankList() {
        List<Novel> novelList = new ArrayList<>();
        try {
            // 1. Mimic a browser request (basic anti-bot measure); replace the user agent
            //    with your own browser's value if needed.
            Document doc = Jsoup.connect(TARGET_URL)
                    .userAgent("Mozilla/5.0 Version/17.4 Safari")
                    .header("Accept-Language", "zh-CN,zh;q=0.9")
                    .timeout(10_000)
                    .get();

            // 2. Locate the ranking table (the selector depends on the page's current HTML).
            Element targetTable = findRankTable(doc);
            if (targetTable == null) {
                return novelList;
            }

            // 3. Parse each data row. Index 0 is the header row, which also consists of <td>.
            Elements rows = targetTable.select("tr:has(td)");
            for (int i = 1; i < rows.size(); i++) {
                Elements cols = rows.get(i).select("td");
                if (cols.size() < 8) {
                    continue; // not a full data row
                }
                novelList.add(parseRankRow(cols));
            }

            // 4. Throttle so that repeated calls do not hammer the site and get the IP banned.
            Thread.sleep(8000);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        return novelList;
    }

    /**
     * Returns the first candidate table whose first cell reads "序号", or {@code null} if
     * no table on the page matches.
     */
    private static Element findRankTable(Document doc) {
        Elements tables = doc.select("table[width=984][border=0][align=center][cellpadding=0][cellspacing=1][bgcolor=#009900]");
        for (Element table : tables) {
            Element firstTd = table.selectFirst("td:eq(0)");
            if (firstTd != null && firstTd.text().trim().equals("序号")) {
                return table;
            }
        }
        return null;
    }

    /**
     * Builds a {@link Novel} from the cells of one ranking row.
     *
     * <p>Cells 0 and 3-7 are read as plain text. The author (cell 1) and title (cell 2) come
     * from the first {@code <a>} in the cell and stay empty when there is none. The title
     * link's {@code rel} attribute is stored as the synopsis, with {@code <br>} turned into
     * line breaks.
     *
     * @param cols the row's {@code td} elements; the caller guarantees at least 8
     */
    private static Novel parseRankRow(Elements cols) {
        String author = "";
        Element authorLink = cols.get(1).selectFirst("a");
        if (authorLink != null) {
            author = authorLink.text().trim();
        } else {
            System.out.println("未找到<a>标签");
        }

        String bookName = "";
        String synopsis = "";
        Element bookLink = cols.get(2).selectFirst("a");
        if (bookLink != null) {
            bookName = bookLink.text().trim();
            // Literal replacement; no regular expression is needed here.
            synopsis = bookLink.attr("rel").replace("<br>", "\n");
        } else {
            System.out.println("未找到<a>标签");
        }

        return new Novel(
                cols.get(0).text(),
                author,
                bookName,
                cols.get(3).text(),
                cols.get(4).text(),
                cols.get(5).text(),
                cols.get(6).text(),
                cols.get(7).text(),
                synopsis);
    }

    /**
     * Mutable holder for one ranking row. All values are kept as the raw text taken
     * from the page: index, author, title, type, progress, word count, score, time, synopsis.
     */
    static class Novel {
        private String rank;     // index on the ranking list
        private String author;   // author name
        private String title;    // work title
        private String type;     // genre / type
        private String progress; // serialization progress
        private String totalNUm; // word count
        private String votes;    // score
        private String time;     // time column
        private String jianjie;  // synopsis

        /** Creates a novel with every field supplied, in ranking-column order. */
        public Novel(String rank, String author, String title, String type, String progress, String totalNUm, String votes, String time, String jianjie) {
            this.rank = rank;
            this.author = author;
            this.title = title;
            this.type = type;
            this.progress = progress;
            this.totalNUm = totalNUm;
            this.votes = votes;
            this.time = time;
            this.jianjie = jianjie;
        }

        // ---- accessors, grouped per field in declaration order ----

        public String getRank() {
            return rank;
        }

        public void setRank(String rank) {
            this.rank = rank;
        }

        public String getAuthor() {
            return author;
        }

        public void setAuthor(String author) {
            this.author = author;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getType() {
            return type;
        }

        public void setType(String type) {
            this.type = type;
        }

        public String getProgress() {
            return progress;
        }

        public void setProgress(String progress) {
            this.progress = progress;
        }

        public String getTotalNUm() {
            return totalNUm;
        }

        public void setTotalNUm(String totalNUm) {
            this.totalNUm = totalNUm;
        }

        public String getVotes() {
            return votes;
        }

        public void setVotes(String votes) {
            this.votes = votes;
        }

        public String getTime() {
            return time;
        }

        public void setTime(String time) {
            this.time = time;
        }

        public String getJianjie() {
            return jianjie;
        }

        public void setJianjie(String jianjie) {
            this.jianjie = jianjie;
        }

        /** Renders every field as {@code name='value'} inside {@code Novel{...}}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("Novel{");
            text.append("rank='").append(rank).append('\'');
            text.append(", author='").append(author).append('\'');
            text.append(", title='").append(title).append('\'');
            text.append(", type='").append(type).append('\'');
            text.append(", progress='").append(progress).append('\'');
            text.append(", totalNUm='").append(totalNUm).append('\'');
            text.append(", votes='").append(votes).append('\'');
            text.append(", time='").append(time).append('\'');
            text.append(", jianjie='").append(jianjie).append('\'');
            return text.append('}').toString();
        }
    }

}
相关推荐
沙子迷了蜗牛眼11 小时前
当展示列表使用 URL.createObjectURL 的创建临时图片、视频无法加载问题
java·前端·javascript·vue.js
ganshenml11 小时前
【Android】 开发四角版本全解析:AS、AGP、Gradle 与 JDK 的配套关系
android·java·开发语言
我命由我1234511 小时前
Kotlin 运算符 - == 运算符与 === 运算符
android·java·开发语言·java-ee·kotlin·android studio·android-studio
小途软件12 小时前
ssm327校园二手交易平台的设计与实现+vue
java·人工智能·pytorch·python·深度学习·语言模型
alonewolf_9912 小时前
Java类加载机制深度解析:从双亲委派到热加载实战
java·开发语言
追梦者12312 小时前
springboot整合minio
java·spring boot·后端
云游12 小时前
Jaspersoft Studio community edition 7.0.3的应用
java·报表
帅气的你12 小时前
Spring Boot 集成 AOP 实现日志记录与接口权限校验
java·spring boot
zhglhy12 小时前
Spring Data Slice使用指南
java·spring
win x12 小时前
Redis 主从复制
java·数据库·redis