Java爬虫|爬取jj(晋江文学城)榜单数据并写入Excel

大学的时候选了Python课,课设就是让我们用爬虫爬取数据、写入文件,然后再做数据分析(词云图、地图分类等)。Python已经记不清了,现在用Java尝试一下爬取数据。

爬虫分为三个步骤:1. 获取你自己电脑访问网站时的请求头(如 User-Agent);2. 确定目标网站的URL;3. 对爬取到的网页返回内容进行解析,提取出有用的部分。

java 复制代码
package com.example.concurrent;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.FileOutputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.List;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

public class BookLibrary {
    // URL of the target ranking page. Table columns: rank, author, title, type, progress, word count, score.
    private static final String TARGET_URL = "https://www.jjwxc.net/topten.php?orderstr=7&t=1";

    /**
     * Crawls the ranking list and exports it to an Excel workbook.
     *
     * @param args optional; when {@code args[0]} is present and non-blank it is used as the
     *             output {@code .xlsx} path, otherwise the built-in default path is used
     */
    public static void main(String[] args) {
        // The default is the previously hard-coded location, so running without
        // arguments behaves exactly as before.
        String outputPath = "/Users/xuejiangjing/Documents/novels.xlsx";
        if (args != null && args.length > 0 && !args[0].trim().isEmpty()) {
            outputPath = args[0];
        }
        List<Novel> novels = crawlRankList();
        writeNovelsToExcel(novels, outputPath);
    }



    /**
     * Writes the given novels to a new {@code .xlsx} workbook at {@code outputPath}.
     *
     * <p>The workbook has a single sheet with one header row followed by one row per
     * novel, in the iteration order of {@code novelList}. Every value is written as a
     * string cell. A confirmation line is printed to standard output once the file
     * has been written.
     *
     * @param novelList  novels to export
     * @param outputPath path of the workbook file to create
     * @throws RuntimeException wrapping any {@link IOException} raised while writing or
     *                          closing the workbook or the output file
     */
    public static void writeNovelsToExcel(List<Novel> novelList, String outputPath) {
        final String[] headers = {"排名", "作者", "书名", "类型", "进度", "总字数", "投票数", "更新时间", "简介"};

        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet("小说列表");

            // Header row.
            Row headerRow = sheet.createRow(0);
            int headerCol = 0;
            for (String title : headers) {
                headerRow.createCell(headerCol).setCellValue(title);
                headerCol++;
            }

            // One data row per novel; the value order must match the header order above.
            int nextRowIndex = 1;
            for (Novel novel : novelList) {
                Row row = sheet.createRow(nextRowIndex);
                nextRowIndex++;
                String[] values = {
                        novel.getRank(),
                        novel.getAuthor(),
                        novel.getTitle(),
                        novel.getType(),
                        novel.getProgress(),
                        novel.getTotalNUm(),
                        novel.getVotes(),
                        novel.getTime(),
                        novel.getJianjie()
                };
                for (int col = 0; col < values.length; col++) {
                    row.createCell(col).setCellValue(values[col]);
                }
            }

            // Fit each column to its content.
            for (int col = 0; col < headers.length; col++) {
                sheet.autoSizeColumn(col);
            }

            // Persist the workbook to disk.
            try (FileOutputStream fileOut = new FileOutputStream(outputPath)) {
                workbook.write(fileOut);
                System.out.println("Excel文件已生成:" + outputPath);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Fetches the ranking page at {@code TARGET_URL} and parses its ranking table.
     *
     * <p>This is best-effort: if the page cannot be fetched the I/O error is printed and an
     * empty list is returned; if no ranking table is found an empty list is returned. After
     * a successful parse the method pauses for 8 seconds as a crude rate limit. If the
     * thread is interrupted during that pause, the interrupt flag is restored and the rows
     * parsed so far are returned.
     *
     * @return the parsed novels in page order, possibly empty; never {@code null}
     */
    public static List<Novel> crawlRankList() {
        List<Novel> novelList = new ArrayList<>();
        try {
            // 1. Send a browser-like request; replace the user agent with your own if needed.
            Document doc = Jsoup.connect(TARGET_URL)
                    .userAgent("Mozilla/5.0 Version/17.4 Safari")
                    .header("Accept-Language", "zh-CN,zh;q=0.9")
                    .timeout(10_000)
                    .get();

            // 2. Locate the ranking table.
            Element targetTable = findRankTable(doc);
            if (targetTable == null) {
                return novelList;
            }

            // 3. Parse each data row. The selector also matches the header row (its cells
            //    are <td> elements too), so iteration starts at index 1 to skip it.
            Elements rows = targetTable.select("tr:has(td)");
            for (int i = 1; i < rows.size(); i++) {
                Elements cols = rows.get(i).select("td");
                if (cols.size() < 8) {
                    continue; // not a full data row
                }
                novelList.add(parseRow(cols));
            }

            // 4. Pause to keep the request rate low and reduce the risk of an IP ban.
            Thread.sleep(8000);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
        }
        return novelList;
    }

    /**
     * Returns the first layout-matching table whose first cell reads "序号", or {@code null}
     * if there is none.
     */
    private static Element findRankTable(Document doc) {
        Elements tables = doc.select("table[width=984][border=0][align=center][cellpadding=0][cellspacing=1][bgcolor=#009900]");
        for (Element table : tables) {
            Element firstTd = table.selectFirst("td:eq(0)");
            if (firstTd != null && firstTd.text().trim().equals("序号")) {
                return table;
            }
        }
        return null;
    }

    /**
     * Builds a {@code Novel} from one table row; {@code cols} must hold at least 8 cells.
     */
    private static Novel parseRow(Elements cols) {
        Element authorLink = firstLink(cols.get(1));
        String author = authorLink != null ? authorLink.text().trim() : "";

        String bookName = "";
        String synopsis = "";
        Element titleLink = firstLink(cols.get(2));
        if (titleLink != null) {
            bookName = titleLink.text().trim();
            // The synopsis is read from the link's rel attribute; turn every <br> variant
            // (<br>, <br/>, <br />, any letter case) into a real line break.
            synopsis = titleLink.attr("rel").replaceAll("(?i)<br\\s*/?>", "\n");
        }

        return new Novel(
                cols.get(0).text(),
                author,
                bookName,
                cols.get(3).text(),
                cols.get(4).text(),
                cols.get(5).text(),
                cols.get(6).text(),
                cols.get(7).text(),
                synopsis);
    }

    /**
     * Returns the first {@code <a>} element inside {@code cell}, printing a notice and
     * returning {@code null} when the cell contains no link.
     */
    private static Element firstLink(Element cell) {
        Element link = cell.selectFirst("a");
        if (link == null) {
            System.out.println("未找到<a>标签");
        }
        return link;
    }

    /**
     * Mutable data holder for one entry of the ranking table. All values are kept as the
     * raw strings handed to the constructor or setters.
     */
    static class Novel {
        /** Rank number. */
        private String rank;
        /** Author name. */
        private String author;
        /** Title of the work. */
        private String title;
        /** Type / genre. */
        private String type;
        /** Progress of the work. */
        private String progress;
        /** Word count. */
        private String totalNUm;
        /** Score of the work. */
        private String votes;
        /** Time column. */
        private String time;
        /** Synopsis. */
        private String jianjie;

        /**
         * Creates an entry with every field supplied up front.
         */
        public Novel(String rank, String author, String title, String type, String progress,
                     String totalNUm, String votes, String time, String jianjie) {
            this.rank = rank;
            this.author = author;
            this.title = title;
            this.type = type;
            this.progress = progress;
            this.totalNUm = totalNUm;
            this.votes = votes;
            this.time = time;
            this.jianjie = jianjie;
        }

        // ---- rank ----

        public String getRank() {
            return this.rank;
        }

        public void setRank(String rank) {
            this.rank = rank;
        }

        // ---- author ----

        public String getAuthor() {
            return this.author;
        }

        public void setAuthor(String author) {
            this.author = author;
        }

        // ---- title ----

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        // ---- type ----

        public String getType() {
            return this.type;
        }

        public void setType(String type) {
            this.type = type;
        }

        // ---- progress ----

        public String getProgress() {
            return this.progress;
        }

        public void setProgress(String progress) {
            this.progress = progress;
        }

        // ---- word count ----

        public String getTotalNUm() {
            return this.totalNUm;
        }

        public void setTotalNUm(String totalNUm) {
            this.totalNUm = totalNUm;
        }

        // ---- score ----

        public String getVotes() {
            return this.votes;
        }

        public void setVotes(String votes) {
            this.votes = votes;
        }

        // ---- time ----

        public String getTime() {
            return this.time;
        }

        public void setTime(String time) {
            this.time = time;
        }

        // ---- synopsis ----

        public String getJianjie() {
            return this.jianjie;
        }

        public void setJianjie(String jianjie) {
            this.jianjie = jianjie;
        }

        /**
         * Renders every field as {@code name='value'} inside {@code Novel{...}}.
         */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("Novel{");
            text.append("rank='").append(rank).append('\'');
            text.append(", author='").append(author).append('\'');
            text.append(", title='").append(title).append('\'');
            text.append(", type='").append(type).append('\'');
            text.append(", progress='").append(progress).append('\'');
            text.append(", totalNUm='").append(totalNUm).append('\'');
            text.append(", votes='").append(votes).append('\'');
            text.append(", time='").append(time).append('\'');
            text.append(", jianjie='").append(jianjie).append('\'');
            text.append('}');
            return text.toString();
        }
    }

}
相关推荐
Remember_Ray7 分钟前
重学Java基础篇—什么是快速失败(fail-fast)和安全失败(fail-safe)?
java
奋进的小暄7 分钟前
贪心算法(12))(java)坏了的计算器
java·算法·贪心算法
RainbowSea14 分钟前
2. RabbitMQ 的详细安装步骤(两种方式,第一种:yum 安装;第二种:docker 容器安装)
java·消息队列·rabbitmq
小鱼人爱编程23 分钟前
Look My Eyes 最新IDEA快速搭建Java Web工程的两种方式
java·前端·后端
Mr.wangh26 分钟前
SpringBoot 配置⽂件
java·spring boot·spring
RainbowSea26 分钟前
1. 初始 RabbitMQ 消息队列
java·消息队列·rabbitmq
秋野酱38 分钟前
基于javaweb的SpringBoot水果生鲜商城系统设计与实现(源码+文档+部署讲解)
java·spring boot·后端
sunly_1 小时前
Flutter:切换账号功能记录
android·java·flutter
大白曾是少年1 小时前
【Java进阶学习 第十篇】递归和异常
java·笔记·学习
getapi1 小时前
Flutter和React Native在开发app中,哪个对java开发工程师更适合
java·flutter·react native