Java开发笔记Ⅱ(Jsoup爬虫)

Jsoup 爬虫

Java 也能写爬虫!!!

Jsoup重要对象如下:

Document:文档对象,每个html页面都是一个Document对象

Element:元素对象,一个Document对象里有多个Element对象

Node:节点对象,用于存储数据,标签名称、属性都是节点对象

Jsoup的主要方法如下:

static Connection connect(String url) 创建URL连接

static Document parse(File in, String charsetName) 解析文件为 Document 对象

static Document parse(String html) 解析html代码为 Document 对象

(以上是 Jsoup 最主要的几个入口方法;不过下面的示例代码主要是通过 Document 对象配合 CSS 选择器来提取页面信息的。)

爬虫示例(豆瓣)

Java 示例代码:
   
    /**
     * Refreshes the proxy list by running the external {@code getIP.py} script and echoing
     * everything it prints to standard output.
     *
     * <p>The script is started through the {@code python} executable. Failures (script not
     * startable, non-zero exit code, interruption) are reported on standard output; nothing
     * is thrown to the caller.
     */
    public void initIPPool() {
        System.out.println("开始获取IP...");
        // Argument-list form avoids Runtime.exec(String)'s whitespace tokenizing. Merging
        // stderr into stdout guarantees the child can never block on an undrained error pipe.
        ProcessBuilder launcher = new ProcessBuilder("python", "getIP.py");
        launcher.redirectErrorStream(true);
        try {
            Process proc = launcher.start();
            try (BufferedReader in =
                         new BufferedReader(new InputStreamReader(proc.getInputStream()))) {
                String line;
                while ((line = in.readLine()) != null) {
                    System.out.println(line);
                }
            }
            int exitCode = proc.waitFor();
            // Only claim success when the script actually finished cleanly.
            if (exitCode == 0) {
                System.out.println("成功获取代理IP");
            } else {
                System.out.println("获取代理IP失败,退出码: " + exitCode);
            }
        } catch (InterruptedException e) {
            // Preserve the interrupt for whoever owns this thread.
            Thread.currentThread().interrupt();
            System.out.println(e.toString());
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }
    
    /**
     * Loads the proxy entries stored in {@code ipPool.txt} into {@code myIPPool}.
     *
     * <p>The file is read line by line; every non-blank line becomes one entry (surrounding
     * whitespace removed). If the file cannot be read, the stack trace is printed and
     * {@code myIPPool} is set to whatever was read before the failure (possibly an empty
     * array), and a failure message is printed instead of the success message.
     */
    public void loadIPPool() {
        File file = new File("ipPool.txt");
        List<String> list = new ArrayList<String>();
        boolean loaded = false;
        // The read stays under the instance lock as before; the assignment to myIPPool
        // below is intentionally left where the original had it (outside the lock).
        synchronized (this) {
            // try-with-resources closes the reader exactly once on every path.
            try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String entry = line.trim();
                    // Blank lines are not proxies; keeping them would poison random selection.
                    if (!entry.isEmpty()) {
                        list.add(entry);
                    }
                }
                loaded = true;
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println(list);
        myIPPool = list.toArray(new String[0]);
        if (loaded) {
            System.out.println("成功载入IP代理池");
        } else {
            System.out.println("载入IP代理池失败");
        }
    }

    /**
     * Requests one page of the Douban search listing through a randomly chosen proxy and
     * crawls the detail page of every movie in the response.
     *
     * <p>The response body is expected to be JSON with a {@code data} array whose elements
     * carry a {@code url} field. If the server answers with an HTML page instead, reading
     * stops at the first line that is empty or starts with {@code '<'}.
     *
     * @param start value appended as the {@code start} query parameter of the listing URL
     * @return the concatenated {@link #crawlDetails(String)} results for all listed movies,
     *         or an empty string when no usable proxy exists, the request fails, or the
     *         response does not contain a {@code data} array (in which cases
     *         {@code logError(start)} is called, except for an SSL set-up failure)
     */
    public String crawlOnce(Integer start) {
        Random random = new Random();
        HttpHost proxy = selectRandomProxy(random);
        if (proxy == null) {
            // Previously an endless loop (no host:port entry) or an exception (empty pool).
            System.out.println("代理池中没有可用的代理IP");
            logError(start);
            return "";
        }

        // Request address
        String url =
                "http://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=" +
                start;
        HttpGet request = new HttpGet(url);

        SSLConnectionSocketFactory sslsf;
        PoolingHttpClientConnectionManager cm;
        try {
            // NOTE(review): every certificate is trusted and host names are not verified;
            // the protocol list also still names legacy SSL versions. Kept as in the
            // original, but this is only acceptable for throwaway scraping.
            SSLContextBuilder builder = new SSLContextBuilder();
            builder.loadTrustMaterial(null, new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] x509Certificates, String s)
                        throws CertificateException {
                    return true;
                }
            });
            sslsf = new SSLConnectionSocketFactory(
                    builder.build(),
                    new String[]{"SSLv2Hello", "SSLv3", "TLSv1", "TLSv1.2"},
                    null,
                    NoopHostnameVerifier.INSTANCE);
            Registry<ConnectionSocketFactory> registry =
                    RegistryBuilder.<ConnectionSocketFactory>create()
                                   .register("http", new PlainConnectionSocketFactory())
                                   .register("https", sslsf)
                                   .build();
            cm = new PoolingHttpClientConnectionManager(registry);
            cm.setMaxTotal(200); // max connections
        } catch (Exception e) {
            System.out.println(e.toString());
            return "";
        }

        // Proxy authentication: the scope is the proxy host; user name and password are
        // left empty for proxies that do not require credentials.
        CredentialsProvider provider = new BasicCredentialsProvider();
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials("", ""));

        RequestConfig config = RequestConfig.custom().setProxy(proxy)
                                            .setConnectTimeout(CONNECTION_TIME_OUT)
                                            .setConnectionRequestTimeout(CONNECTION_TIME_OUT)
                                            .setSocketTimeout(CONNECTION_TIME_OUT).build();
        request.setConfig(config);
        // Request headers
        request.addHeader("User-Agent", myUAPool[random.nextInt(myUAPool.length)]);
        request.addHeader("Cookie", myCookies[random.nextInt(myCookies.length)]);
        request.addHeader("Accept-Language", "zh-CN,zh;q=0.9");
        request.addHeader("Sec-Fetch-Mode", "cors");
        request.addHeader("Sec-Fetch-Site", "same-origin");

        StringBuilder result = new StringBuilder();
        // The connection manager is created per call and marked as shared, so closing the
        // client alone would not release it; both are closed explicitly here.
        try (PoolingHttpClientConnectionManager pool = cm;
             CloseableHttpClient httpClient = HttpClients.custom().setSSLSocketFactory(sslsf)
                                                         .setConnectionManager(pool)
                                                         .setConnectionManagerShared(true)
                                                         .setDefaultCredentialsProvider(provider)
                                                         .build()) {
            HttpResponse response = httpClient.execute(request);
            if (response.getEntity() == null) {
                logError(start);
                return "";
            }
            // Decode explicitly as UTF-8 instead of the platform default charset.
            try (BufferedReader rd = new BufferedReader(new InputStreamReader(
                    response.getEntity().getContent(),
                    java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = rd.readLine()) != null) {
                    // The request returned an HTML page instead of JSON
                    if (line.equals("") || line.charAt(0) == '<') {
                        break;
                    }
                    result.append(line);
                }
            }
        } catch (IOException e) {
            logError(start);
            return "";
        }

        System.out.println((start) + "--result:" + result);
        JSONObject res;
        try {
            res = JSONObject.parseObject(result.toString());
        } catch (RuntimeException e) {
            // The parser reports malformed input with an unchecked exception; treat it
            // like any other unusable response instead of aborting the whole crawl.
            System.out.println(e.toString());
            logError(start);
            return "";
        }
        if (res == null || !res.containsKey("data")) {
            logError(start);
            return "";
        }
        JSONArray jsonArray = res.getJSONArray("data");
        if (jsonArray == null) {
            logError(start);
            return "";
        }

        StringBuilder finalResult = new StringBuilder();
        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject jo = jsonArray.getJSONObject(i);
            String detailUrl = jo == null ? null : jo.getString("url");
            if (detailUrl == null) {
                continue;
            }
            // Crawl the movie details through its detail link
            finalResult.append(crawlDetails(detailUrl));
        }
        return finalResult.toString();
    }

    /**
     * Picks one well-formed entry of {@code myIPPool} uniformly at random.
     *
     * @return the chosen proxy, or {@code null} when the pool is missing, empty, or holds
     *         no entry that parses as {@code host:port}
     */
    private HttpHost selectRandomProxy(Random random) {
        if (myIPPool == null) {
            return null;
        }
        HttpHost chosen = null;
        int usable = 0;
        for (String entry : myIPPool) {
            HttpHost candidate = parseProxy(entry);
            // Reservoir sampling: the k-th usable entry replaces the pick with chance 1/k,
            // which gives every usable entry the same probability in a single pass.
            if (candidate != null && random.nextInt(++usable) == 0) {
                chosen = candidate;
            }
        }
        return chosen;
    }

    /** Parses a {@code host:port} entry; returns {@code null} when it is malformed. */
    private HttpHost parseProxy(String entry) {
        if (entry == null) {
            return null;
        }
        String[] parts = entry.trim().split(":");
        if (parts.length != 2 || parts[0].trim().isEmpty()) {
            return null;
        }
        try {
            return new HttpHost(parts[0].trim(), Integer.parseInt(parts[1].trim()));
        } catch (IllegalArgumentException e) {
            // Covers a non-numeric port (NumberFormatException) and a rejected host name.
            return null;
        }
    }

    /**
     * Crawls one movie detail page through a random proxy and flattens it into a single
     * comma-separated line.
     *
     * <p>Column order: ID, title, year, director, screenwriter, cast, genres,
     * country/region, language, runtime, rating, share of 5/4/3/2/1-star ratings,
     * number of votes, number of comments.
     *
     * <p>NOTE(review): values are not escaped, so a comma inside any field shifts the
     * following columns.
     *
     * @param url detail page URL; the ID column is the path segment after {@code /subject/}
     * @return the line terminated by {@code "\n"}; just {@code "\n"} when the page could
     *         not be fetched or no usable proxy exists
     */
    private String crawlDetails(String url) {
        StringBuilder row = new StringBuilder();
        Random random = new Random();
        String[] hostPort = pickHostAndPort(random);
        if (url == null || hostPort == null) {
            System.out.println("代理池中没有可用的代理IP");
            return row + "\n";
        }
        try {
            // myUAPool can simply hold a few hand-written browser user-agent strings
            Connection con = Jsoup.connect(url)
                                  .proxy(hostPort[0], Integer.parseInt(hostPort[1]))
                                  .userAgent(myUAPool[random.nextInt(myUAPool.length)])
                                  .header("Accept-Language", "zh-CN,zh;q=0.9")
                                  .header("Cookie", myCookies[random.nextInt(myCookies.length)])
                                  .timeout(CONNECTION_TIME_OUT); // connection timeout
            // A single get() both executes the request and parses the page; calling
            // execute() first would download the same page twice.
            Document document = con.get();
            String info = document.select("#info").text();

            // ID
            row.append(subjectIdOf(url));
            // Title
            row.append(",").append(document.select("#content > h1 > span:nth-child(1)").text());
            // Year
            row.append(",").append(document.select("#content > h1 > span.year").text());
            // Director
            row.append(",")
               .append(document.select("#info > span:nth-child(1) > span.attrs > a").text());
            // Screenwriter
            row.append(",")
               .append(document.select("#info > span:nth-child(3) > span.attrs").text());
            // Cast
            row.append(",").append(document.select("#info > span.actor > span.attrs").text());
            // Genres
            row.append(",").append(document.select("[property=v:genre]").text());
            // Country / region: text between its label and the language label
            row.append(",").append(textBetween(info, "制片国家/地区: ", " 语言:"));
            // Language: up to the release-date label, or to the end when that is absent
            row.append(",").append(textBetween(info, "语言: ", " 上映日期:"));
            // Runtime: read from the runtime element (the genre element has no content)
            row.append(",").append(document.select("[property=v:runtime]").attr("content"));
            // Rating
            row.append(",")
               .append(document.select(
                       "#interest_sectl > div > div.rating_self.clearfix > strong").text());
            // Share of 5-, 4-, 3-, 2- and 1-star ratings (rows 1..5 of the histogram)
            for (int rowIndex = 1; rowIndex <= 5; rowIndex++) {
                row.append(",")
                   .append(document.select(
                           "#interest_sectl > div.rating_wrap.clearbox > div"
                           + ".ratings-on-weight > div:nth-child(" + rowIndex + ") > span"
                           + ".rating_per").text());
            }
            // Number of votes
            row.append(",").append(document.select("[property=v:votes]").text());
            // Number of comments
            row.append(",")
               .append(document.select("#comments-section > div.mod-hd > h2 > span > a").text());
            System.out.println(hostPort[0] + ":" + hostPort[1] + " " + row);
        } catch (IOException e) {
            System.out.println(e.toString());
        }
        return row + "\n";
    }

    /**
     * Picks one well-formed {@code host:port} entry of {@code myIPPool} uniformly at random.
     *
     * @return a two-element array {host, port}, or {@code null} when no entry is usable
     */
    private String[] pickHostAndPort(Random random) {
        if (myIPPool == null) {
            return null;
        }
        String[] chosen = null;
        int usable = 0;
        for (String entry : myIPPool) {
            if (entry == null) {
                continue;
            }
            String[] parts = entry.trim().split(":");
            if (parts.length != 2 || parts[0].trim().isEmpty()) {
                continue;
            }
            int port;
            try {
                port = Integer.parseInt(parts[1].trim());
            } catch (NumberFormatException e) {
                continue;
            }
            if (port <= 0 || port > 65535) {
                continue;
            }
            // Reservoir sampling: the k-th usable entry replaces the pick with chance 1/k.
            if (random.nextInt(++usable) == 0) {
                chosen = new String[]{parts[0].trim(), String.valueOf(port)};
            }
        }
        return chosen;
    }

    /**
     * Returns the path segment following {@code /subject/} in {@code url}, or an empty
     * string when the URL has no such segment.
     */
    private String subjectIdOf(String url) {
        String marker = "/subject/";
        int markerAt = url.indexOf(marker);
        if (markerAt < 0) {
            return "";
        }
        int from = markerAt + marker.length();
        int to = url.indexOf('/', from);
        return to < 0 ? url.substring(from) : url.substring(from, to);
    }

    /**
     * Returns the part of {@code text} after {@code label} and before the next
     * {@code endMarker}; runs to the end of the text when the end marker does not follow,
     * and yields an empty string when the label is absent.
     */
    private String textBetween(String text, String label, String endMarker) {
        int labelAt = text.indexOf(label);
        if (labelAt < 0) {
            return "";
        }
        int from = labelAt + label.length();
        int to = text.indexOf(endMarker, from);
        return to < 0 ? text.substring(from) : text.substring(from, to);
    }

获取代理IP的代码

Python 代码(保存为 getIP.py):
# coding=UTF-8

import requests
import json


class FreeIP():
    """Collects free HTTPS proxies, keeps the ones that work and saves them to a file.

    ``run`` downloads a proxy list (one JSON object per line), filters it down to
    highly anonymous HTTPS proxies, verifies each candidate with ``check_ip`` and
    writes the surviving ``host:port`` entries to ``ipPool.txt``, one per line.
    """

    def __init__(self):
        # Source of the proxy list
        self.url = "http://proxylist.fatezero.org/proxy.list"
        # NOTE: not sent with any request made by this class; the placeholder value
        # must be replaced by a real browser user agent before it can be used as a header.
        self.headers = {
            "User-Agent": "这里改为浏览器的useragent"}

    def check_ip(self, ip_list):
        """Return the ``host:port`` strings of the proxies in ``ip_list`` that work.

        Args:
            ip_list: iterable of dicts with ``"host"`` and ``"port"`` keys.

        Returns:
            List of ``"host:port"`` strings. Checking stops once more than 10
            working proxies have been collected.
        """
        correct_ip = []
        for ip in ip_list:
            if len(correct_ip) > 10:  # adjust or remove this cap as needed
                break
            ip_port = "{}:{}".format(ip["host"], ip["port"])
            proxies = {'https': ip_port}
            try:
                # The proxy counts as working when the echo service reports the
                # proxy's own address as the caller's IP.
                response = requests.get('https://icanhazip.com/', proxies=proxies,
                                        timeout=3).text  # timeout can be tuned
            except Exception:
                # Dead or misbehaving proxy: best effort, move on to the next one.
                continue
            if response.strip() == ip["host"]:
                correct_ip.append(ip_port)
        return correct_ip

    def _parse_proxy_list(self, text):
        """Extract highly anonymous HTTPS proxies from a one-JSON-object-per-line list."""
        ip_list = []
        for proxy_str in text.splitlines():
            if not proxy_str.strip():
                continue
            try:
                proxy_json = json.loads(proxy_str)
                if proxy_json["anonymity"] == "high_anonymous" and proxy_json["type"] == "https":
                    ip_list.append({"host": proxy_json['host'], "port": proxy_json['port']})
            except (ValueError, KeyError, TypeError):
                # Line is not valid JSON or lacks the expected fields: skip it.
                continue
        return ip_list

    def run(self):
        """Download the proxy list, verify the candidates and write ``ipPool.txt``."""
        # A timeout keeps a stalled list server from hanging the script forever.
        response = requests.get(url=self.url, timeout=10).content.decode()

        ip_list = self._parse_proxy_list(response)

        correct_ip = self.check_ip(ip_list)
        file_path = 'ipPool.txt'
        # Overwrite the pool file with the verified proxies, one per line
        with open(file_path, mode='w', encoding='utf-8') as file_obj:
            for i in correct_ip:
                file_obj.write(i + "\n")

if __name__ == '__main__':
    # Script entry point: build the collector and run one refresh of ipPool.txt.
    FreeIP().run()
相关推荐
blammmp6 分钟前
Java:数据结构-枚举
java·开发语言·数据结构
暗黑起源喵24 分钟前
设计模式-工厂设计模式
java·开发语言·设计模式
鸭鸭梨吖28 分钟前
产品经理笔记
笔记·产品经理
WaaTong29 分钟前
Java反射
java·开发语言·反射
齐 飞1 小时前
MongoDB笔记01-概念与安装
前端·数据库·笔记·后端·mongodb
九圣残炎1 小时前
【从零开始的LeetCode-算法】1456. 定长子串中元音的最大数目
java·算法·leetcode
wclass-zhengge1 小时前
Netty篇(入门编程)
java·linux·服务器
丫头,冲鸭!!!1 小时前
B树(B-Tree)和B+树(B+ Tree)
笔记·算法
Re.不晚2 小时前
Java入门15——抽象类
java·开发语言·学习·算法·intellij-idea
oliveira-time2 小时前
爬虫学习6
爬虫