Jsoup 爬虫
Java 也能写爬虫!!!
Jsoup重要对象如下:
Document:文档对象,每个html页面都是一个Document对象
Element:元素对象,一个Document对象里有多个Element对象
Node:节点对象,是 Document 和 Element 的共同父类;页面中的文本、注释等内容也以节点形式存储
Jsoup的主要方法如下:
static Connection connect(String url) 创建URL连接
static Document parse(File in, String charsetName) 解析文件为 Document 对象
static Document parse(String html) 解析html代码为 Document 对象
(上面列出的是最常用的入口方法;下面的示例代码主要通过 Document 对象配合 CSS 选择器来提取信息)
爬虫示例(豆瓣)
java
/**
 * Refreshes the proxy pool by running the external Python helper
 * (getIP.py), which fetches and validates free proxies and writes them
 * to ipPool.txt. The helper's output is echoed to stdout.
 */
public void initIPPool() {
    System.out.println("开始获取IP...");
    try {
        // ProcessBuilder avoids the whitespace-splitting pitfalls of
        // Runtime.exec(String); merging stderr makes Python errors visible
        ProcessBuilder pb = new ProcessBuilder("python", "getIP.py");
        pb.redirectErrorStream(true);
        Process proc = pb.start();
        // try-with-resources guarantees the reader is closed on every path
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(proc.getInputStream(),
                        java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
        int exit = proc.waitFor();
        if (exit == 0) {
            System.out.println("成功获取代理IP");
        } else {
            // the original printed the success message unconditionally
            System.out.println("获取代理IP失败, exit code = " + exit);
        }
    } catch (IOException e) {
        System.out.println(e.toString());
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore interrupt status
        System.out.println(e.toString());
    }
}
/**
 * Loads the proxy pool from ipPool.txt (one "host:port" entry per line)
 * into the myIPPool array. The file read is synchronized so concurrent
 * reloads don't interleave.
 */
public void loadIPPool() {
    File file = new File("ipPool.txt");
    List<String> list = new ArrayList<String>();
    synchronized (this) {
        // try-with-resources closes the reader on every path; the original
        // closed it twice (once in the try body and again in finally)
        // NOTE(review): FileReader uses the platform charset — getIP.py
        // writes the file as UTF-8, confirm these match in deployment
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String tempString;
            // read line by line until EOF (readLine returns null)
            while ((tempString = reader.readLine()) != null) {
                list.add(tempString);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    System.out.println(list);
    myIPPool = list.toArray(new String[list.size()]);
    System.out.println("成功载入IP代理池");
}
/**
 * Crawls one page of the Douban search API through a random proxy and,
 * for every movie returned, crawls its detail page.
 *
 * @param start paging offset passed as the API's {@code start} parameter
 * @return the concatenated CSV rows for this page, or "" on failure
 */
public String crawlOnce(Integer start) {
    StringBuilder finalResult = new StringBuilder();
    Random random = new Random();
    // request URL for the JSON search endpoint
    String url =
            "http://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=" +
                    (start);
    HttpGet request = new HttpGet(url);
    // draw random entries from the pool until one has the host:port shape
    // NOTE(review): this spins forever if the pool holds no valid entry
    String proxyIp = myIPPool[random.nextInt(myIPPool.length)];
    while (proxyIp.split(":").length != 2) {
        proxyIp = myIPPool[random.nextInt(myIPPool.length)];
    }
    HttpHost proxy = new HttpHost(proxyIp.split(":")[0],
            Integer.parseInt(proxyIp.split(":")[1]));
    SSLContextBuilder builder = new SSLContextBuilder();
    // trust-all TLS: acceptable for a crawler, never for sensitive traffic
    PoolingHttpClientConnectionManager cm = null;
    SSLConnectionSocketFactory sslsf = null;
    try {
        builder.loadTrustMaterial(null, new TrustStrategy() {
            @Override
            public boolean isTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
                return true;
            }
        });
        sslsf = new SSLConnectionSocketFactory(builder.build(),
                new String[]{"SSLv2Hello", "SSLv3", "TLSv1", "TLSv1.2"},
                null, NoopHostnameVerifier.INSTANCE);
        Registry<ConnectionSocketFactory> registry =
                RegistryBuilder.<ConnectionSocketFactory>create()
                        .register("http", new PlainConnectionSocketFactory())
                        .register("https", sslsf)
                        .build();
        cm = new PoolingHttpClientConnectionManager(registry);
        cm.setMaxTotal(200); // max connections
    } catch (Exception e) {
        System.out.println(e.toString());
        return "";
    }
    // proxy credentials: empty user/password for anonymous proxies
    CredentialsProvider provider = new BasicCredentialsProvider();
    provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials("", ""));
    StringBuilder result = new StringBuilder();
    // try-with-resources closes the client; the original leaked both the
    // httpClient and the response reader
    try (CloseableHttpClient httpClient = HttpClients.custom().setSSLSocketFactory(sslsf)
            .setConnectionManager(cm)
            .setConnectionManagerShared(true)
            .setDefaultCredentialsProvider(provider)
            .build()) {
        RequestConfig config = RequestConfig.custom().setProxy(proxy)
                .setConnectTimeout(CONNECTION_TIME_OUT)
                .setConnectionRequestTimeout(CONNECTION_TIME_OUT)
                .setSocketTimeout(CONNECTION_TIME_OUT).build();
        request.setConfig(config);
        // randomized UA / Cookie to lower the chance of being blocked
        request.addHeader("User-Agent", myUAPool[random.nextInt(myUAPool.length)]);
        request.addHeader("Cookie", myCookies[random.nextInt(myCookies.length)]);
        request.addHeader("Accept-Language", "zh-CN,zh;q=0.9");
        request.addHeader("Sec-Fetch-Mode", "cors");
        request.addHeader("Sec-Fetch-Site", "same-origin");
        HttpResponse response;
        try {
            response = httpClient.execute(request);
        } catch (IOException e) {
            logError(start);
            return "";
        }
        try (BufferedReader rd = new BufferedReader(
                new InputStreamReader(response.getEntity().getContent()))) {
            String line;
            while ((line = rd.readLine()) != null) {
                // an HTML page (rather than JSON) means we were blocked
                if (line.equals("") || line.charAt(0) == '<') {
                    break;
                }
                result.append(line);
            }
        } catch (IOException e) {
            logError(start); // read aborted: still try to parse what we got
        }
    } catch (IOException e) {
        System.out.println(e.toString()); // failure while closing the client
        return "";
    }
    System.out.println((start) + "--result:" + result);
    JSONObject res = JSONObject.parseObject(String.valueOf(result));
    if (res == null || !res.containsKey("data")) {
        logError(start);
        return "";
    }
    JSONArray jsonArray = res.getJSONArray("data");
    for (int i = 0; i < jsonArray.size(); i++) {
        JSONObject jo = jsonArray.getJSONObject(i);
        // follow the detail link of each movie
        finalResult.append(crawlDetails(jo.getString("url")));
    }
    return finalResult.toString();
}
/**
 * Crawls one Douban movie detail page with Jsoup and flattens it into a
 * single CSV row: id, title, year, director, screenwriter, cast, genres,
 * country, language, runtime, rating, 5..1-star percentages, vote count,
 * comment count.
 *
 * @param url the detail-page URL, e.g. https://movie.douban.com/subject/1292052/
 * @return the CSV row terminated by '\n'; may be partial on failure
 */
private String crawlDetails(String url) {
    StringBuilder result = new StringBuilder();
    Random random = new Random();
    try {
        // random "host:port" proxy from the pool
        String proxyIp = myIPPool[random.nextInt(myIPPool.length)];
        Connection con = Jsoup.connect(url)
                .proxy(proxyIp.split(":")[0], Integer.parseInt(proxyIp.split(":")[1]))
                .userAgent(myUAPool[random.nextInt(myUAPool.length)])
                .header("Accept-Language", "zh-CN,zh;q=0.9")
                .header("Cookie", myCookies[random.nextInt(myCookies.length)])
                .timeout(CONNECTION_TIME_OUT); // the original set timeout twice; last one wins
        // execute once and parse the response; the original called
        // execute() AND get(), which issued the same request twice
        Document document = con.execute().parse();
        String info = document.select("#info").text();
        // movie id, cut out of the URL path
        // NOTE(review): substring(33) assumes the exact prefix length of
        // "https://movie.douban.com/subject/" — confirm against real URLs
        result.append(url.substring(33, url.length() - 1));
        // title
        result.append(",").append(document.select("#content > h1 > span:nth-child(1)").text());
        // year
        result.append(",").append(document.select("#content > h1 > span.year").text());
        // director
        result.append(",").append(document.select("#info > span:nth-child(1) > span.attrs > a").text());
        // screenwriter
        result.append(",").append(document.select("#info > span:nth-child(3) > span.attrs").text());
        // cast
        result.append(",").append(document.select("#info > span.actor > span.attrs").text());
        // genres
        result.append(",").append(document.select("[property=v:genre]").text());
        // country/region, sliced out of the "#info" text
        result.append(",").append(info.substring(info.indexOf("制片国家/地区: "), info.indexOf(" 语言:"))
                .substring("制片国家/地区: ".length()));
        // language (bounded by the release date when present)
        if (info.contains(" 上映日期:")) {
            result.append(",").append(info.substring(info.indexOf("语言: "), info.indexOf(" 上映日期:"))
                    .substring("语言: ".length()));
        } else {
            result.append(",").append(info.substring(info.indexOf("语言: ")).substring("语言: ".length()));
        }
        // runtime; the original selected [property=v:genre] here (copy-paste
        // slip) whose elements carry no content attribute
        result.append(",").append(document.select("[property=v:runtime]").attr("content"));
        // rating
        result.append(",").append(
                document.select("#interest_sectl > div > div.rating_self.clearfix > strong").text());
        // per-star percentages, nth-child(1)=5 stars down to nth-child(5)=1 star
        for (int child = 1; child <= 5; child++) {
            result.append(",").append(document.select(
                    "#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-child("
                            + child + ") > span.rating_per").text());
        }
        // number of raters
        result.append(",").append(document.select("[property=v:votes]").text());
        // number of comments
        result.append(",").append(document.select("#comments-section > div.mod-hd > h2 > span > a").text());
        System.out.println(proxyIp + " " + result);
    } catch (IOException e) {
        System.out.println(e.toString());
    }
    return result.append("\n").toString();
}
获取代理IP的代码
python
# coding=UTF-8
import requests
import json
class FreeIP():
    """Fetches a public proxy list, keeps the high-anonymity HTTPS entries
    that actually work, and writes them one "host:port" per line to
    ipPool.txt (consumed by the Java crawler's loadIPPool)."""

    def __init__(self):
        # free-proxy list endpoint
        self.url = "http://proxylist.fatezero.org/proxy.list"
        self.headers = {
            "User-Agent": "这里改为浏览器的useragent"}

    def check_ip(self, ip_list):
        """Return the "host:port" strings from ip_list that work as HTTPS proxies."""
        correct_ip = []
        for ip in ip_list:
            if len(correct_ip) > 10:  # cap the pool size; adjust or remove as needed
                break
            ip_port = "{}:{}".format(ip["host"], ip["port"])
            proxies = {'https': ip_port}
            try:
                # if the echo service reports the proxy's own IP, the proxy works
                response = requests.get('https://icanhazip.com/', proxies=proxies,
                                        timeout=3).text
                if response.strip() == ip["host"]:
                    correct_ip.append(ip_port)
            except requests.RequestException:
                # proxy dead or too slow — skip it (the original's bare
                # except swallowed every exception type)
                continue
        return correct_ip

    def run(self):
        """Download the raw list, parse it, validate, and persist the pool."""
        # the original defined self.headers but never sent it; pass it here
        response = requests.get(url=self.url, headers=self.headers).content.decode()
        ip_list = []
        for proxy_str in response.split('\n'):
            # one JSON object per line; skip malformed or incomplete entries
            try:
                proxy_json = json.loads(proxy_str)
                if proxy_json["anonymity"] == "high_anonymous" and proxy_json["type"] == "https":
                    ip_list.append({"host": proxy_json['host'],
                                    "port": proxy_json['port']})
            except (json.JSONDecodeError, KeyError):
                continue
        # NOTE(review): in the pasted original the validation/write step sat
        # inside the except block (indentation was lost); it must run exactly
        # once, after the whole list has been parsed.
        correct_ip = self.check_ip(ip_list)
        with open('ipPool.txt', mode='w', encoding='utf-8') as file_obj:
            for i in correct_ip:
                file_obj.write(i + "\n")
if __name__ == '__main__':
    # entry point: fetch, validate and persist the proxy pool
    proxy_fetcher = FreeIP()
    proxy_fetcher.run()