JAVA爬虫基础 - 技术栈

第一章=>

1、概述

2、项目搭建

3、URL提取

4、内容解析并持久化

第二章=>

5、反爬虫及应对

****************************************************************************************************************************************************************************

复制代码

1、概述
【1】多核爬虫
【2】爬虫生命周期
页面下载。这是爬虫的基础
链接提取。初始URL，新的URL
URL管理。对URL做区分，防止重复爬取
内容抽取及持久化。对爬取页面进行分析，抽取有价值的信息并存储下来。
通用型爬虫：难点是抓取更多的页面
Nutch Heritrix
垂直型爬虫。关注内容、准确率、效率。
难点是：如何高效定制一个爬虫，可以精确的抽取出网页的内容，并保存成结构化的数据。
使用到的技术
模拟浏览器：HttpClient
html解析：jsoup

****************************************************************************************************************************************************************************

复制代码

2、项目搭建
【1】依赖（第4节用到的 FileUtils 还需要额外引入 commons-io 依赖）
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
	<dependency>
		<groupId>org.jsoup</groupId>
		<artifactId>jsoup</artifactId>
		<version>1.15.3</version>
	</dependency>
	<!--http core-->
	<dependency>
		<groupId>org.apache.httpcomponents</groupId>
		<artifactId>httpcore</artifactId>
		<version>4.4.10</version>
	</dependency>
	<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
	<dependency>
		<groupId>org.apache.httpcomponents</groupId>
		<artifactId>httpclient</artifactId>
		<version>4.5.13</version>
	</dependency>

****************************************************************************************************************************************************************************

复制代码

3、URL提取
【1】代码实现
package com.day.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal same-site link crawler.
 *
 * <p>Starting from one seed URL it downloads each page, extracts the {@code href} of every
 * {@code <a>} tag, and keeps following links that stay on the seed's host until no new link is
 * found. Pages are read line by line, so an anchor tag split across several lines is not
 * detected. Relative links are resolved against the host root, not against the page they were
 * found on.
 */
public class UrlPool {
    /** Matches the optional scheme plus the host part at the start of a URL. */
    private static final Pattern HOST_PATTERN = Pattern.compile("(https?://)?[^/\\s]*");

    /** Captures the href value (group 1) of an anchor tag; can match several times per line. */
    private static final Pattern ANCHOR_HREF_PATTERN = Pattern.compile(
            "<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*[\"']?([^\"'\\s>]+)", Pattern.CASE_INSENSITIVE);

    /** Matches links with a scheme other than http/https (javascript:, mailto:, tel:, ...). */
    private static final Pattern OTHER_SCHEME_PATTERN =
            Pattern.compile("^(?!https?:)[a-zA-Z][a-zA-Z0-9+.-]*:");

    /** Connect and read timeout, so one unresponsive page cannot stall the crawl forever. */
    private static final int TIMEOUT_MILLIS = 10_000;

    public static void main(String[] args) {
        getUrl("https://www.nipic.com/");
    }

    /**
     * Crawls the site that {@code url} belongs to and prints every link that was discovered.
     *
     * @param url seed URL, including the scheme
     */
    private static void getUrl(String url) {
        String oldLinkHost = hostOf(url);
        Map<String, Boolean> oldMap = new LinkedHashMap<>();
        // Store the seed in the same normalised form as discovered links; otherwise the home
        // page would be queued a second time under its slash-less spelling.
        oldMap.put(stripTrailingSlash(url), false);
        crawLinks(oldLinkHost, oldMap);
        for (String key : oldMap.keySet()) {
            System.out.println("连接：" + key);
        }
    }

    /**
     * Visits every not-yet-visited link in {@code oldMap}, adds the same-host links found on those
     * pages, and repeats until a full pass discovers nothing new.
     *
     * @param oldLinkHost scheme and host that discovered links must belong to
     * @param oldMap link to visited-flag map in insertion order; updated in place
     * @return {@code oldMap}, with every entry marked as visited
     */
    private static Map<String, Boolean> crawLinks(String oldLinkHost, Map<String, Boolean> oldMap) {
        // Iterative passes instead of recursion: the depth of the site no longer grows the stack.
        while (true) {
            Map<String, Boolean> newMap = new LinkedHashMap<>();
            for (Map.Entry<String, Boolean> entry : oldMap.entrySet()) {
                System.out.println("链接：" + entry.getKey() + "---check：" + entry.getValue());
                if (entry.getValue()) {
                    continue;
                }
                crawlPage(entry.getKey(), oldLinkHost, oldMap, newMap);
                // Mark as visited even after a failure so a broken page is not retried forever.
                entry.setValue(true);
            }
            if (newMap.isEmpty()) {
                return oldMap;
            }
            oldMap.putAll(newMap);
        }
    }

    /** Downloads one page and collects its same-host links into {@code newMap}. */
    private static void crawlPage(String pageLink, String oldLinkHost,
                                  Map<String, Boolean> oldMap, Map<String, Boolean> newMap) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(pageLink).openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return;
            }
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    collectLinks(line, oldLinkHost, oldMap, newMap);
                }
            }
        } catch (IOException | RuntimeException e) {
            // Best effort: a single unreachable or malformed page must not abort the crawl.
            System.err.println("Failed to crawl " + pageLink + ": " + e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Extracts every anchor href from one line of HTML and records the links that belong to
     * {@code oldLinkHost} and are not known yet. Package-private so it can be tested without
     * network access.
     *
     * @param line one line of HTML
     * @param oldLinkHost scheme and host that accepted links must belong to
     * @param oldMap links that are already known; only read
     * @param newMap receives each newly discovered link with the flag {@code false}
     */
    static void collectLinks(String line, String oldLinkHost,
                             Map<String, Boolean> oldMap, Map<String, Boolean> newMap) {
        Matcher matcher = ANCHOR_HREF_PATTERN.matcher(line);
        while (matcher.find()) {
            String newLink = normalizeLink(oldLinkHost, matcher.group(1));
            if (!newLink.isEmpty()
                    && isSameHost(oldLinkHost, newLink)
                    && !oldMap.containsKey(newLink)
                    && !newMap.containsKey(newLink)) {
                newMap.put(newLink, false); // not visited yet
            }
        }
    }

    /** Returns the scheme-plus-host prefix of {@code url}, or an empty string if none matches. */
    private static String hostOf(String url) {
        Matcher matcher = HOST_PATTERN.matcher(url);
        return matcher.find() ? matcher.group() : "";
    }

    /**
     * Turns a raw href into an absolute link without a trailing slash.
     *
     * @return the absolute link, or an empty string for hrefs that cannot be crawled
     *         (empty, fragment-only, or using a scheme other than http/https)
     */
    private static String normalizeLink(String oldLinkHost, String rawLink) {
        String link = rawLink.trim();
        if (link.isEmpty() || link.startsWith("#") || OTHER_SCHEME_PATTERN.matcher(link).find()) {
            return "";
        }
        if (link.startsWith("//")) {
            // Protocol-relative link: reuse the scheme of the site being crawled.
            int schemeEnd = oldLinkHost.indexOf("://");
            String scheme = schemeEnd >= 0 ? oldLinkHost.substring(0, schemeEnd + 1) : "https:";
            link = scheme + link;
        } else if (!link.startsWith("http://") && !link.startsWith("https://")) {
            link = link.startsWith("/") ? oldLinkHost + link : oldLinkHost + "/" + link;
        }
        return stripTrailingSlash(link);
    }

    /** Removes a single trailing slash so both spellings of a URL map to the same key. */
    private static String stripTrailingSlash(String link) {
        return link.endsWith("/") ? link.substring(0, link.length() - 1) : link;
    }

    /**
     * Checks that {@code link} is on {@code host} itself, rejecting look-alike hosts that merely
     * share the prefix (for example {@code https://a.com.evil.org}).
     */
    private static boolean isSameHost(String host, String link) {
        if (!link.startsWith(host)) {
            return false;
        }
        if (link.length() == host.length()) {
            return true;
        }
        char next = link.charAt(host.length());
        return next == '/' || next == '?' || next == '#';
    }
}

****************************************************************************************************************************************************************************

复制代码

4、内容解析并持久化 
【1】卧槽 我竟然成功爬取39张图片！！！！！！！！！！！！！！！！！！！
package com.day.util;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.ByteArrayInputStream;
import java.io.File;

/**
 * Downloads the thumbnail of every work listed on one nipic.com topic page and stores the
 * images in a local directory.
 */
public class ImageCrawl {
    private static final String URL = "https://www.nipic.com/topic/show_27202_1.html";

    /** Browser-like User-Agent sent with the image and HttpClient requests. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36";

    /** Target directory; forward slashes keep the path valid on every operating system. */
    private static final File OUTPUT_DIR = new File("src/main/resources/crawlImages");

    /**
     * Fetches the topic page, then downloads each listed image. A failed image download is
     * reported and skipped so the remaining images are still saved.
     *
     * @throws Exception if the topic page itself cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        Document document = Jsoup.connect(URL).get();
        Elements elements = document.select("li.new-search-works-item"); // class selector
        for (int i = 0; i < elements.size(); i++) {
            Elements imgElements = elements.get(i).select("a > img");
            String imgPath = imgElements.attr("src");
            System.out.println(imgPath);

            // "abs:" makes jsoup resolve the attribute against the page URL, so protocol-relative,
            // relative and already absolute src values all yield a usable absolute URL.
            String imgUrl = imgElements.attr("abs:src");
            String fileName = fileNameOf(imgPath);
            if (imgUrl.isEmpty() || fileName.isEmpty()) {
                continue; // list item without a usable image
            }
            try {
                // A proxy could be configured on this connection with .proxy(...).
                Connection.Response response = Jsoup.connect(imgUrl)
                        .userAgent(USER_AGENT)
                        .ignoreContentType(true)
                        .execute();
                ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(response.bodyAsBytes());
                FileUtils.copyInputStreamToFile(byteArrayInputStream, new File(OUTPUT_DIR, fileName));
            } catch (java.io.IOException e) {
                System.err.println("Failed to download " + imgUrl + ": " + e);
            }
        }
    }

    /**
     * Returns the last path segment of an image URL without its query string, or an empty
     * string when the URL ends with a slash or is empty.
     */
    private static String fileNameOf(String imgPath) {
        int queryStart = imgPath.indexOf('?');
        String path = queryStart >= 0 ? imgPath.substring(0, queryStart) : imgPath;
        return path.substring(path.lastIndexOf('/') + 1);
    }

    /**
     * Alternative way to fetch the topic page with Apache HttpClient; prints the response body.
     * Not called by {@link #main(String[])}.
     */
    private static void apacheHttpClient() throws Exception {
        HttpClient httpClient = HttpClients.createDefault();
        try {
            HttpGet httpGet = new HttpGet(URL);
            // Present the request as coming from a regular browser.
            httpGet.setHeader("User-Agent", USER_AGENT);
            HttpResponse httpResponse = httpClient.execute(httpGet);
            HttpEntity httpEntity = httpResponse.getEntity();
            System.out.println(EntityUtils.toString(httpEntity));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Only the HttpClient interface is in scope here, so release the client's resources
            // through Closeable when the concrete client supports it.
            if (httpClient instanceof java.io.Closeable) {
                ((java.io.Closeable) httpClient).close();
            }
        }
    }
}

****************************************************************************************************************************************************************************

复制代码

5、反爬虫及应对
【1】爬虫访问频次要控制，不要把对方服务器爬崩溃了
【2】涉及到个人隐私的信息不能爬取
【3】突破网站的反爬措施，后果很严重
【4】不要爬取不正当竞争的内容
【5】付费内容，不能抓
【6】最后一条，如果突破了网站反爬措施，代码一定不要上传到网上。
【7】robots.txt 爬虫协议
baidu.com/robots.txt 协议
【8】反爬虫技术及应对策略
Header限制：user-agent、referer、cookie 。缺点：可能影响用户体验
IP限制：限制IP访问频率
账号限制：同一个账号短时间不能多次相同请求
蜜罐限制：在页面中放置 display:none 的隐藏链接，肉眼不可见；爬虫一旦访问到这些链接，就限制其 IP 或账号
数据污染：看到的是99 爬取到的是11
增加爬取难度：ajax动态请求，图片验证码！