爬虫基础

maven pom

复制代码
<dependencies>

    <!-- jsoup: HTML parser with a jQuery-like CSS selector API -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.16.1</version>
    </dependency>

    <!-- Apache HttpComponents: HTTP client utilities -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.16</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.14</version>
    </dependency>

    <!-- Apache Commons IO: file helpers (FileUtils is used by the image downloader) -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.13.0</version>
    </dependency>
</dependencies>

====================================

遍历网站页面,爬取站内链接

复制代码
package com.xiaocao;

import com.sun.org.apache.bcel.internal.generic.NEW;
import com.sun.org.apache.regexp.internal.RE;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal same-site link crawler.
 *
 * <p>Starting from a seed URL, each page is fetched with {@link HttpURLConnection}, scanned line by
 * line for {@code <a href="...">} tags, and every link that belongs to the seed's host is queued
 * for a later pass. The result is an insertion-ordered map of URL to "already fetched" flag.
 *
 * <p>The pure string helpers are package-private (rather than private) so they can be unit-tested
 * without network access.
 */
public class UrlPool {

    /** Optional scheme plus authority of a URL; stops before the first path slash. */
    private static final Pattern HOST_PATTERN = Pattern.compile("(https?://)?[^/\\s]*");

    /** Anchor tag with a double-quoted href; group 1 is the raw href value. */
    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a\\b[^>]+\\bhref=\"([^\"]*)\"[^>]*>([\\s\\S]*?)</a>");

    /** href prefixes that never lead to a fetchable page. */
    private static final String[] NON_NAVIGABLE_PREFIXES = {"#", "javascript:", "mailto:", "tel:"};

    /** Connect/read timeout so one unresponsive server cannot stall the whole crawl. */
    private static final int TIMEOUT_MILLIS = 10_000;

    public static void main(String[] args) {
        // Home page used as the crawl seed.
        getUrl("https://www.nipic.com/");
    }

    /**
     * Crawls every same-site page reachable from {@code baseUrl} and prints each URL found.
     *
     * @param baseUrl seed URL; its scheme and host define which links count as "same site"
     */
    private static void getUrl(String baseUrl) {
        String host = extractHost(baseUrl);
        Map<String, Boolean> links = new LinkedHashMap<>();
        // Seed in the same normalized form as discovered links so the home page is not crawled twice.
        links.put(stripTrailingSlash(baseUrl), false);
        links = crawlLinks(host, links);
        for (String link : links.keySet()) {
            System.out.println("连接:" + link);
        }
    }

    /**
     * Fetches every not-yet-visited URL in {@code oldMap}, adding newly discovered same-site links
     * until a full pass finds nothing new.
     *
     * <p>Implemented as a loop over passes instead of recursion, so stack depth does not grow with
     * the link depth of the site.
     *
     * @param oldLinkHost scheme and host (no trailing slash) that links must belong to
     * @param oldMap      URL to visited flag; mutated in place and also returned
     * @return {@code oldMap}, with every entry marked visited
     */
    private static Map<String, Boolean> crawlLinks(String oldLinkHost, Map<String, Boolean> oldMap) {
        boolean discoveredAny = true;
        while (discoveredAny) {
            // Links found during this pass are buffered: adding keys to oldMap while iterating it
            // would throw ConcurrentModificationException.
            Map<String, Boolean> discovered = new LinkedHashMap<>();
            for (Map.Entry<String, Boolean> entry : oldMap.entrySet()) {
                if (entry.getValue()) {
                    continue;
                }
                String pageUrl = entry.getKey();
                System.out.println(pageUrl + "连接有参数:" + pageUrl);
                try {
                    collectLinksFromPage(pageUrl, oldLinkHost, oldMap, discovered);
                } catch (Exception e) {
                    // Best-effort crawl: one bad page must not abort the run, but report it.
                    System.err.println("Failed to crawl " + pageUrl + ": " + e);
                } finally {
                    // Mark visited even on failure so a broken page is not retried forever.
                    // setValue is not a structural modification, so it is safe mid-iteration.
                    entry.setValue(true);
                }
            }
            oldMap.putAll(discovered);
            discoveredAny = !discovered.isEmpty();
        }
        return oldMap;
    }

    /**
     * Downloads one page and records its same-site links.
     *
     * @throws java.io.IOException if the page cannot be fetched or read
     */
    private static void collectLinksFromPage(String pageUrl, String host,
            Map<String, Boolean> known, Map<String, Boolean> discovered)
            throws java.io.IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(pageUrl).openConnection();
        try {
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            connection.setRequestMethod("GET");
            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return;
            }
            // Explicit charset keeps decoding independent of the platform default; hrefs are
            // ASCII, so link extraction is unaffected even if the page uses another encoding.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    collectLinksFromLine(host, line, known, discovered);
                }
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Extracts every anchor href from one line of HTML and adds the new same-site ones to
     * {@code discovered}.
     *
     * @param host       scheme and host that links must belong to
     * @param line       one line of HTML source
     * @param known      links already recorded in earlier passes (read only)
     * @param discovered receives normalized links not present in either map, flagged unvisited
     */
    static void collectLinksFromLine(String host, String line,
            Map<String, Boolean> known, Map<String, Boolean> discovered) {
        Matcher matcher = ANCHOR_PATTERN.matcher(line);
        // A single line may hold many anchors (minified HTML), so consume all matches.
        while (matcher.find()) {
            String href = matcher.group(1).trim();
            if (!isNavigable(href)) {
                continue;
            }
            String link = resolveLink(host, href);
            if (isSameSite(host, link) && !known.containsKey(link) && !discovered.containsKey(link)) {
                discovered.put(link, false);
            }
        }
    }

    /**
     * Returns the scheme and host of {@code url} without any path, e.g.
     * {@code "https://example.com/a/b"} gives {@code "https://example.com"}.
     */
    static String extractHost(String url) {
        Matcher matcher = HOST_PATTERN.matcher(url);
        return matcher.find() ? matcher.group() : "";
    }

    /**
     * Turns an href into an absolute, normalized URL. Relative hrefs are resolved against the
     * host root (not against the current page's directory); one trailing slash is dropped.
     */
    static String resolveLink(String host, String href) {
        String link;
        if (href.startsWith("http")) {
            link = href;
        } else if (href.startsWith("//")) {
            // Protocol-relative URL: reuse the scheme of the site being crawled.
            link = (host.startsWith("https") ? "https:" : "http:") + href;
        } else if (href.startsWith("/")) {
            link = host + href;
        } else {
            link = host + "/" + href;
        }
        return stripTrailingSlash(link);
    }

    /** Removes a single trailing '/' so "x/" and "x" are treated as the same URL. */
    private static String stripTrailingSlash(String link) {
        return link.endsWith("/") ? link.substring(0, link.length() - 1) : link;
    }

    /** Returns false for fragment-only, script, mail and phone hrefs. */
    private static boolean isNavigable(String href) {
        for (String prefix : NON_NAVIGABLE_PREFIXES) {
            if (href.regionMatches(true, 0, prefix, 0, prefix.length())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true when {@code link} is on {@code host}. A bare prefix test is not enough:
     * "https://a.com.evil.com" starts with "https://a.com", so the character right after the
     * host must end the authority.
     */
    private static boolean isSameSite(String host, String link) {
        if (!link.startsWith(host)) {
            return false;
        }
        if (link.length() == host.length()) {
            return true;
        }
        char next = link.charAt(host.length());
        return next == '/' || next == '?' || next == '#';
    }
}

==============

下载网页中的图片

复制代码
package com.xiaocao;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sun.net.www.http.HttpClient;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpCookie;

/**
 * Downloads the image selected by {@code .newdetail-skin #J_worksImg} from a detail page and
 * saves it under {@link #OUTPUT_DIR}, using the image's {@code alt} text as the file name.
 */
public class ImageCraw {

    private static String url = "https://xxx";

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0";

    /** Directory the downloaded image is written to. */
    private static final String OUTPUT_DIR = "F:\\filetest\\";

    /** CSS selector of the image element to download. */
    private static final String IMAGE_SELECTOR = ".newdetail-skin #J_worksImg";

    public static void main(String[] args) {
        try {
            Document document = Jsoup.connect(url).userAgent(USER_AGENT).get();

            Element image = document.select(IMAGE_SELECTOR).first();
            if (image == null) {
                System.err.println("No element matches " + IMAGE_SELECTOR + " on " + url);
                return;
            }

            // absUrl resolves protocol-relative ("//host/x") and relative values against the
            // page URL, and leaves absolute URLs untouched; it returns "" when src is missing.
            String imageUrl = image.absUrl("src");
            if (imageUrl.isEmpty()) {
                System.err.println("Image element has no usable src attribute");
                return;
            }

            Connection.Response response = Jsoup.connect(imageUrl)
                    .userAgent(USER_AGENT)
                    // The body is binary image data, not HTML.
                    .ignoreContentType(true)
                    .execute();

            String name = image.attr("alt");
            System.out.println(name);
            File target = new File(OUTPUT_DIR + toSafeFileName(name) + ".jpg");
            FileUtils.writeByteArrayToFile(target, response.bodyAsBytes());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Makes page-supplied text safe to use as a single file name: characters that are illegal
     * in Windows file names or act as path separators become '_', so the alt text cannot make
     * the write fail or land outside the output directory.
     *
     * @param rawName text taken from the page, possibly empty
     * @return a non-empty file name without path separators
     */
    private static String toSafeFileName(String rawName) {
        String cleaned = rawName.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        return cleaned.isEmpty() ? "image" : cleaned;
    }

    /**
     * Alternative fetch using Apache HttpClient: prints the page body to standard output.
     * Not called by {@link #main}; kept as a reference implementation.
     */
    private static void apacheHttpClient() {
        HttpGet httpGet = new HttpGet(url);
        // Present a browser User-Agent.
        httpGet.setHeader("User-Agent", USER_AGENT);

        // Both the client and the response hold pooled connections and must be closed.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // UTF-8 is only the fallback when the response declares no charset.
                System.out.println(EntityUtils.toString(entity, "UTF-8"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
相关推荐
CoderYanger2 天前
优选算法-优先级队列(堆):75.数据流中的第K大元素
java·开发语言·算法·leetcode·职场和发展·1024程序员节
CoderYanger2 天前
C.滑动窗口——1423. 可获得的最大点数
java·开发语言·算法·leetcode·1024程序员节
CoderYanger2 天前
优选算法-队列+宽搜(BFS):72.二叉树的最大宽度
java·开发语言·算法·leetcode·职场和发展·宽度优先·1024程序员节
CoderYanger3 天前
优选算法-字符串:63.二进制求和
java·开发语言·算法·leetcode·职场和发展·1024程序员节
CoderYanger3 天前
优选算法-栈:67.基本计算器Ⅱ
java·开发语言·算法·leetcode·职场和发展·1024程序员节
讨厌下雨的天空3 天前
网络基础
网络·1024程序员节
金融小师妹4 天前
基于NLP语义解析的联储政策信号:强化学习框架下的12月降息概率回升动态建模
大数据·人工智能·深度学习·1024程序员节
mjhcsp4 天前
C++ 动态规划(Dynamic Programming)详解:从理论到实战
c++·动态规划·1024程序员节
金融小师妹6 天前
基于机器学习框架的上周行情复盘:非农数据与美联储政策信号的AI驱动解析
大数据·人工智能·深度学习·1024程序员节
渣渣盟6 天前
Flink分布式文件Sink实战解析
分布式·flink·scala·1024程序员节