爬虫基础

maven pom

复制代码
<dependencies>

    <!--jsoup：HTML 解析器，提供 jQuery 风格的选择器语法-->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.16.1</version>
    </dependency>

    <!--http工具-->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.16</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.14</version>
    </dependency>

    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.13.0</version>
    </dependency>
</dependencies>

====================================

遍历网站内容爬取网站网址

复制代码
package com.xiaocao;

import com.sun.org.apache.bcel.internal.generic.NEW;
import com.sun.org.apache.regexp.internal.RE;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlPool {

    public static void main(String[] args) {
        /* Entry point: crawl starting from the site's home page. */
        getUrl("https://www.nipic.com/");
    }

    /**
     * Crawls every same-host link reachable from {@code baseUrl} and prints
     * the collected URLs to stdout.
     *
     * @param baseUrl absolute URL of the start page
     */
    private static void getUrl(String baseUrl) {
        Map<String, Boolean> oldMap = new LinkedHashMap<>();
        /* Host prefix later used to resolve relative links. */
        String oldLinkHost = "";
        Pattern p = Pattern.compile("(https?://)?[^\\s]*");
        Matcher m = p.matcher(baseUrl);
        if (m.find()) {
            oldLinkHost = m.group();
        }
        oldMap.put(baseUrl, false);
        oldMap = crawlLinks(oldLinkHost, oldMap);
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            System.out.println("连接:" + mapping.getKey());
        }
    }

    /**
     * Visits every unvisited URL in {@code oldMap} (value == false), extracts
     * all same-host anchor hrefs from the response body, and recurses until no
     * new links are discovered. Visited entries are flipped to {@code true}.
     *
     * @param oldLinkHost host prefix; only links starting with it are kept
     * @param oldMap      url -> visited flag accumulator
     * @return the accumulator containing every discovered same-host URL
     */
    private static Map<String, Boolean> crawlLinks(String oldLinkHost, Map<String, Boolean> oldMap) {

        LinkedHashMap<String, Boolean> newMap = new LinkedHashMap<>();
        String oldLink = "";
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {

            if (!mapping.getValue()) {
                System.out.println(mapping.getKey() + "连接有参数:" + mapping.getKey());
                oldLink = mapping.getKey();
                try {
                    URL url = new URL(oldLink);

                    HttpURLConnection connection = (HttpURLConnection) url.openConnection();

                    connection.setRequestMethod("GET");
                    // FIX: without timeouts a single dead host hangs the whole crawl.
                    connection.setConnectTimeout(5000);
                    connection.setReadTimeout(5000);
                    if (connection.getResponseCode() == 200) {
                        // FIX: try-with-resources — the reader (and underlying stream)
                        // was never closed, leaking connections on every page.
                        try (BufferedReader reader = new BufferedReader(
                                new InputStreamReader(connection.getInputStream()))) {
                            Pattern p = Pattern.compile("<a\\b[^>]+\\bhref=\"([^\"]*)\"[^>]*>([\\s\\S]*?)</a>");
                            String line;
                            while ((line = reader.readLine()) != null) {
                                Matcher matcher = p.matcher(line);
                                // FIX: was `if (matcher.find())` — that extracted at most one
                                // anchor per HTML line; `while` captures every match.
                                while (matcher.find()) {
                                    String newLink = matcher.group(1);
                                    if (!newLink.startsWith("http")) {
                                        /* Resolve relative paths against the host. */
                                        if (newLink.startsWith("/")) {
                                            newLink = oldLinkHost + newLink;
                                        } else {
                                            newLink = oldLinkHost + "/" + newLink;
                                        }
                                    }
                                    /* Normalize: drop a single trailing slash. */
                                    if (newLink.endsWith("/")) {
                                        newLink = newLink.substring(0, newLink.length() - 1);
                                    }
                                    /* Keep only unseen, same-host links. */
                                    if (!oldMap.containsKey(newLink) && !newMap.containsKey(newLink)
                                            && newLink.startsWith(oldLinkHost)) {
                                        newMap.put(newLink, false);
                                    }
                                }
                            }
                        }
                    }
                } catch (Exception e) {
                    // FIX: was an empty catch that silently swallowed every failure;
                    // report which URL failed so broken pages are visible.
                    System.err.println("抓取失败: " + oldLink + " - " + e);
                } finally {
                    /* Mark as visited even on failure so we never retry it forever. */
                    oldMap.replace(oldLink, true);
                }
            }
        }

        if (!newMap.isEmpty()) {
            oldMap.putAll(newMap);
            oldMap.putAll(crawlLinks(oldLinkHost, oldMap));
        }
        return oldMap;
    }
}

==============

下载网站内容

复制代码
package com.xiaocao;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sun.net.www.http.HttpClient;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpCookie;

public class ImageCraw {

    /** Detail page to scrape (placeholder — set to a real page URL). */
    private static String url = "https://xxx";

    /**
     * Fetches the detail page with jsoup, locates the work image
     * ({@code .newdetail-skin #J_worksImg}), downloads it and saves it as
     * {@code F:\filetest\<alt>.jpg}.
     */
    public static void main(String[] args) {
//        apacheHttpClient();
        try {
            Document document = Jsoup.connect(url).get();

            Elements select = document.select(".newdetail-skin #J_worksImg");

            try {
                // src attribute is protocol-relative ("//..."), hence the "https:" prefix.
                Connection.Response src = Jsoup.connect("https:" + select.attr("src"))
                        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0")
                        .ignoreContentType(true)
                        .execute();

                // FIX: the alt text goes straight into a file path — strip characters
                // that are illegal in Windows file names (and path separators) to
                // avoid broken paths / path traversal.
                String name = select.attr("alt").replaceAll("[\\\\/:*?\"<>|]", "_");
                System.out.println(name);
                // FIX: try-with-resources — the stream was never closed.
                try (InputStream in = new ByteArrayInputStream(src.bodyAsBytes())) {
                    FileUtils.copyInputStreamToFile(in, new File("F:\\filetest\\" + name + ".jpg"));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Alternative fetch using Apache HttpClient: GETs {@link #url} with a
     * browser User-Agent and prints the response body.
     */
    private static void apacheHttpClient() {
        HttpGet httpGet = new HttpGet(url);
        /* Spoof a browser so the site doesn't reject the request. */
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0");

        // FIX: try-with-resources — the original leaked both the client and the
        // response (neither CloseableHttpClient nor CloseableHttpResponse was closed).
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            String body = EntityUtils.toString(entity);
            System.out.println(body);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
相关推荐
AORO20254 小时前
智能三防手机哪款好?22000mAh+夜视+露营灯打造专业户外装备
服务器·网络·智能手机·电脑·1024程序员节
java_logo8 小时前
SGLANG Docker容器化部署指南
linux·运维·docker·容器·eureka·1024程序员节
开利网络8 小时前
合规底线:健康产品营销的红线与避坑指南
大数据·前端·人工智能·云计算·1024程序员节
独行soc8 小时前
2025年渗透测试面试题总结-234(题目+回答)
网络·python·安全·web安全·渗透测试·1024程序员节·安全狮
金融小师妹11 小时前
多因子量化模型预警:美元强势因子压制金价失守4000关口,ADP数据能否重构黄金趋势?
人工智能·深度学习·1024程序员节
unable code11 小时前
攻防世界-Misc-SimpleRAR
网络安全·ctf·misc·1024程序员节
少林码僧13 小时前
2.3 Transformer 变体与扩展:BERT、GPT 与多模态模型
人工智能·gpt·ai·大模型·bert·transformer·1024程序员节
liu****15 小时前
11.Linux进程信号(三)
linux·运维·服务器·数据结构·1024程序员节
阿部多瑞 ABU16 小时前
Unicode全字符集加解密工具 - 命令行交互版:功能完整的终端解决方案
经验分享·交互·ai编程·1024程序员节
知识分享小能手19 小时前
jQuery 入门学习教程,从入门到精通, jQuery在HTML5中的应用(16)
前端·javascript·学习·ui·jquery·html5·1024程序员节