JAVA爬虫系列

准备工作

导入依赖

XML 复制代码
 <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>

yml

XML 复制代码
logging:
  level:
    root: info
    com.lrm: debug

1.入门程序(获取到静态页面)

java 复制代码
package com.itheima.reggie.utils;


import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * @Author lpc
 **/
public class CrawlerFirst {
    public static void main(String[] args) throws Exception {

        //1.打开浏览器,创建Httpclient对象
        CloseableHttpClient httpClient = HttpClients.createDefault();

       //2.输入网址,发起get请求创建HttpGet对象
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        //3.按回车,发起请求,返回响应,使用Httpclient对象发起请求
        CloseableHttpResponse response = httpClient.execute(httpGet);
        //4.解析响应,获取数据
        //判斯状态码是否是200
        if (response.getStatusLine().getStatusCode()==200){
            HttpEntity httpEntity = response.getEntity();
            //获取前端静态页面
            String content = EntityUtils.toString(httpEntity,"utf8");
            System.out.println(content);
        }


    }
}

2.HttpClient---Get

java 复制代码
package com.itheima.reggie.utils;


import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * @Author lpc
 * @Date 2024 03 12 00 23
 **/
public class CrawlerFirst {
    public static void main(String[] args){

        //1.打开浏览器,创建Httpclient对象
        CloseableHttpClient httpClient = HttpClients.createDefault();

       //2.输入网址,发起get请求创建HttpGet对象
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        //3.按回车,发起请求,返回响应,使用Httpclient对象发起请求
        CloseableHttpResponse response = null;

        try {
            response = httpClient.execute(httpGet);
            //4.解析响应,获取数据
            //判斯状态码是否是200
            if (response.getStatusLine().getStatusCode()==200){
                HttpEntity httpEntity = response.getEntity();
                //获取前端静态页面
                String content = EntityUtils.toString(httpEntity,"utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }finally {
            try {
                //关闭response
                response.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            try {
                //关闭浏览器
                httpClient.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }


    }
}
相关推荐
小白学大数据19 分钟前
Python爬虫伪装策略:如何模拟浏览器正常访问JSP站点
java·开发语言·爬虫·python
SEO_juper1 小时前
别再纠结LLMs.txt了!它背后的真相与最佳使用场景,一文讲透。
开发语言·ai·php·数字营销
程序员西西1 小时前
SpringBoot接口安全:APIKey保护指南
java·spring boot·计算机·程序员·编程·编程开发
g***B7381 小时前
JavaScript在Node.js中的模块系统
开发语言·javascript·node.js
summer_west_fish1 小时前
单体VS微服务:架构选择实战指南
java·微服务·架构
v***8571 小时前
Ubuntu介绍、与centos的区别、基于VMware安装Ubuntu Server 22.04、配置远程连接、安装jdk+Tomcat
java·ubuntu·centos
烤麻辣烫2 小时前
黑马程序员大事件后端概览(表现效果升级版)
java·开发语言·学习·spring·intellij-idea
q***96582 小时前
Spring总结(上)
java·spring·rpc
思密吗喽2 小时前
宠物商城系统
java·开发语言·vue·毕业设计·springboot·课程设计·宠物
csbysj20202 小时前
Lua 函数
开发语言