Jsoup 爬虫
Java 也能写爬虫!!!
Jsoup重要对象如下:
Document:文档对象,每个html页面都是一个Document对象
Element:元素对象,一个Document对象里有多个Element对象
Node:节点对象,是 Document 和 Element 的共同父类;页面中的文本、注释等内容也以节点形式存储
Jsoup的主要方法如下:
static Connection connect(String url) 创建URL连接
static Document parse(File in, String charsetName) 解析文件为 Document 对象
static Document parse(String html) 解析html代码为 Document 对象
(上面列出的是最常用的入口方法;下面的示例代码主要通过 Document 对象配合 CSS 选择器来提取信息)
爬虫示例(豆瓣)
java
/**
 * Refreshes the proxy pool by running the external Python helper
 * (getIP.py), which fetches and validates free proxies and writes them
 * to ipPool.txt. The helper's output is echoed to stdout.
 */
public void initIPPool() {
    System.out.println("开始获取IP...");
    try {
        // ProcessBuilder avoids the whitespace-splitting pitfalls of
        // Runtime.exec(String); merging stderr makes Python errors visible
        ProcessBuilder pb = new ProcessBuilder("python", "getIP.py");
        pb.redirectErrorStream(true);
        Process proc = pb.start();
        // try-with-resources guarantees the reader is closed on every path
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(proc.getInputStream(),
                        java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
        int exit = proc.waitFor();
        if (exit == 0) {
            System.out.println("成功获取代理IP");
        } else {
            // the original printed the success message unconditionally
            System.out.println("获取代理IP失败, exit code = " + exit);
        }
    } catch (IOException e) {
        System.out.println(e.toString());
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore interrupt status
        System.out.println(e.toString());
    }
}
/**
 * Loads the proxy pool from ipPool.txt (one "host:port" entry per line)
 * into the myIPPool array. The file read is synchronized so concurrent
 * reloads don't interleave.
 */
public void loadIPPool() {
    File file = new File("ipPool.txt");
    List<String> list = new ArrayList<String>();
    synchronized (this) {
        // try-with-resources closes the reader on every path; the original
        // closed it twice (once in the try body and again in finally)
        // NOTE(review): FileReader uses the platform charset — getIP.py
        // writes the file as UTF-8, confirm these match in deployment
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String tempString;
            // read line by line until EOF (readLine returns null)
            while ((tempString = reader.readLine()) != null) {
                list.add(tempString);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    System.out.println(list);
    myIPPool = list.toArray(new String[list.size()]);
    System.out.println("成功载入IP代理池");
}
/**
 * Crawls one page of the Douban search API through a random proxy and,
 * for every movie returned, crawls its detail page.
 *
 * @param start paging offset passed as the API's {@code start} parameter
 * @return the concatenated CSV rows for this page, or "" on failure
 */
public String crawlOnce(Integer start) {
    StringBuilder finalResult = new StringBuilder();
    Random random = new Random();
    // request URL for the JSON search endpoint
    String url =
            "http://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=" +
                    (start);
    HttpGet request = new HttpGet(url);
    // draw random entries from the pool until one has the host:port shape
    // NOTE(review): this spins forever if the pool holds no valid entry
    String proxyIp = myIPPool[random.nextInt(myIPPool.length)];
    while (proxyIp.split(":").length != 2) {
        proxyIp = myIPPool[random.nextInt(myIPPool.length)];
    }
    HttpHost proxy = new HttpHost(proxyIp.split(":")[0],
            Integer.parseInt(proxyIp.split(":")[1]));
    SSLContextBuilder builder = new SSLContextBuilder();
    // trust-all TLS: acceptable for a crawler, never for sensitive traffic
    PoolingHttpClientConnectionManager cm = null;
    SSLConnectionSocketFactory sslsf = null;
    try {
        builder.loadTrustMaterial(null, new TrustStrategy() {
            @Override
            public boolean isTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
                return true;
            }
        });
        sslsf = new SSLConnectionSocketFactory(builder.build(),
                new String[]{"SSLv2Hello", "SSLv3", "TLSv1", "TLSv1.2"},
                null, NoopHostnameVerifier.INSTANCE);
        Registry<ConnectionSocketFactory> registry =
                RegistryBuilder.<ConnectionSocketFactory>create()
                        .register("http", new PlainConnectionSocketFactory())
                        .register("https", sslsf)
                        .build();
        cm = new PoolingHttpClientConnectionManager(registry);
        cm.setMaxTotal(200); // max connections
    } catch (Exception e) {
        System.out.println(e.toString());
        return "";
    }
    // proxy credentials: empty user/password for anonymous proxies
    CredentialsProvider provider = new BasicCredentialsProvider();
    provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials("", ""));
    StringBuilder result = new StringBuilder();
    // try-with-resources closes the client; the original leaked both the
    // httpClient and the response reader
    try (CloseableHttpClient httpClient = HttpClients.custom().setSSLSocketFactory(sslsf)
            .setConnectionManager(cm)
            .setConnectionManagerShared(true)
            .setDefaultCredentialsProvider(provider)
            .build()) {
        RequestConfig config = RequestConfig.custom().setProxy(proxy)
                .setConnectTimeout(CONNECTION_TIME_OUT)
                .setConnectionRequestTimeout(CONNECTION_TIME_OUT)
                .setSocketTimeout(CONNECTION_TIME_OUT).build();
        request.setConfig(config);
        // randomized UA / Cookie to lower the chance of being blocked
        request.addHeader("User-Agent", myUAPool[random.nextInt(myUAPool.length)]);
        request.addHeader("Cookie", myCookies[random.nextInt(myCookies.length)]);
        request.addHeader("Accept-Language", "zh-CN,zh;q=0.9");
        request.addHeader("Sec-Fetch-Mode", "cors");
        request.addHeader("Sec-Fetch-Site", "same-origin");
        HttpResponse response;
        try {
            response = httpClient.execute(request);
        } catch (IOException e) {
            logError(start);
            return "";
        }
        try (BufferedReader rd = new BufferedReader(
                new InputStreamReader(response.getEntity().getContent()))) {
            String line;
            while ((line = rd.readLine()) != null) {
                // an HTML page (rather than JSON) means we were blocked
                if (line.equals("") || line.charAt(0) == '<') {
                    break;
                }
                result.append(line);
            }
        } catch (IOException e) {
            logError(start); // read aborted: still try to parse what we got
        }
    } catch (IOException e) {
        System.out.println(e.toString()); // failure while closing the client
        return "";
    }
    System.out.println((start) + "--result:" + result);
    JSONObject res = JSONObject.parseObject(String.valueOf(result));
    if (res == null || !res.containsKey("data")) {
        logError(start);
        return "";
    }
    JSONArray jsonArray = res.getJSONArray("data");
    for (int i = 0; i < jsonArray.size(); i++) {
        JSONObject jo = jsonArray.getJSONObject(i);
        // follow the detail link of each movie
        finalResult.append(crawlDetails(jo.getString("url")));
    }
    return finalResult.toString();
}
/**
 * Crawls one Douban movie detail page with Jsoup and flattens it into a
 * single CSV row: id, title, year, director, screenwriter, cast, genres,
 * country, language, runtime, rating, 5..1-star percentages, vote count,
 * comment count.
 *
 * @param url the detail-page URL, e.g. https://movie.douban.com/subject/1292052/
 * @return the CSV row terminated by '\n'; may be partial on failure
 */
private String crawlDetails(String url) {
    StringBuilder result = new StringBuilder();
    Random random = new Random();
    try {
        // random "host:port" proxy from the pool
        String proxyIp = myIPPool[random.nextInt(myIPPool.length)];
        Connection con = Jsoup.connect(url)
                .proxy(proxyIp.split(":")[0], Integer.parseInt(proxyIp.split(":")[1]))
                .userAgent(myUAPool[random.nextInt(myUAPool.length)])
                .header("Accept-Language", "zh-CN,zh;q=0.9")
                .header("Cookie", myCookies[random.nextInt(myCookies.length)])
                .timeout(CONNECTION_TIME_OUT); // the original set timeout twice; last one wins
        // execute once and parse the response; the original called
        // execute() AND get(), which issued the same request twice
        Document document = con.execute().parse();
        String info = document.select("#info").text();
        // movie id, cut out of the URL path
        // NOTE(review): substring(33) assumes the exact prefix length of
        // "https://movie.douban.com/subject/" — confirm against real URLs
        result.append(url.substring(33, url.length() - 1));
        // title
        result.append(",").append(document.select("#content > h1 > span:nth-child(1)").text());
        // year
        result.append(",").append(document.select("#content > h1 > span.year").text());
        // director
        result.append(",").append(document.select("#info > span:nth-child(1) > span.attrs > a").text());
        // screenwriter
        result.append(",").append(document.select("#info > span:nth-child(3) > span.attrs").text());
        // cast
        result.append(",").append(document.select("#info > span.actor > span.attrs").text());
        // genres
        result.append(",").append(document.select("[property=v:genre]").text());
        // country/region, sliced out of the "#info" text
        result.append(",").append(info.substring(info.indexOf("制片国家/地区: "), info.indexOf(" 语言:"))
                .substring("制片国家/地区: ".length()));
        // language (bounded by the release date when present)
        if (info.contains(" 上映日期:")) {
            result.append(",").append(info.substring(info.indexOf("语言: "), info.indexOf(" 上映日期:"))
                    .substring("语言: ".length()));
        } else {
            result.append(",").append(info.substring(info.indexOf("语言: ")).substring("语言: ".length()));
        }
        // runtime; the original selected [property=v:genre] here (copy-paste
        // slip) whose elements carry no content attribute
        result.append(",").append(document.select("[property=v:runtime]").attr("content"));
        // rating
        result.append(",").append(
                document.select("#interest_sectl > div > div.rating_self.clearfix > strong").text());
        // per-star percentages, nth-child(1)=5 stars down to nth-child(5)=1 star
        for (int child = 1; child <= 5; child++) {
            result.append(",").append(document.select(
                    "#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-child("
                            + child + ") > span.rating_per").text());
        }
        // number of raters
        result.append(",").append(document.select("[property=v:votes]").text());
        // number of comments
        result.append(",").append(document.select("#comments-section > div.mod-hd > h2 > span > a").text());
        System.out.println(proxyIp + " " + result);
    } catch (IOException e) {
        System.out.println(e.toString());
    }
    return result.append("\n").toString();
}
获取代理IP的代码
python
# coding=UTF-8
import requests
import json
class FreeIP():
    """Fetches a public proxy list, keeps the high-anonymity HTTPS entries
    that actually work, and writes them one "host:port" per line to
    ipPool.txt (consumed by the Java crawler's loadIPPool)."""

    def __init__(self):
        # free-proxy list endpoint
        self.url = "http://proxylist.fatezero.org/proxy.list"
        self.headers = {
            "User-Agent": "这里改为浏览器的useragent"}

    def check_ip(self, ip_list):
        """Return the "host:port" strings from ip_list that work as HTTPS proxies."""
        correct_ip = []
        for ip in ip_list:
            if len(correct_ip) > 10:  # cap the pool size; adjust or remove as needed
                break
            ip_port = "{}:{}".format(ip["host"], ip["port"])
            proxies = {'https': ip_port}
            try:
                # if the echo service reports the proxy's own IP, the proxy works
                response = requests.get('https://icanhazip.com/', proxies=proxies,
                                        timeout=3).text
                if response.strip() == ip["host"]:
                    correct_ip.append(ip_port)
            except requests.RequestException:
                # proxy dead or too slow — skip it (the original's bare
                # except swallowed every exception type)
                continue
        return correct_ip

    def run(self):
        """Download the raw list, parse it, validate, and persist the pool."""
        # the original defined self.headers but never sent it; pass it here
        response = requests.get(url=self.url, headers=self.headers).content.decode()
        ip_list = []
        for proxy_str in response.split('\n'):
            # one JSON object per line; skip malformed or incomplete entries
            try:
                proxy_json = json.loads(proxy_str)
                if proxy_json["anonymity"] == "high_anonymous" and proxy_json["type"] == "https":
                    ip_list.append({"host": proxy_json['host'],
                                    "port": proxy_json['port']})
            except (json.JSONDecodeError, KeyError):
                continue
        # NOTE(review): in the pasted original the validation/write step sat
        # inside the except block (indentation was lost); it must run exactly
        # once, after the whole list has been parsed.
        correct_ip = self.check_ip(ip_list)
        with open('ipPool.txt', mode='w', encoding='utf-8') as file_obj:
            for i in correct_ip:
                file_obj.write(i + "\n")
if __name__ == '__main__':
    # entry point: fetch, validate and persist the proxy pool
    proxy_fetcher = FreeIP()
    proxy_fetcher.run()