java
package com.tjp.www.xuexiangbanweb.webmagic;
import cn.hutool.core.io.FileUtil;
import com.alibaba.fastjson.JSONObject;
import com.tjp.www.xuexiangbanweb.entity.FHNewxVo;
import com.tjp.www.xuexiangbanweb.entity.FHResp;
import com.tjp.www.xuexiangbanweb.entity.NewsCollect;
import com.tjp.www.xuexiangbanweb.service.NewsCollectService;
import com.tjp.www.xuexiangbanweb.util.DownloadFile;
import com.tjp.www.xuexiangbanweb.util.HttpUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Drives one full crawl of ifeng.com search results.
 *
 * <p>For every keyword the search API is paged through, each hit becomes a WebMagic
 * {@link Request} for the article detail page, the spider is run synchronously, and the
 * articles collected in {@link CustomerPipelinefh#map} are then enriched with a locally
 * downloaded image and saved in one batch.
 */
@Component
public class WebMagicStarterFenghuang {

    /** Search keywords. Repeated entries are tolerated; each distinct keyword is crawled once. */
    private static final String[] KEYWORDS = new String[]{"北海舰队","海军航空兵","南沙群岛","辽宁舰","海军人员","南海岛礁","中国海军","南海局势","中国领海","山东舰","军事强国","北海某海域","中国某海域",
            "黄岩岛","黄海某海域","南海某海域","战区海军","南海舰队","海军航空兵","执法船","中国海洋资源","中国轮船","海事部门","海洋强国","渔业捕捞","海洋战略","海上应急",
            "中国岛礁","渔政","海洋战略","水产品","船舶管理","海洋资源","中国轮船","海上执法","海洋部门","东海某海域","海上演练","海航","中国渔船","渔业政策","黄岩岛",
            "驱逐舰","舰艇","海上演习","钢铁海洋","渔政部门","渔民","战斗舰","海洋鱼类","海洋生态","海上打捞","禁渔期","海上船只","海洋政策","海上巡航","建设海洋生态","发展海洋","海洋经济",
            "船事故","海警部门","中国水产","台湾海峡","海军部队","沿海港航","沿海航行"};

    /** Directory that downloaded images are stored in; created on demand. */
    private static final String IMAGE_DIR = "E:/images";

    /** The same directory in the exact form that is handed to {@link DownloadFile#downloadA}. */
    private static final String IMAGE_FILE_PREFIX = "E:\\images\\";

    /** Extension used when the image URL does not carry one. Assumption: JPEG is the common case. */
    private static final String DEFAULT_IMAGE_EXTENSION = ".jpg";

    /** Characters that are not allowed in Windows file names (also blocks path separators). */
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\\\/:*?\"<>|]";

    @Autowired
    NewsCollectService newsCollectService;
    @Autowired
    CustomerPipelinefh customerPipelinefh;
    @Autowired
    NewsPageProcessor2 newsPageProcessor2;

    /**
     * Runs the crawl end to end: collect detail-page requests, run the spider, download
     * images, persist the results and reset the shared pipeline map.
     *
     * <p>Exceptions thrown by the HTTP helper, the JSON parser, the downloader or the
     * service are not caught here and propagate to the caller.
     */
    public void doStartReptile() {
        List<Request> detailRequests = collectDetailRequests();
        System.out.println("数量--》"+detailRequests.size());

        // Fetch every detail page; run() blocks until the spider has drained its queue.
        Spider.create(newsPageProcessor2)
                .addRequest(detailRequests.toArray(new Request[0]))
                .addPipeline(customerPipelinefh)
                // crawl with 10 threads
                .thread(10)
                .run();
        System.out.println("读到--》"+CustomerPipelinefh.map.size());

        if (!FileUtil.exist(IMAGE_DIR)) {
            FileUtil.mkdir(IMAGE_DIR);
        }

        // Snapshot the pipeline results; the map is keyed by title + publish time.
        List<NewsCollect> lists = new ArrayList<>(CustomerPipelinefh.map.values());
        System.out.println("读取到的数据,去重前---》"+lists.size());

        int succeeded = 0;
        int attempted = 0;
        for (NewsCollect newsCollect : lists) {
            if (StringUtils.isBlank(newsCollect.getAttachments())) {
                continue;
            }
            if (downloadAttachment(newsCollect)) {
                System.out.println("成功下载了---》" + (++succeeded));
            }
            System.out.println("下载了---》" + (++attempted));
        }

        System.out.println("数据量--》"+lists.size());
        newsCollectService.saveBatch(lists);
        CustomerPipelinefh.map.clear();
        System.out.println("清除map-->"+CustomerPipelinefh.map.size());
    }

    /**
     * Pages through the search API for every keyword and builds one detail-page request
     * per search hit, tagging each request with the keyword that produced it.
     */
    private List<Request> collectDetailRequests() {
        List<Request> requests = new ArrayList<>();
        List<String> crawledKeywords = new ArrayList<>();
        for (String keyword : KEYWORDS) {
            // The keyword list contains repeats; querying them again only yields the same URLs.
            if (crawledKeywords.contains(keyword)) {
                continue;
            }
            crawledKeywords.add(keyword);

            // NOTE(review): the host/path of this probe URL looks like a redacted placeholder;
            // presumably it should be the same endpoint as the paging URL below - confirm.
            FHResp firstPage = fetchSearchPage("https://www.xxxxxx.com/xxxx/xxxxx/xxx/"+keyword+"/1/xxxx?callback=getSoFengDataCallback&_=16502467818311");
            if (firstPage == null) {
                System.err.println("No usable search response for keyword: " + keyword);
                continue;
            }
            int totalPage = firstPage.getData().getTotalPage();
            for (int pageNo = 1; pageNo <= totalPage; pageNo++) {
                FHResp pageResp = fetchSearchPage("https://shankapi.ifeng.com/season/getSoFengData/all/"+keyword+"/"+pageNo+"/getSoFengDataCallback?callback=getSoFengDataCallback&_=16502467818311");
                if (pageResp == null || pageResp.getData().getItems() == null) {
                    continue;
                }
                List<FHNewxVo> items = pageResp.getData().getItems();
                for (FHNewxVo item : items) {
                    if (StringUtils.isBlank(item.getUrl())) {
                        continue;
                    }
                    Map<String,Object> extras = new ConcurrentHashMap<>();
                    extras.put("keyword", keyword);
                    // Search hits are prefixed with the scheme; absolute URLs are left untouched.
                    String detailUrl = item.getUrl().startsWith("http") ? item.getUrl() : "https:" + item.getUrl();
                    Request request = new Request(detailUrl);
                    request.setExtras(extras);
                    requests.add(request);
                }
            }
        }
        return requests;
    }

    /**
     * Fetches one search-API page and parses its JSONP payload.
     *
     * @return the parsed response, or {@code null} when the body is empty, is not a JSONP
     *         call, or carries no {@code data} object
     */
    private FHResp fetchSearchPage(String url) {
        String json = unwrapJsonp(HttpUtils.get(url));
        if (json == null) {
            return null;
        }
        FHResp resp = JSONObject.parseObject(json, FHResp.class);
        if (resp == null || resp.getData() == null) {
            return null;
        }
        return resp;
    }

    /**
     * Extracts the argument of a JSONP call such as {@code callback({...})}.
     *
     * @return the text between the first {@code (} and the last {@code )}, or {@code null}
     *         when the body is blank or has no such pair
     */
    private static String unwrapJsonp(String body) {
        if (StringUtils.isBlank(body)) {
            return null;
        }
        int open = body.indexOf('(');
        int close = body.lastIndexOf(')');
        if (open < 0 || close <= open) {
            return null;
        }
        return body.substring(open + 1, close);
    }

    /**
     * Downloads the article's image and, on success, records its relative path on the article.
     *
     * @return {@code true} when the downloader reported success
     */
    private boolean downloadAttachment(NewsCollect newsCollect) {
        String imageUrl = newsCollect.getAttachments();
        String fileName = buildImageFileName(newsCollect, imageUrl);
        boolean downloaded = DownloadFile.downloadA(imageUrl, IMAGE_FILE_PREFIX + fileName);
        if (downloaded) {
            // Only point the record at the file once it actually exists.
            newsCollect.setAttachmentsurl("images/" + fileName);
        }
        return downloaded;
    }

    /**
     * Builds {@code <yyyyMMdd>_<author>_<title>_<source><ext>} and replaces characters that
     * are illegal in file names, since title and author come straight from scraped pages.
     */
    private static String buildImageFileName(NewsCollect newsCollect, String imageUrl) {
        String rawName = datePrefix(newsCollect.getPublishtime()) + "_" +
                newsCollect.getPublishpersoninfo() + "_" +
                newsCollect.getTitle().replaceAll("\\s*", "") + "_" +
                newsCollect.getNewssource() + imageExtension(imageUrl);
        return rawName.replaceAll(ILLEGAL_FILE_NAME_CHARS, "_");
    }

    /**
     * Turns a publish time that starts with {@code yyyy-MM-dd} into {@code yyyyMMdd}.
     * Blank input is returned unchanged; input without three dash-separated parts falls
     * back to the leading date portion with the dashes removed.
     */
    private static String datePrefix(String publishTime) {
        if (StringUtils.isBlank(publishTime)) {
            return publishTime;
        }
        String datePart = publishTime.length() > 10 ? publishTime.substring(0, 10) : publishTime;
        String[] parts = datePart.split("-");
        System.out.println("时间---》"+String.join("-", parts));
        if (parts.length < 3) {
            return datePart.replace("-", "");
        }
        return parts[0] + parts[1] + parts[2];
    }

    /**
     * Returns the extension (including the dot) of the last path segment of the URL,
     * ignoring any query string or fragment, or a default when there is none.
     */
    private static String imageExtension(String imageUrl) {
        String path = StringUtils.substringBefore(StringUtils.substringBefore(imageUrl, "?"), "#");
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        if (dot < 0 || dot == lastSegment.length() - 1) {
            return DEFAULT_IMAGE_EXTENSION;
        }
        return lastSegment.substring(dot);
    }
}
Pipeline组件逻辑
java
package com.tjp.www.xuexiangbanweb.webmagic;
import cn.hutool.core.util.ObjectUtil;
import com.tjp.www.xuexiangbanweb.entity.NewsCollect;
import com.tjp.www.xuexiangbanweb.mapper.NewsCollectMapper;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import javax.annotation.Resource;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * WebMagic pipeline that gathers the articles produced by the page processor into an
 * in-memory map so they can be post-processed after the spider has finished.
 */
@Component
public class CustomerPipelinefh implements Pipeline {

    @Resource
    NewsCollectMapper newsCollectMapper;

    /** Collected articles, keyed by title concatenated with publish time. */
    public static Map<String,NewsCollect> map=new ConcurrentHashMap<>();

    /**
     * Stores the {@code "newsObject"} result of a page, if there is one and it has a
     * non-blank title. A later article with the same key replaces the earlier one.
     *
     * @param resultItems the fields extracted for one page
     * @param task        the running spider (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Object extracted = resultItems.get("newsObject");
        if (ObjectUtil.isEmpty(extracted)) {
            return;
        }

        NewsCollect article = (NewsCollect) extracted;
        String title = article.getTitle();
        if (StringUtils.isBlank(title)) {
            return;
        }

        String dedupKey = title + article.getPublishtime();
        map.put(dedupKey, article);
    }
}
PageProcessor组件逻辑
java
package com.tjp.www.xuexiangbanweb.webmagic;
import cn.hutool.core.util.IdUtil;
import com.tjp.www.xuexiangbanweb.entity.NewsCollect;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor for ifeng.com article detail pages. Text articles dated 2021 or
 * 2022 are converted into a {@link NewsCollect} and published under the result field
 * {@code "newsObject"}; every other page yields no result.
 */
@Component
public class NewsPageProcessor2 implements PageProcessor {

    /** Upper bound on the stored article body length, in characters. */
    private static final int MAX_BODY_LENGTH = 32767;

    /** Entry point called by WebMagic for every downloaded page. */
    @Override
    public void process(Page page) {
        detailPages(page);
    }

    /**
     * Extracts title, author, publish time, body text and first image from a detail page.
     *
     * <p>Nothing is emitted when the page contains a video area, has no publish time, has
     * no body paragraphs, or is dated outside 2021/2022.
     *
     * @param page the downloaded detail page; the request may carry a {@code "keyword"}
     *             extra, which is copied onto the article (left {@code null} when absent)
     */
    public void detailPages(Page page){
        // A request without the extra must not abort extraction with a NullPointerException.
        Object keywordExtra = page.getRequest().getExtra("keyword");
        String keyword = keywordExtra == null ? null : keywordExtra.toString();

        Document document = page.getHtml().getDocument();
        String url = page.getRequest().getUrl();
        Elements atlasBlocks = document.getElementsByClass("atlas-4Dz3AOM8");
        String title = document.getElementsByClass("title-14yWv8ay").text();
        String person = document.getElementsByClass("source-3cecBclA").text();
        String time = document.getElementsByClass("time-M6w87NaQ").text();
        // NOTE(review): this literal spans several class names plus line breaks; presumably it
        // mirrors the page's raw class attribute verbatim - keep it unchanged unless verified.
        Elements paragraphs = document.getElementsByClass("smallFont-Z_OfA44W\n" +
                " text-20BABGxP\n" +
                " ").select("p");
        String videoText = document.getElementsByClass("videoArea-3bC47kpL").text();

        // Video pages are not handled.
        if (StringUtils.isNotBlank(videoText) || StringUtils.isBlank(time) || paragraphs.isEmpty()) {
            return;
        }

        // Times with fewer than three '/'-separated parts carry no year and get "2021" prepended.
        // NOTE(review): the hard-coded year looks tied to when this was written - confirm.
        if (time.split("/").length < 3) {
            time = "2021/" + time;
        }
        String year = time.split("/")[0];
        if (!"2022".equals(year) && !"2021".equals(year)) {
            return;
        }

        NewsCollect newsCollect = new NewsCollect();
        newsCollect.setId(IdUtil.getSnowflakeNextId());
        newsCollect.setTitle(title.replaceAll("\\s*", ""));
        newsCollect.setPublishpersoninfo(person);
        newsCollect.setPublishtime(time.replaceAll("/","-"));
        newsCollect.setNewsinfo(extractBody(paragraphs, atlasBlocks));
        newsCollect.setAttachments(firstImageSrc(paragraphs));
        newsCollect.setNewssource("凤凰网");
        newsCollect.setLinksnum("0");
        newsCollect.setForwardnum("0");
        newsCollect.setCommentnum("0");
        newsCollect.setEventtype("文章");
        newsCollect.setKeyword(keyword);
        newsCollect.setUrl(url);
        page.putField("newsObject", newsCollect);
    }

    /**
     * Joins the paragraph texts; when that is blank, appends the text that follows each
     * atlas box instead. The result is capped at {@link #MAX_BODY_LENGTH} characters.
     */
    private static String extractBody(Elements paragraphs, Elements atlasBlocks) {
        StringBuilder body = new StringBuilder();
        for (Element paragraph : paragraphs) {
            body.append(paragraph.text());
        }
        if (StringUtils.isBlank(body)) {
            for (Element atlas : atlasBlocks) {
                body.append(atlas.getElementsByClass("atlasBox-2CqWytIX").next().text());
            }
        }
        if (body.length() > MAX_BODY_LENGTH) {
            body.setLength(MAX_BODY_LENGTH);
        }
        return body.toString();
    }

    /** Returns the {@code src} of the first image inside the paragraphs, or {@code ""} if none. */
    private static String firstImageSrc(Elements paragraphs) {
        Element firstImage = paragraphs.select("img").first();
        return firstImage == null ? "" : firstImage.attr("src");
    }

    /** Crawl settings: 10 retries, UTF-8 charset, 6000 ms timeout and a desktop Chrome user agent. */
    @Override
    public Site getSite() {
        return Site.me().setRetryTimes(10)
                .setCharset("utf-8").setTimeOut(6000)
                .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36");
    }
}