一、引入webmagic包
<!-- WebMagic core library; the version is resolved from the webmagic.core.version property. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>${webmagic.core.version}</version>
    <exclusions>
        <!-- Drop the slf4j-api that webmagic-core pulls in transitively
             (presumably so the project's own SLF4J version is used; confirm against the build). -->
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </exclusion>
    </exclusions>
</dependency>
二、启动爬虫
// Describe the single HTTP call the crawler starts from.
// NOTE(review): `content` (the signature value) and `start` (a Date handed to MPipeline)
// are expected to be defined by the surrounding code.
Request seedRequest = new Request();
seedRequest.setUrl("http://***");
seedRequest.setMethod("GET");
seedRequest.addHeader("client_signature", content);

// Assemble the crawler: one processor, one pipeline, one worker thread.
Spider crawler = Spider.create(new MPageProcessor())
        .addRequest(seedRequest)
        .addPipeline(new MPipeline(start))
        .thread(1);

// Kick off the crawl.
crawler.start();
三、PageProcessor
当然,PageProcessor 还可以对 HTML 内容做更多的解析,详见其它文章。
/**
 * Page processor for an endpoint whose response body is a JSON object.
 *
 * <p>The raw response text is parsed as JSON and its {@code "data"} member is
 * published under the result field {@code "data"} for the configured pipelines.
 * When the response has no {@code "data"} object, the page is marked as skipped
 * so that pipelines are not invoked with a {@code null} payload.
 */
public class MPageProcessor implements PageProcessor {

    /** Crawl settings: sleep time 3000 and timeout 120000 (milliseconds per the WebMagic Site API). */
    private final Site site = Site.me().setSleepTime(3000).setTimeOut(120000);

    /**
     * Parses the response body and exposes its {@code "data"} object to the pipelines.
     *
     * @param page the downloaded page whose raw text is expected to be a JSON object
     */
    @Override
    public void process(Page page) {
        JSONObject result = JSONUtil.parseObj(page.getRawText());
        JSONObject data = result.getJSONObject("data");
        if (data == null) {
            // No payload to hand over: skip the pipelines instead of publishing null,
            // which downstream code would dereference.
            page.setSkip(true);
            return;
        }
        page.putField("data", data);
    }

    /**
     * @return the crawl settings used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
四、Pipeline
/**
 * Pipeline that prints the {@code total} reported in the crawled JSON payload
 * together with the current timestamp and the time elapsed since {@code start}.
 */
public class MPipeline implements Pipeline {

    /**
     * Timestamp pattern with millisecond precision. A {@code Date} only carries
     * milliseconds, and a six-letter {@code SSSSSS} field would left-pad them with
     * zeros (123 ms printed as {@code .000123}), which reads as microseconds.
     */
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss.SSS";

    /** Reference instant from which the elapsed time is measured. */
    private final Date start;

    /**
     * @param start the instant the crawl was started; used as the origin of the elapsed time
     */
    public MPipeline(Date start) {
        this.start = start;
    }

    /**
     * Prints {@code data.queryBean.total}, the current time and the elapsed time.
     *
     * <p>If {@code data}, {@code queryBean} or {@code total} is absent, {@code null}
     * is printed in place of the total and the timing lines are still emitted.
     *
     * @param resultItems the extracted fields; {@code "data"} is read as a JSON object
     * @param task        the owning task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        JSONObject data = resultItems.get("data");
        JSONObject queryBean = (data == null) ? null : data.getJSONObject("queryBean");
        // Kept boxed so a missing value prints "null" instead of failing on unboxing.
        Integer total = (queryBean == null) ? null : queryBean.getInt("total");
        System.out.println(total);

        Date end = new Date();
        System.out.println(DateUtil.format(end, TIMESTAMP_PATTERN));
        System.out.println("总耗时:" + DateUtil.formatBetween(start, end, BetweenFormatter.Level.MILLISECOND));
    }
}