因公司新业务行政执法建设需要,需对多个业务部门提供的目标网站相关行政复议文书进行爬取。
对多个目标网站进行研究后发现:爬取不同的目标网站时,需要的处理方式并不一样。文书有 pdf、doc 等多种格式;有的网站可以直接下载文件,有的网站接口字段加密传参、需要对接口返回做解密处理,有的则需要通过解析网页元素来获取数据。
导包
java
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.7.9</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.78</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.12</version>
</dependency>
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
Service
java
/***
* @Description:
* @Auther: lyonardo
* @Date: 2021/10/13 09:49
* @version : V1.0
*/
@Slf4j
@Service
public class SftjHttpSpiderServiceImpl implements SftjHttpSpiderService {
@Value("${admin_review_filePath}")
private String adminReviewFilePath;
@Resource
private AdministrativeDocumentsOriginalsMapper administrativeDocumentsOriginalsMapper;
@Override
public void spiderSftj(){
try {
Document pageDoc = JsoupUtil.buildDocument(SftjConstants.PageJsoupUrl);
Elements pageElements = pageDoc.select(".f12>script");
pageElements.forEach(x->{
String[] strings = x.data().split(";");
for (String s : strings){
if(s.contains("var countPage")){
int pageSize = RegexUtils.getNumberFromString(s);
for(int i=1;i<pageSize;i++){
Document doc = null;
try {
doc = JsoupUtil.buildDocument(SftjConstants.DocJsoupUrl+i+".html");
List<AdministrativeDocumentsOriginals> list = new ArrayList<>();
Elements elements = doc.select(".news .mf26 .overflow");
elements.forEach(
y-> {
Document doc1;
try {
doc1 = JsoupUtil.buildDocument(y.attr("abs:href"));
Elements elements1 = doc1.select(".attachments>ul>li>a");
elements1.forEach(t->{
log.debug("t.href::"+t.attr("abs:href"));
if(StringUtils.isNotEmpty(t.attr("abs:href"))){
FileUtil.createDirs(adminReviewFilePath);
String content = PDFBoxUtil.readAndSavePdfUrl(t.attr("abs:href"),adminReviewFilePath+y.ownText()+".pdf");
list.add(AdministrativeDocumentsOriginals.build("天津市司法局",y.ownText(),content,
adminReviewFilePath+y.ownText()+".pdf",y.select(".news-date").text(),null,2));
}
});
} catch (IOException e) {
log.error("JsoupUtil buildDocument");
}
}
);
try {
administrativeDocumentsOriginalsMapper.batchUpsert(list);
}catch (Exception e){
e.printStackTrace();
log.error("batchUpsert失败!!!");
}
} catch (IOException e) {
log.error("处理index页面失败==》本次爬虫失败!!!");
}
}
}
}
}
);
} catch (IOException e) {
log.error("处理Document失败==》本次爬虫失败!!!");
}
}
}
另外一个网站:把查询条件分批,逐批处理。
java
/***
* @Description:
* @Auther: lyonardo
* @Date: 2021/09/24 11:08
* @version : V1.0
*/
@Slf4j
@Component
public class CfwsHttpSpiderJob {
@Autowired
private DoCfwsHttpSpiderTask spiderTask;
@Value("${partition_size}")
private Integer partitionSize;
@Scheduled(cron = "${punish_jobs_cron}")
public void spiderCfws() {
List<List<String>> subQueryConditionList = Lists.partition(QueryConditionConstant.queryConditionList, partitionSize);
for (List<String> smallerList : subQueryConditionList) {
try {
spiderTask.doCfwsHttpSpider(smallerList);
} catch (Exception e) {
e.printStackTrace();
log.info("爬虫失败");
}
}
}
}
java
/***
* @Description:
* @Auther: lyonardo
* @Date: 2021/09/29 16:09
* @version : V1.0
*/
@Slf4j
@Component
public class DoCfwsHttpSpiderTask implements Runnable{
@Resource
private AdministrativeDocumentsOriginalsMapper administrativeDocumentsOriginalsMapper;
@Value("${admin_punish_filePath}")
public String adminPunishFilePath;
@Value("${ciphertext}")
public String ciphertext ;
@Value("${sleep_longtime}")
public Integer sleepLongtime;
private List<String> smallerList;
/**
 * {@link Runnable} entry point: crawls the batch held in the {@code smallerList} field.
 *
 * <p>NOTE(review): no visible code assigns the {@code smallerList} field, so it is
 * presumably still {@code null} when this runs - confirm how the task is submitted.
 */
@Override
public void run() {
    try {
        log.info("run thread...");
        doCfwsHttpSpider(smallerList);
    } catch (InterruptedException e) {
        // Preserve the interrupt for whoever owns this thread.
        Thread.currentThread().interrupt();
        log.error("Cfws spider task interrupted", e);
    } catch (Exception e) {
        log.error("Cfws spider task failed", e);
    }
}
/**
 * Crawls one batch and then pauses for {@code sleep_longtime} minutes.
 *
 * <p>Both the crawl and the pause happen while holding this object's monitor,
 * so concurrent callers on the same instance run strictly one after another,
 * each followed by its own pause.
 *
 * @param smallerList the batch of query conditions to crawl
 * @throws Exception {@link InterruptedException} if the pause is interrupted
 */
public void doCfwsHttpSpider(List<String> smallerList) throws Exception {
    synchronized (this) {
        doSpiderCfws(smallerList);
        DateTimeFormatter timestampFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        log.info("休眠"+sleepLongtime+"分钟==》"+LocalDateTime.now().format(timestampFormat));
        // Multiply as long: the int product overflows for pauses longer than about 24 days.
        Thread.sleep(sleepLongtime * 60L * 1000L);
        log.info(sleepLongtime+"分钟后wake up==》"+LocalDateTime.now().format(timestampFormat));
    }
}
/**
 * Runs the list queries for one batch, downloads the PDF of every hit and
 * upserts the resulting records in a single batch.
 *
 * <p>"爬虫失败" is logged only when the list queries return nothing (the
 * original logged it unconditionally, even after a successful upsert).
 *
 * @param smallerList the batch of query conditions to crawl
 */
public void doSpiderCfws(List<String> smallerList) {
    List<String> resultStringList = resultList(smallerList);
    if (!CollectionUtils.isNotEmpty(resultStringList)) {
        log.warn("爬虫失败");
        return;
    }
    List<AdministrativeDocumentsOriginals> documents = new ArrayList<>();
    resultStringList.forEach(queryResponse -> collectCfwsDocuments(queryResponse, documents));
    if (documents.isEmpty()) {
        // Nothing new was downloaded; do not send an empty batch to the mapper.
        log.info("list==>{}", documents);
        return;
    }
    try {
        log.info("list==>{}", documents);
        log.info("batchUpsert:==>{}", administrativeDocumentsOriginalsMapper.batchUpsert(documents));
    } catch (Exception e) {
        log.error("入库异常==》爬虫失败", e);
    }
}

/**
 * Walks {@code result.queryResult.resultList} of one list-query response and
 * fetches the document behind every row.
 */
private void collectCfwsDocuments(String queryResponse, List<AdministrativeDocumentsOriginals> sink) {
    log.info("result==>{}", queryResponse);
    if (queryResponse == null) {
        return;
    }
    JSONObject response = JSONObject.parseObject(queryResponse);
    JSONObject result = response == null ? null : response.getJSONObject("result");
    JSONObject queryResult = result == null ? null : result.getJSONObject("queryResult");
    JSONArray rows = queryResult == null ? null : queryResult.getJSONArray("resultList");
    if (rows == null) {
        return;
    }
    log.info("resultCount==>{}", queryResult.getInteger("resultCount"));
    for (int i = 0; i < rows.size(); i++) {
        fetchCfwsDocument(rows.getJSONObject(i).getString("rowkey"), sink);
    }
}

/**
 * Requests one document by {@code rowkey}, stores its PDF payload (field
 * {@code i7}) under the name given by field {@code i0} and appends the record
 * to {@code sink}. Rows without a name or payload are skipped.
 */
private void fetchCfwsDocument(String rowkey, List<AdministrativeDocumentsOriginals> sink) {
    String docResponse = HttpUtil.post(CfwsConstant.httpGetDocUrl, buildGetDocParamMap(rowkey));
    if (!StringUtils.isNotEmpty(docResponse)) {
        return;
    }
    JSONObject parsed = JSONObject.parseObject(docResponse);
    JSONObject doc = parsed == null ? null : parsed.getJSONObject("result");
    if (doc == null) {
        return;
    }
    String i0 = doc.getString("i0");
    log.info("i0 before==>{}", i0);
    String i7 = doc.getString("i7");
    if (!StringUtils.isNotEmpty(i0) || i7 == null || i7.isEmpty()) {
        return;
    }
    try {
        // Presumably i7 is Base64 whose '+' characters arrive as spaces - confirm.
        String pdfPayload = i7.replace(" ", "+");
        FileUtil.createDirs(adminPunishFilePath);
        // NOTE(review): i0 comes from the remote response and is used as a file name as-is.
        String allFilePath = adminPunishFilePath + i0 + ".pdf";
        if (FileUtil.isCreateNewFile(allFilePath)) {
            sink.add(AdministrativeDocumentsOriginals.build(
                    blankIfNull(doc.getString("i3")), i0,
                    PDFBoxUtil.readAndSavePdfStr(pdfPayload, allFilePath), allFilePath,
                    blankIfNull(doc.getString("i1")), blankIfNull(doc.getString("i4")), 1));
        }
    } catch (IOException e) {
        log.error(i0+",PDF处理失败,此次爬虫失败!!!", e);
    }
}

/** Returns {@code value}, or the empty string when it is {@code null}. */
private static String blankIfNull(String value) {
    return value == null ? "" : value;
}
...
}
还有另外几个网站。
JDK等源码也有很多深嵌套,是if嵌套还是if反向判断throw new Exception,由团队代码风格和任务紧急度决定。
爬取数据最棘手的,一是面向监狱编程,二是IP池中IP的收集,三是加密破解,四是限流。
问题:前端防爬机制如何处理?
java
/**
 * Builds the "ciphertext" request parameter.
 *
 * The current timestamp (milliseconds, as a string) is passed to
 * DES3.encrypt together with a fresh 24-character random salt and an 8-digit
 * yyyyMMdd string built from the local date. The result is salt + date +
 * encrypted text, encoded by strTobinary.
 *
 * @returns {string} space-separated binary character codes
 */
function cipher() {
    var now = new Date();
    var twoDigits = function (value) {
        return value < 10 ? "0" + value : value.toString();
    };

    var timestamp = now.getTime().toString();
    var salt = random(24);
    var iv = now.getFullYear().toString() + twoDigits(now.getMonth() + 1) + twoDigits(now.getDate());

    var encrypted = DES3.encrypt(timestamp, salt, iv).toString();
    return strTobinary(salt + iv + encrypted);
}
/**
 * Converts every UTF-16 code unit of a string to its binary representation
 * (no zero padding) and joins the pieces with single spaces.
 *
 * @param {string} str text to encode
 * @returns {string} e.g. "ab" becomes "1100001 1100010"; "" stays ""
 */
function strTobinary(str) {
    var codes = str.split("").map(function (ch) {
        return ch.charCodeAt(0).toString(2);
    });
    return codes.join(" ");
}
/**
 * Returns a random string of the given length drawn from [0-9a-zA-Z].
 *
 * Every character is picked with equal probability. The previous
 * Math.round(Math.random() * (length - 1)) gave the first and last character
 * of the alphabet only half the chance of the others.
 *
 * @param {number} size number of characters to generate
 * @returns {string} random string; "" when size is 0
 */
function random (size){
    var alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    var str = "";
    for (var i = 0; i < size; i++) {
        // Math.random() is in [0, 1), so floor(...) is always a valid index.
        str += alphabet.charAt(Math.floor(Math.random() * alphabet.length));
    }
    return str;
}
/**
 * Post-processes the response of an interface protected by the anti-crawling
 * mechanism.
 *
 * Behaviour by obj.code:
 *  - "1" with a truthy secretKey: obj.result is decrypted with
 *    DES3.decrypt(obj.result, obj.secretKey), parsed as JSON, and obj is returned.
 *  - "1" with secretKey null/undefined: obj is returned unchanged.
 *  - "-11": the retry url and parameters are stored in localStorage and a
 *    captcha dialog is opened; nothing is returned.
 *  - "-12": an "access restricted" message is shown; nothing is returned.
 * Any other case falls through and returns undefined.
 *
 * @param obj response body, either a JSON string or an already parsed object
 * @returns the (possibly decrypted) response object for code "1", otherwise undefined
 */
function getData(obj){
try{
obj = $.parseJSON(obj)
}catch(e){
// Parsing failed: the exception is ignored and obj is used as it was passed in.
}
if(obj.code =="1" && obj.secretKey){
var data = DES3.decrypt(obj.result, obj.secretKey);
obj.result= $.parseJSON(data);
return obj;
}
// Loose equality: matches both null and undefined.
if(obj.code =="1" && obj.secretKey==null){
return obj;
}
if(obj.code =="-11"){
// Remember what to replay once the captcha has been entered.
window.localStorage.setItem("url",obj.result.url);
window.localStorage.setItem("parameterMap",JSON.stringify(obj.result.parameterMap));
layer.open({
title:'访问受限',
content: "<div class='popMain'><p><span>输入内容</span><input id='yzm' type='text' placeholder='请输入验证码'/></p><p><span>验证码</span><a style='display:inline-block;width:70%;text-align: left;'><img title='点击更换验证码' id='imgcode' src='/captcha/captchaImage?key=antitheftImageCode' /><span class='yzmtip'>验证码不能为空</span></a></p></div>",
btn:'确定',
shade:0.8,
btnAlign:'c',
area:['400px','200px'],
// Confirm button: replay the stored request with the captcha and a fresh ciphertext.
yes:function(){
var ciphertext = cipher();
var yzm = $("#yzm").val();
// Empty captcha: show the hint and stop (returns false to layer).
if(yzm==null || $.trim(yzm)=="" || yzm=="undefined"){
$('.yzmtip').show();
return false;
}
var url = window.localStorage.getItem("url");
var parameterMap = window.localStorage.getItem("parameterMap");
parameterMap=$.parseJSON(parameterMap);
parameterMap.yzm=yzm;
parameterMap.ciphertext=ciphertext;
$.ajax({
type: "post",
async: true,
url: url,
// NOTE(review): jQuery's documented option name is "dataType" (camel case); this
// lower-case key is presumably ignored, which would explain the manual parse below - confirm.
datatype:"json",
data: parameterMap,
success: function(data) {
try{
data = $.parseJSON(data)
}catch(e){
// Parsing failed: the exception is ignored and data is used as received.
}
// Any code other than -11 (including a missing code) is treated as success:
// close the dialog and reload the page. The "|| !data.code" part is redundant,
// because a falsy code is already != -11.
if(data.code != -11 || !data.code){
//window.localStorage.setItem("result",JSON.stringify(data));
layer.closeAll();
window.location.reload();
return ;
}
// Still restricted: show the server's description, if there is one.
if(data.code == -11 && data.description!=null && data.description!=""){
$(".yzmtip").html("<font color='red'>"+data.description+"</font>");
$('.yzmtip').show();
}
// Load a new captcha image; the random query value makes the URL unique.
$("#imgcode").attr("src", "/captcha/captchaImage?key=antitheftImageCode&random="+Math.random());
},
// Request errors are ignored.
error:function(data){
}
})
}
})
}
if(obj.code =="-12"){
layer.msg("访问受限");
}
}
/**
 * Once the DOM is ready, registers a delegated click handler on the document:
 * clicking #imgcode points the image at a new captcha URL carrying a random
 * query value.
 */
$(function () {
    var reloadCaptcha = function () {
        var freshSrc = "/captcha/captchaImage?key=antitheftImageCode&random=" + Math.random();
        $("#imgcode").attr("src", freshSrc);
    };
    $(document).on("click", "#imgcode", reloadCaptcha);
});
现有后端破解代码
java
/***
* @Description:
* @Auther: lyonardo
* @Date: 2021/10/22 16:07
* @version : V1.0
*/
public class CryptoUtil {
private static String CODE_TYPE = "UTF-8";
public static final String transformation = "DES/ECB/NOPadding";
/**
 * Encrypts {@code datasource} with DES using the given key and returns the
 * result Base64-encoded.
 *
 * <p>Key and plaintext are both converted to bytes with {@code CODE_TYPE}
 * (the plaintext previously used the platform default charset, so non-ASCII
 * input produced machine-dependent results).
 *
 * @param KEY        key material; {@code DESKeySpec} uses its first 8 bytes
 * @param datasource text to encrypt
 * @return Base64 text of the ciphertext, or {@code null} if encryption fails
 */
public String encode(String KEY, String datasource){
    try {
        DESKeySpec desKey = new DESKeySpec(KEY.getBytes(CODE_TYPE));
        SecretKey securekey = SecretKeyFactory.getInstance("DES").generateSecret(desKey);
        // "DES" without mode/padding relies on the provider's defaults.
        Cipher cipher = Cipher.getInstance("DES");
        cipher.init(Cipher.ENCRYPT_MODE, securekey, new SecureRandom());
        byte[] encrypted = cipher.doFinal(datasource.getBytes(CODE_TYPE));
        return new String(Base64.encodeBase64(encrypted), CODE_TYPE);
    } catch (Exception e) {
        // Best effort, as before: report the failure and signal it with null.
        e.printStackTrace();
        return null;
    }
}
/**
 * Encrypts {@code text} with Triple DES and returns the result Base64-encoded.
 *
 * <p>When {@code vector} is non-empty it is used as the CBC initialisation
 * vector; when it is {@code null} or empty, ECB is used. Previously the
 * argument was ignored and ECB was always used. Padding is requested as
 * {@code PKCS5Padding}, which the JDK's built-in provider supports and which
 * pads 8-byte blocks the same way as PKCS#7.
 *
 * <p>Failures are propagated to the caller instead of being replaced by
 * {@code null} after printing "error".
 *
 * @param key    key material, at least 24 bytes in UTF-8 (the first 24 are used)
 * @param text   text to encrypt
 * @param vector initialisation vector, exactly 8 bytes in UTF-8, or {@code null}/empty for ECB
 * @return Base64 text of the ciphertext
 * @throws Exception if the key or vector has an invalid length or the cipher is unavailable
 */
public static String encrypt(String key,String text,String vector) throws Exception {
    final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
    SecretKey secretKey = SecretKeyFactory.getInstance("DESede")
            .generateSecret(new DESedeKeySpec(key.getBytes(utf8)));
    Cipher cipher;
    if (vector == null || vector.isEmpty()) {
        cipher = Cipher.getInstance("DESede/ECB/PKCS5Padding");
        cipher.init(Cipher.ENCRYPT_MODE, secretKey);
    } else {
        cipher = Cipher.getInstance("DESede/CBC/PKCS5Padding");
        cipher.init(Cipher.ENCRYPT_MODE, secretKey,
                new javax.crypto.spec.IvParameterSpec(vector.getBytes(utf8)));
    }
    byte[] encrypted = cipher.doFinal(text.getBytes(utf8));
    return java.util.Base64.getEncoder().encodeToString(encrypted);
}
/**
 * Creates an initialised DES cipher for the given mode.
 *
 * <p>The key is built as a DES key, so the cipher must be DES as well. The
 * previous {@code DESede/ECB/PKCS7Padding} cipher rejected that key during
 * {@code init}, so this method always returned {@code null} and
 * {@code encode1}/{@code decode1} could never succeed.
 *
 * @param opmode {@code Cipher.ENCRYPT_MODE} or {@code Cipher.DECRYPT_MODE}
 * @param key    key material; {@code DESKeySpec} uses its first 8 bytes (UTF-8)
 * @return the initialised cipher, or {@code null} if it could not be created
 */
private static Cipher GetCipher(int opmode, String key) {
    try {
        DESKeySpec dks = new DESKeySpec(key.getBytes("UTF-8"));
        Key secretKey = SecretKeyFactory.getInstance("DES").generateSecret(dks);
        Cipher cipher = Cipher.getInstance("DES/ECB/PKCS5Padding");
        cipher.init(opmode, secretKey);
        return cipher;
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}

/**
 * Encrypts {@code data} with DES and returns the ciphertext as upper-case hex.
 *
 * @param data text to encrypt
 * @param key  key material, see {@code GetCipher}
 * @return upper-case hex ciphertext; {@code null} when {@code data} is null/empty
 *         or no cipher could be created; {@code data} itself when encryption throws
 */
public static String encode1(String data , String key) {
    if (data == null || data.isEmpty()) {
        return null;
    }
    try {
        Cipher cipher = GetCipher(Cipher.ENCRYPT_MODE, key);
        if (cipher == null) {
            return null;
        }
        return byteToHexString(cipher.doFinal(data.getBytes("UTF-8")));
    } catch (Exception e) {
        e.printStackTrace();
        // NOTE(review): kept from the original - on failure the PLAINTEXT is returned.
        return data;
    }
}

/**
 * Decrypts hex text produced by {@code encode1}.
 *
 * @param data hex ciphertext (upper or lower case)
 * @param key  key material, see {@code GetCipher}
 * @return the decrypted text; {@code null} when {@code data} is null/empty or no
 *         cipher could be created; {@code data} itself when it is not valid hex
 *         or decryption throws
 * @throws Exception declared for compatibility; failures are reported by return value
 */
public static String decode1(String data , String key) throws Exception {
    if (data == null || data.isEmpty()) {
        return null;
    }
    try {
        byte[] cipherBytes = hexToBytes(data);
        Cipher cipher = GetCipher(Cipher.DECRYPT_MODE, key);
        if (cipher == null) {
            return null;
        }
        return new String(cipher.doFinal(cipherBytes), "UTF-8");
    } catch (Exception e) {
        e.printStackTrace();
        return data;
    }
}

/** Decodes hex text into bytes; the inverse of {@code byteToHexString}. */
private static byte[] hexToBytes(String hex) {
    if (hex.length() % 2 != 0) {
        throw new IllegalArgumentException("Odd number of hex characters: " + hex.length());
    }
    byte[] out = new byte[hex.length() / 2];
    for (int i = 0; i < out.length; i++) {
        int high = Character.digit(hex.charAt(2 * i), 16);
        int low = Character.digit(hex.charAt(2 * i + 1), 16);
        if (high < 0 || low < 0) {
            throw new IllegalArgumentException("Illegal hex character at index " + (2 * i));
        }
        out[i] = (byte) ((high << 4) | low);
    }
    return out;
}

/**
 * Formats bytes as upper-case hex, two characters per byte.
 *
 * @param bytes bytes to format
 * @return hex text, e.g. {@code {0x0A, (byte) 0xFF}} becomes {@code "0AFF"}
 */
public static String byteToHexString(byte[] bytes) {
    final char[] digits = "0123456789ABCDEF".toCharArray();
    StringBuilder hex = new StringBuilder(bytes.length * 2);
    for (byte b : bytes) {
        hex.append(digits[(b >> 4) & 0x0F]).append(digits[b & 0x0F]);
    }
    return hex.toString();
}
// NOTE(review): hard-coded key in source; presumably a placeholder - confirm before real use.
private static String KEY = "password111111";

/**
 * Encrypts {@code datasource} with DES using the built-in {@code KEY} and
 * returns the result Base64-encoded.
 *
 * <p>Key and plaintext are both converted to bytes with {@code CODE_TYPE}
 * (the plaintext previously used the platform default charset, so non-ASCII
 * input produced machine-dependent results).
 *
 * @param datasource text to encrypt
 * @return Base64 text of the ciphertext, or {@code null} if encryption fails
 */
public static String encode(String datasource){
    try {
        DESKeySpec desKey = new DESKeySpec(KEY.getBytes(CODE_TYPE));
        SecretKey securekey = SecretKeyFactory.getInstance("DES").generateSecret(desKey);
        // "DES" without mode/padding relies on the provider's defaults.
        Cipher cipher = Cipher.getInstance("DES");
        cipher.init(Cipher.ENCRYPT_MODE, securekey, new SecureRandom());
        byte[] encrypted = cipher.doFinal(datasource.getBytes(CODE_TYPE));
        return new String(Base64.encodeBase64(encrypted), CODE_TYPE);
    } catch (Exception e) {
        // Best effort, as before: report the failure and signal it with null.
        e.printStackTrace();
        return null;
    }
}
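当时用 js 调试和改造,前端和后端使用同样的时间戳和盐值字符串,得到的加密结果却不一样。可能的原因(待验证):前端调用 DES3.encrypt(timestamp, salt, iv) 时传入了 iv,而后端的 encrypt 方法使用 ECB 模式、完全没有用到 vector/iv 参数;另外后端指定的 PKCS7Padding 在 JDK 默认的加密提供者中不可用,需要改用 PKCS5Padding 或引入 BouncyCastle。
TODO:解密。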