Java爬虫

因公司新业务行政执法建设需要,需对多个业务部门提供的目标网站相关行政复议文书进行爬取。

对多个目标网站研究后发现:不同的目标网站需要不同的爬取处理方式。文书有pdf、doc等格式;有的网站可以直接下载,有的接口对字段加密传参、需要对接口数据做加解密处理,有的则需要通过解析网页元素来获取。

导包

java 复制代码
<dependency>
   <groupId>org.jsoup</groupId>
   <artifactId>jsoup</artifactId>
   <version>1.12.1</version>
</dependency>
<dependency>
   <groupId>cn.hutool</groupId>
   <artifactId>hutool-all</artifactId>
   <version>5.7.9</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.78</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>
<dependency>
     <groupId>commons-httpclient</groupId>
     <artifactId>commons-httpclient</artifactId>
     <version>3.1</version>
</dependency>
<dependency>
     <groupId>commons-io</groupId>
     <artifactId>commons-io</artifactId>
     <version>2.11.0</version>
</dependency>

Service

java 复制代码
/***
 * @Description: 
 * @Auther: lyonardo
 * @Date: 2021/10/13 09:49
 * @version : V1.0
 */
@Slf4j
@Service
public class SftjHttpSpiderServiceImpl implements SftjHttpSpiderService {
    @Value("${admin_review_filePath}")
    private String adminReviewFilePath;
    @Resource
    private AdministrativeDocumentsOriginalsMapper administrativeDocumentsOriginalsMapper;
     @Override
     public void spiderSftj(){
          try {
              Document  pageDoc = JsoupUtil.buildDocument(SftjConstants.PageJsoupUrl);
              Elements pageElements = pageDoc.select(".f12>script");
              pageElements.forEach(x->{
                  String[] strings = x.data().split(";");
                  for (String s : strings){
                      if(s.contains("var countPage")){
                          int pageSize = RegexUtils.getNumberFromString(s);
                          for(int i=1;i<pageSize;i++){
                              Document  doc = null;
                              try {
                                  doc = JsoupUtil.buildDocument(SftjConstants.DocJsoupUrl+i+".html");
                                  List<AdministrativeDocumentsOriginals> list = new ArrayList<>();
                                  Elements elements = doc.select(".news .mf26 .overflow");
                                  elements.forEach(
                                          y-> {
                                              Document doc1;
                                              try {
                                                  doc1 = JsoupUtil.buildDocument(y.attr("abs:href"));
                                                  Elements elements1 = doc1.select(".attachments>ul>li>a");
                                                  elements1.forEach(t->{
                                                      log.debug("t.href::"+t.attr("abs:href"));
                                                      if(StringUtils.isNotEmpty(t.attr("abs:href"))){
                                                          FileUtil.createDirs(adminReviewFilePath);
                                                       String content = PDFBoxUtil.readAndSavePdfUrl(t.attr("abs:href"),adminReviewFilePath+y.ownText()+".pdf");
                                                          list.add(AdministrativeDocumentsOriginals.build("天津市司法局",y.ownText(),content,
                                                                  adminReviewFilePath+y.ownText()+".pdf",y.select(".news-date").text(),null,2));
                                                      }
                                                  });
                                              } catch (IOException e) {
                                                  log.error("JsoupUtil buildDocument");
                                              }
                                          }
                                  );
                                  try {
                                      administrativeDocumentsOriginalsMapper.batchUpsert(list);
                                  }catch (Exception e){
                                      e.printStackTrace();
                                      log.error("batchUpsert失败!!!");
                                  }
                              } catch (IOException e) {
                                  log.error("处理index页面失败==》本次爬虫失败!!!");
                              }
                          }
                      }
                   }
                 }
              );
          } catch (IOException e) {
              log.error("处理Document失败==》本次爬虫失败!!!");
          }
      }
}

另外一个网站。分批处理

java 复制代码
/***
 * @Description:
 * @Auther: lyonardo
 * @Date: 2021/09/24 11:08
 * @version : V1.0
 */
@Slf4j
@Component
public class CfwsHttpSpiderJob {
    @Autowired
    private DoCfwsHttpSpiderTask spiderTask;
    @Value("${partition_size}")
    private Integer partitionSize;
    @Scheduled(cron = "${punish_jobs_cron}")
    public void spiderCfws() {
        List<List<String>> subQueryConditionList = Lists.partition(QueryConditionConstant.queryConditionList, partitionSize);
        for (List<String> smallerList : subQueryConditionList) {
            try {
                spiderTask.doCfwsHttpSpider(smallerList);
            } catch (Exception e) {
                e.printStackTrace();
                log.info("爬虫失败");
            }
        }
    }
}
java 复制代码
/***
 * @Description:
 * @Auther: lyonardo
 * @Date: 2021/09/29 16:09
 * @version : V1.0
 */
@Slf4j
@Component
public class DoCfwsHttpSpiderTask implements Runnable{
    @Resource
    private AdministrativeDocumentsOriginalsMapper administrativeDocumentsOriginalsMapper;

    @Value("${admin_punish_filePath}")
    public String adminPunishFilePath;

    @Value("${ciphertext}")
    public String ciphertext ;

    @Value("${sleep_longtime}")
    public Integer sleepLongtime;

    private List<String> smallerList;

    /**
     * Runs one crawl round for the {@code smallerList} field via
     * {@link #doCfwsHttpSpider(List)}. Failures are logged with their cause instead of being
     * printed to the console; an interruption additionally restores the thread's interrupt flag.
     */
    @Override
    public void run() {
        try {
            log.info("run thread...");
            doCfwsHttpSpider(smallerList);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("爬虫失败", e);
        } catch (Exception e) {
            log.error("爬虫失败", e);
        }
    }

    /**
     * Crawls one batch of query conditions and then sleeps for {@code sleepLongtime} minutes.
     *
     * <p>The crawl and the sleep both happen while holding this object's monitor, so
     * concurrent callers are processed one after another with the pause in between.
     *
     * @param smallerList the batch of query conditions to crawl
     * @throws Exception if the crawl fails or the sleep is interrupted
     */
    public void doCfwsHttpSpider(List<String> smallerList) throws Exception {
        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        synchronized (this) {
            doSpiderCfws(smallerList);
            log.info("休眠"+sleepLongtime+"分钟==》"+LocalDateTime.now().format(formatter));
            // Multiply as long: int arithmetic would overflow for large minute values.
            Thread.sleep(sleepLongtime * 60L * 1000L);
            log.info(sleepLongtime+"分钟后wake up==》"+LocalDateTime.now().format(formatter));
        }
    }

    /**
     * Crawls one batch of query conditions and upserts the documents found.
     *
     * <p>For every query response, each row's {@code rowkey} is used to request the document
     * detail; the detail's PDF payload is saved under {@code adminPunishFilePath} and a record
     * is collected. All collected records are upserted in one batch at the end.
     *
     * <p>"爬虫失败" is logged only when the query step produced no responses; previously it
     * was logged unconditionally, even after a successful upsert.
     *
     * @param smallerList the batch of query conditions passed to {@code resultList}
     */
    public void doSpiderCfws(List<String> smallerList) {
        List<String> resultStringList = resultList(smallerList);
        if (!CollectionUtils.isNotEmpty(resultStringList)) {
            log.info("爬虫失败");
            return;
        }
        List<AdministrativeDocumentsOriginals> list = new ArrayList<>();
        for (String x : resultStringList) {
            log.info("result==>{}", x);
            JSONArray rows = extractRows(x);
            if (null == rows) {
                continue;
            }
            for (int i = 0; i < rows.size(); i++) {
                String rowkey = rows.getJSONObject(i).getString("rowkey");
                AdministrativeDocumentsOriginals document = fetchDocument(rowkey);
                if (null != document) {
                    list.add(document);
                }
            }
        }
        try {
            log.info("list==>{}", list);
            log.info("batchUpsert:==>{}", administrativeDocumentsOriginalsMapper.batchUpsert(list));
        } catch (Exception e) {
            log.error("入库异常==》爬虫失败", e);
        }
    }

    /**
     * Navigates {@code result.queryResult.resultList} in a query response.
     *
     * @param x raw JSON response, may be {@code null}
     * @return the row array, or {@code null} when any level of the path is missing
     */
    private JSONArray extractRows(String x) {
        if (null == x) {
            return null;
        }
        JSONObject jsonQueryDocResult = JSONObject.parseObject(x);
        if (null == jsonQueryDocResult) {
            return null;
        }
        JSONObject result = jsonQueryDocResult.getJSONObject("result");
        if (null == result) {
            return null;
        }
        JSONObject queryResult = result.getJSONObject("queryResult");
        if (null == queryResult) {
            return null;
        }
        JSONArray rows = queryResult.getJSONArray("resultList");
        if (null != rows) {
            log.info("resultCount==>{}", queryResult.getInteger("resultCount"));
        }
        return rows;
    }

    /**
     * Requests the detail of one document, stores its PDF and builds the record.
     *
     * <p>Fields read from the detail's {@code result} object: {@code i0} (used as title and
     * file name), {@code i7} (PDF payload), {@code i3} (punishment authority), {@code i1}
     * (date of punishment), {@code i4} (type of punishment).
     *
     * @param rowkey row key taken from the query response
     * @return the record, or {@code null} when the response is unusable, the file check
     *         {@code FileUtil.isCreateNewFile} returns false, or PDF handling fails
     */
    private AdministrativeDocumentsOriginals fetchDocument(String rowkey) {
        String getDocResult = HttpUtil.post(CfwsConstant.httpGetDocUrl, buildGetDocParamMap(rowkey));
        if (!StringUtils.isNotEmpty(getDocResult)) {
            return null;
        }
        // Parse once and reuse instead of parsing the same payload twice.
        JSONObject response = JSONObject.parseObject(getDocResult);
        JSONObject jsonObject = null == response ? null : response.getJSONObject("result");
        if (null == jsonObject) {
            return null;
        }
        String i0 = jsonObject.getString("i0");
        log.info("i0 before==>{}", i0);
        String i7 = jsonObject.getString("i7");
        if (!StringUtils.isNotEmpty(i0) || !StringUtils.isNotEmpty(i7)) {
            return null;
        }
        try {
            // presumably spaces are '+' characters lost in transport of the encoded PDF — confirm
            String pdfPayload = i7.replace(" ", "+");
            FileUtil.createDirs(adminPunishFilePath);
            String allFilePath = adminPunishFilePath + i0 + ".pdf";
            if (!FileUtil.isCreateNewFile(allFilePath)) {
                return null;
            }
            return AdministrativeDocumentsOriginals.build(
                    emptyIfNull(jsonObject.getString("i3")), i0,
                    PDFBoxUtil.readAndSavePdfStr(pdfPayload, allFilePath), allFilePath,
                    emptyIfNull(jsonObject.getString("i1")), emptyIfNull(jsonObject.getString("i4")), 1);
        } catch (IOException e) {
            log.error(i0 + ",PDF处理失败,此次爬虫失败!!!", e);
            return null;
        }
    }

    /** Returns {@code value}, or the empty string when it is {@code null}. */
    private static String emptyIfNull(String value) {
        return null == value ? "" : value;
    }
    ...
}

还有另外几个网站。

JDK等源码也有很多深嵌套,是if嵌套还是if反向判断throw new Exception,由团队代码风格和任务紧急度决定。

爬取数据最棘手的,一是面向监狱编程,二是IP池中IP的收集,三是加密破解,四是限流。

问题:前端防爬机制如何处理?

java 复制代码
/**
 * Builds the request "ciphertext" token.
 *
 * The current timestamp is encrypted with DES3 using a random 24-character
 * salt as key and today's date (yyyyMMdd) as IV; salt + IV + encrypted text
 * are concatenated and converted by strTobinary.
 *
 * @returns {string} the encoded token
 */
function cipher() {
	var now = new Date();
	var timestamp = now.getTime().toString();
	var salt = random(24);

	// Zero-pad month and day to two digits.
	var monthNumber = now.getMonth() + 1;
	var dayNumber = now.getDate();
	var month = monthNumber < 10 ? "0" + monthNumber : monthNumber.toString();
	var day = dayNumber < 10 ? "0" + dayNumber : dayNumber.toString();
	var iv = now.getFullYear().toString() + month + day;

	var enc = DES3.encrypt(timestamp, salt, iv).toString();
	return strTobinary(salt + iv + enc);
}
/**
 * Converts every character of a string to the binary representation of its
 * char code and joins the pieces with single spaces.
 *
 * @param {string} str text to convert
 * @returns {string} space-separated binary char codes ("" for an empty string)
 */
function strTobinary(str) {
	return str.split("").map(function (ch) {
		return ch.charCodeAt(0).toString(2);
	}).join(" ");
}
/**
 * Generates a random string drawn from [0-9a-zA-Z].
 *
 * Uses Math.floor(Math.random() * length) so every character is equally
 * likely; the previous Math.round(... * (length - 1)) form picked the first
 * and last characters only half as often as the others.
 *
 * @param {number} size number of characters to generate
 * @returns {string} random string of the requested length
 */
function random (size){
	var str = "",
	arr = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
	for(var i=0; i<size; i++){
		str += arr[Math.floor(Math.random() * arr.length)];
	}
	return str;
}
/**
 * Post-processes the response of an anti-crawl protected API call.
 *
 * Behaviour by response code:
 *  - "1" with a secretKey: obj.result is decrypted with DES3 using that key,
 *    parsed as JSON and stored back on obj, which is returned.
 *  - "1" with a null secretKey: obj is returned unchanged.
 *  - "-11": the original url and parameters are saved to localStorage and a
 *    captcha dialog is opened; confirming it re-sends the request with the
 *    captcha value and a fresh ciphertext.
 *  - "-12": an "access restricted" message is shown.
 * Nothing is returned for "-11", "-12" or any other code.
 *
 * @param obj response payload; parsed with $.parseJSON when possible
 * @returns the processed response object for code "1", otherwise undefined
 */
function getData(obj){
	try{ 
		obj = $.parseJSON(obj)
	}catch(e){ 
		// Parse failure is ignored on purpose; obj is then used as received.
	}
	// Success with an encrypted payload: decrypt and parse the result.
	if(obj.code =="1" && obj.secretKey){
		var data = DES3.decrypt(obj.result, obj.secretKey);
		obj.result= $.parseJSON(data);
		return obj;
	}
	// Success without a key: the result is passed through untouched.
	if(obj.code =="1" && obj.secretKey==null){
		return obj;
	}
	// Captcha challenge: remember the blocked request so it can be replayed.
	if(obj.code =="-11"){
		window.localStorage.setItem("url",obj.result.url);
		window.localStorage.setItem("parameterMap",JSON.stringify(obj.result.parameterMap));
		layer.open({
				title:'访问受限',
				content: "<div class='popMain'><p><span>输入内容</span><input id='yzm' type='text' placeholder='请输入验证码'/></p><p><span>验证码</span><a style='display:inline-block;width:70%;text-align: left;'><img title='点击更换验证码' id='imgcode' src='/captcha/captchaImage?key=antitheftImageCode'  /><span class='yzmtip'>验证码不能为空</span></a></p></div>",
				btn:'确定',
				shade:0.8,
				btnAlign:'c',
				area:['400px','200px'],
				// Confirm button: validate the captcha input and replay the request.
				yes:function(){
					var ciphertext = cipher();
					var yzm = $("#yzm").val();
					// An empty captcha shows the hint and keeps the dialog open.
					if(yzm==null || $.trim(yzm)=="" || yzm=="undefined"){
						$('.yzmtip').show();
						return false;
					}
					// Restore the saved request and attach captcha + ciphertext.
					var url = window.localStorage.getItem("url");
					var parameterMap = window.localStorage.getItem("parameterMap");
					parameterMap=$.parseJSON(parameterMap);
					parameterMap.yzm=yzm;
					parameterMap.ciphertext=ciphertext;
					$.ajax({
				        type: "post",
				        async: true,
				        url: url,
				        // NOTE(review): jQuery documents this option as `dataType`; the
				        // lowercase key is presumably ignored — confirm.
				        datatype:"json",
				        data: parameterMap,
				        success: function(data) {
				        	try{ 
				        		data = $.parseJSON(data)
				        	}catch(e){ 
				        		// Parse failure is ignored; data is used as received.
				        	}
				        	// Any code other than -11 (or no code): close the dialog and reload.
				        	if(data.code != -11 || !data.code){
				        		//window.localStorage.setItem("result",JSON.stringify(data));
				        		layer.closeAll();
				        		window.location.reload();
				        		return ;
				        	}
				        	// Still challenged: show the server's description, if any.
				        	if(data.code == -11 && data.description!=null && data.description!=""){
				        		$(".yzmtip").html("<font color='red'>"+data.description+"</font>");
					        	$('.yzmtip').show();
				        	}
				        	// Load a new captcha image; the random parameter changes the URL each time.
				        	$("#imgcode").attr("src", "/captcha/captchaImage?key=antitheftImageCode&random="+Math.random());
				        },
				        error:function(data){
				        	// Request errors are not handled.
				        }
				    })
				}
			})
	
	}
	// Access restricted: only a message is shown.
	if(obj.code =="-12"){
		layer.msg("访问受限");
	}
}
/**
 * On DOM ready, registers a delegated click handler that reloads the
 * captcha image (#imgcode) with a fresh random query parameter.
 */
$(function(){
	var refreshCaptcha = function(){
		var captchaUrl = "/captcha/captchaImage?key=antitheftImageCode&random="+Math.random();
		$("#imgcode").attr("src", captchaUrl);
	};
	$(document).on('click','#imgcode',refreshCaptcha);
})

现有后端破解代码

java 复制代码
/***
 * @Description: DES / Triple-DES helpers written while trying to reproduce the target
 *               site's request cipher on the back end.
 *
 *               <p>Only JDK classes are used: the padding is spelled {@code PKCS5Padding}
 *               (the name the stock JCE provider accepts; for 8-byte blocks it pads
 *               exactly like PKCS#7) and Base64 / hex conversion is done with the JDK.
 * @Auther: lyonardo
 * @Date: 2021/10/22 16:07
 * @version : V1.0
 */
public class CryptoUtil {
    private static final String CODE_TYPE = "UTF-8";
    public static final String transformation = "DES/ECB/NOPadding";

    /** Explicit form of what {@code Cipher.getInstance("DES")} resolves to on the stock JCE provider. */
    private static final String DES_TRANSFORMATION = "DES/ECB/PKCS5Padding";

    // NOTE(review): hard-coded default key; consider moving it to configuration.
    private static final String KEY = "password111111";

    /**
     * Encrypts {@code datasource} with single DES (ECB) and returns it Base64 encoded.
     *
     * @param KEY        key text; its first 8 UTF-8 bytes are used as the DES key
     * @param datasource plain text, encoded as UTF-8
     * @return Base64 cipher text, or {@code null} if encryption fails
     */
    public String encode(String KEY, String datasource) {
        return desEncryptToBase64(KEY, datasource);
    }

    /**
     * Encrypts {@code datasource} with single DES (ECB) using the built-in default key.
     *
     * @param datasource plain text, encoded as UTF-8
     * @return Base64 cipher text, or {@code null} if encryption fails
     */
    public static String encode(String datasource) {
        return desEncryptToBase64(KEY, datasource);
    }

    /**
     * Encrypts {@code text} with Triple DES and returns it Base64 encoded.
     *
     * <p>When {@code vector} is non-empty it is used as the CBC initialisation vector
     * (it must be exactly 8 bytes in UTF-8); previously the parameter was silently
     * ignored. When it is {@code null} or empty, ECB mode is used as before.
     * NOTE(review): CBC is assumed because the caller supplies an IV; the front-end
     * {@code DES3} implementation is not visible here — confirm its mode and padding.
     *
     * @param key    key text; its first 24 UTF-8 bytes are used as the DESede key
     * @param text   plain text, encoded as UTF-8
     * @param vector initialisation vector text, or {@code null}/empty for ECB
     * @return Base64 cipher text, or {@code null} if encryption fails
     * @throws Exception declared for backward compatibility; failures yield {@code null}
     */
    public static String encrypt(String key, String text, String vector) throws Exception {
        try {
            byte[] src = text.getBytes(CODE_TYPE);
            DESedeKeySpec spec = new DESedeKeySpec(key.getBytes(CODE_TYPE));
            SecretKey secretKey = SecretKeyFactory.getInstance("DESede").generateSecret(spec);
            Cipher cipher;
            if (vector == null || vector.isEmpty()) {
                cipher = Cipher.getInstance("DESede/ECB/PKCS5Padding");
                cipher.init(Cipher.ENCRYPT_MODE, secretKey);
            } else {
                cipher = Cipher.getInstance("DESede/CBC/PKCS5Padding");
                cipher.init(Cipher.ENCRYPT_MODE, secretKey,
                        new javax.crypto.spec.IvParameterSpec(vector.getBytes(CODE_TYPE)));
            }
            return java.util.Base64.getEncoder().encodeToString(cipher.doFinal(src));
        } catch (Exception e) {
            // Keep the cause visible instead of printing a bare "error".
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Encrypts {@code data} with single DES (ECB) and returns upper-case hex.
     *
     * @param data plain text; {@code null} or empty yields {@code null}
     * @param key  key text; its first 8 UTF-8 bytes are used as the DES key
     * @return hex cipher text, {@code null} if no cipher could be created, or the
     *         unmodified {@code data} if encryption itself throws (legacy behaviour)
     */
    public static String encode1(String data, String key) {
        if (data == null || data.isEmpty()) {
            return null;
        }
        try {
            Cipher cipher = getCipher(Cipher.ENCRYPT_MODE, key);
            if (cipher == null) {
                return null;
            }
            return byteToHexString(cipher.doFinal(data.getBytes(CODE_TYPE)));
        } catch (Exception e) {
            e.printStackTrace();
            return data;
        }
    }

    /**
     * Decrypts hex text produced by {@link #encode1(String, String)}.
     *
     * @param data hex cipher text; {@code null} or empty yields {@code null}
     * @param key  key text; its first 8 UTF-8 bytes are used as the DES key
     * @return plain text, {@code null} if no cipher could be created, or the unmodified
     *         {@code data} if decoding or decryption throws (legacy behaviour)
     * @throws Exception declared for backward compatibility
     */
    public static String decode1(String data, String key) throws Exception {
        if (data == null || data.isEmpty()) {
            return null;
        }
        try {
            byte[] b = hexStringToBytes(data);
            Cipher cipher = getCipher(Cipher.DECRYPT_MODE, key);
            if (cipher == null) {
                return null;
            }
            return new String(cipher.doFinal(b), CODE_TYPE);
        } catch (Exception e) {
            e.printStackTrace();
            return data;
        }
    }

    /**
     * Converts bytes to an upper-case hex string, two digits per byte.
     *
     * @param bytes bytes to convert
     * @return hex representation
     */
    public static String byteToHexString(byte[] bytes) {
        StringBuilder sb = new StringBuilder(bytes.length * 2);
        for (byte value : bytes) {
            String hex = Integer.toHexString(0xFF & value);
            if (hex.length() < 2) {
                sb.append(0);
            }
            sb.append(hex.toUpperCase());
        }
        return sb.toString();
    }

    /** Shared implementation of both {@code encode} overloads: DES/ECB, Base64 output. */
    private static String desEncryptToBase64(String key, String datasource) {
        try {
            DESKeySpec desKey = new DESKeySpec(key.getBytes(CODE_TYPE));
            SecretKey securekey = SecretKeyFactory.getInstance("DES").generateSecret(desKey);
            Cipher cipher = Cipher.getInstance(DES_TRANSFORMATION);
            cipher.init(Cipher.ENCRYPT_MODE, securekey);
            // Encode the plain text with a fixed charset rather than the platform default.
            return java.util.Base64.getEncoder().encodeToString(cipher.doFinal(datasource.getBytes(CODE_TYPE)));
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Creates a single-DES cipher for {@code encode1}/{@code decode1}.
     *
     * <p>The key is a DES key, so the cipher must be DES as well; the earlier
     * {@code DESede} transformation could never be initialised with it.
     *
     * @return the initialised cipher, or {@code null} if it cannot be created
     */
    private static Cipher getCipher(int opmode, String key) {
        try {
            DESKeySpec dks = new DESKeySpec(key.getBytes(CODE_TYPE));
            Key secretKey = SecretKeyFactory.getInstance("DES").generateSecret(dks);
            Cipher cipher = Cipher.getInstance(DES_TRANSFORMATION);
            cipher.init(opmode, secretKey);
            return cipher;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Decodes a hex string (either case) into bytes; rejects odd length or non-hex characters. */
    private static byte[] hexStringToBytes(String hex) {
        int length = hex.length();
        if ((length & 1) != 0) {
            throw new IllegalArgumentException("Hex string must have an even length: " + length);
        }
        byte[] out = new byte[length / 2];
        for (int i = 0; i < length; i += 2) {
            int high = Character.digit(hex.charAt(i), 16);
            int low = Character.digit(hex.charAt(i + 1), 16);
            if (high < 0 || low < 0) {
                throw new IllegalArgumentException("Invalid hex character at index " + i);
            }
            out[i / 2] = (byte) ((high << 4) | low);
        }
        return out;
    }
}

当时用js调试和改造后发现:前端和后端使用同样的时间戳和同样的盐值字符串,得到的加密结果却不一样。

TODO:解密部分待完成。

相关推荐
转调几秒前
每日一练:地下城游戏
开发语言·c++·算法·leetcode
Java探秘者1 分钟前
Maven下载、安装与环境配置详解:从零开始搭建高效Java开发环境
java·开发语言·数据库·spring boot·spring cloud·maven·idea
攸攸太上2 分钟前
Spring Gateway学习
java·后端·学习·spring·微服务·gateway
2301_786964367 分钟前
3、练习常用的HBase Shell命令+HBase 常用的Java API 及应用实例
java·大数据·数据库·分布式·hbase
2303_8120444610 分钟前
Bean,看到P188没看了与maven
java·开发语言
苹果醋311 分钟前
大模型实战--FastChat一行代码实现部署和各个组件详解
java·运维·spring boot·mysql·nginx
秋夫人13 分钟前
idea 同一个项目不同模块如何设置不同的jdk版本
java·开发语言·intellij-idea
m0_6640470218 分钟前
数字化采购管理革新:全过程数字化采购管理平台的架构与实施
java·招投标系统源码
不穿格子衬衫29 分钟前
常用排序算法(下)
c语言·开发语言·数据结构·算法·排序算法·八大排序
萧鼎35 分钟前
Python调试技巧:高效定位与修复问题
服务器·开发语言·python