爬虫学习日记第九篇(爬取seebug)

目标:https://www.seebug.org/vuldb/vulnerabilities

需求:爬取cve_id及影响组件

单线程

cookie是有时效的(过一段时间就不行了,大概半小时左右),但是并不需要登录(直接抓包拿到的请求头)

python 代码:
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
#直接添加这四行代码
from selenium.webdriver.chrome.options import Options

# Request headers captured from a live browser session (no login needed).
# NOTE(review): the Cookie below (__jsl_clearance_s etc.) is the site's
# anti-bot clearance token — per the author it expires after roughly half an
# hour, so refresh it from a fresh capture before each run.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsl_clearance_s=1702017676.384|0|ZDiM6js7b4c8curqpLu%2FluWgrQk%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


def onePort(i):
    """Fetch seebug page ssvid-<i>, extract the CVE id and affected component,
    and persist the component into the `vule_detail` MySQL table.

    Args:
        i: integer suffix of the seebug ssvid URL.

    Returns None. Pages without a CVE id or without a component are skipped.
    Raises whatever `requests` / xpath indexing raises (e.g. IndexError when
    the CVE column is missing) — the caller is expected to catch and log.
    """
    # One short-lived DB connection per call; the try/finally guarantees it
    # is closed even on the early returns below (the original leaked it).
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='test'
    )
    try:
        cursor = conn.cursor()

        # Send the request with the pre-captured headers; the cookie is the
        # anti-bot token and is required even though no login is involved.
        url = "https://www.seebug.org/vuldb/ssvid-" + str(i)
        response = requests.get(url, headers=headers)

        tree = etree.HTML(response.text)

        # Third "col-md-4" column holds the CVE link; [0] raises IndexError
        # when absent, which propagates to the caller's handler.
        cve = tree.xpath('//div[@class="col-md-4"][3]//a/text()')[0]

        if not cve.startswith('CVE'):
            return

        # The component entry may be absent: xpath returns [] and [0] raises.
        # Narrow to IndexError instead of a bare except that hid real bugs.
        try:
            zujian = tree.xpath(
                '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')[0]
        except IndexError:
            return

        # strip() covers spaces and newlines in one pass (the original
        # chained strip(' ')/strip('\n')/strip(' ')).
        cve = cve.strip()
        zujian = zujian.strip()

        query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
        cursor.execute(query, (zujian, cve))
        conn.commit()

        # Print the extracted pair for progress monitoring.
        print(cve, zujian)
    finally:
        conn.close()

# Sequentially crawl ssvid 1..99999; a failure on one id must not stop the run.
ssvid = 1
while ssvid < 100000:
    try:
        onePort(ssvid)
    except Exception as exc:
        # Log the failure and move on to the next id.
        print("发生了异常:", exc)
    ssvid += 1

经过反复调试,大概测出来开四个线程,每次操作sleep(1.5)结果大概是准确的。并将所有非正常没有测出来的结果加入到err列表中,之后再测一遍

多线程版本

python 代码:
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
from selenium.webdriver.chrome.options import Options

# Request headers captured from a live browser session (no login needed).
# NOTE(review): the Cookie (__jsl_clearance / __jsl_clearance_s) is the site's
# anti-bot clearance token — per the author it expires after roughly half an
# hour, so refresh it from a fresh capture before each run.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsluid_h=2a28fe38e039c4da1c96c5210fc7efe2; __jsl_clearance=1702018884.088|0|oBrD%2FgcZqnzC%2ByluySVPTWe2ppY%3D; __jsl_clearance_s=1702021353.773|0|RGEwoMzRAd4O927zrqdiZ%2BNiY0s%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


# Ids whose page failed to parse (most likely blocked / rate-limited);
# collected here so they can be re-crawled in a second pass.
err=[]

def onePort(i):
    """Worker task: fetch seebug page ssvid-<i>, extract the CVE id and the
    affected component, and persist the component into `vule_detail`.

    Ids whose page lacks the expected column (typically a blocked request)
    are appended to the module-level `err` list for a later retry pass.

    Args:
        i: integer suffix of the seebug ssvid URL.
    """
    # Throttle each worker (~1.5 s/request with 4 workers was empirically
    # found to keep results accurate against the site's rate limiting).
    sleep(1.5)
    # One short-lived DB connection per task; the try/finally guarantees it
    # is closed on every path — the original leaked a connection on each of
    # its early returns, which adds up fast under a thread pool.
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='test'
    )
    try:
        cursor = conn.cursor()

        url = "https://www.seebug.org/vuldb/ssvid-" + str(i)
        response = requests.get(url, headers=headers)

        tree = etree.HTML(response.text)

        # A page without the expected CVE column usually means the request
        # was blocked; remember the id so it can be re-crawled later.
        try:
            cve = tree.xpath('//div[@class="col-md-4"][3]//a/text()')[0]
        except Exception as e:
            print("发生了异常:", e, i)
            err.append(i)  # list.append is atomic, safe across workers
            return

        if not cve.startswith('CVE'):
            print("非CVE", i)
            return

        # The component entry may legitimately be absent on the page.
        try:
            zujian = tree.xpath(
                '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')[0]
        except IndexError:
            print("组件为空:", i)
            return

        # strip() covers spaces and newlines in one pass (the original
        # chained strip(' ')/strip('\n')/strip(' ')).
        cve = cve.strip()
        zujian = zujian.strip()

        query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
        cursor.execute(query, (zujian, cve))
        conn.commit()

        # Print the extracted pair for progress monitoring.
        print(cve, zujian)
    finally:
        conn.close()


# Fan the crawl out over 4 workers. Using the executor as a context manager
# guarantees shutdown() (i.e. waiting for all submitted tasks) even if the
# submission loop raises — the original called shutdown() unconditionally but
# would skip it on an exception.
with ThreadPoolExecutor(max_workers=4) as executor:
    # Submit every ssvid in 1..99999 to the pool.
    for i in range(1, 100000):
        executor.submit(onePort, i)

# Ids that need a second pass (collected by onePort's error handling).
print(err)

TODO:自动生成cookie,来对抗cookie的时效。要不然既限制速率,又限制时效,实在太伤了

应该会去扒这个项目:

https://github.com/seishinkouki/seebug_crawler

相关推荐
杜子不疼.3 小时前
Python 爬虫 + AI 总结:自动生成行业日报系统
人工智能·爬虫·python
ZC跨境爬虫1 天前
Scrapy多级请求实战:5sing伴奏网爬取踩坑与优化全记录(JSON提取+Xpath解析)
爬虫·scrapy·html·json
willhuo1 天前
基于Playwright的抖音网页自动化浏览器项目使用指南
爬虫·c#·.netcore·webview
-To be number.wan1 天前
Python爬取百度指数保姆级教程
爬虫·python
程序员老邢1 天前
【产品底稿 04】商助慧 V1.1 里程碑:爬虫入库 + MySQL + Milvus 全链路打通
java·爬虫·mysql·ai·springboot·milvus
ZC跨境爬虫2 天前
【爬虫实战对比】Requests vs Scrapy 笔趣阁小说爬虫,从单线程到高效并发的全方位升级
前端·爬虫·scrapy·html
ZC跨境爬虫2 天前
【Scrapy实战避坑】5sing网站爬虫从0到1,踩遍动态渲染、正则匹配全坑(附完整解决方案)
爬虫·scrapy
ZC跨境爬虫2 天前
Scrapy实战爬取5sing网站:Pipeline优化+全流程踩坑复盘,从报错到数据落地
前端·爬虫·python·scrapy
码农很忙2 天前
爬虫与反爬虫攻防战:技术解析与实战指南
爬虫
大數據精準工單獲取2 天前
【数据抓取】 编写爬虫基本请求:使用爬虫框架发送 HTTP 请求,获取网页内容
爬虫·网络协议·http