爬虫学习日记第九篇(爬取seebug)

目标:https://www.seebug.org/vuldb/vulnerabilities

需求:爬取cve_id及影响组件

单线程

cookie是有时效的(过一段时间就不行了,大概半小时左右),但是并不需要登录(直接抓包拿到的请求头)

python 复制代码
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
#直接添加这四行代码
from selenium.webdriver.chrome.options import Options

# 设置请求标头
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsl_clearance_s=1702017676.384|0|ZDiM6js7b4c8curqpLu%2FluWgrQk%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


def onePort(i):
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='test'
    )
    cursor = conn.cursor()

    # 发送带有请求标头的 HTTP 请求
    url = "https://www.seebug.org/vuldb/ssvid-"+str(i)
    response = requests.get(url, headers=headers)

    page_text = response.text

    tree = etree.HTML(page_text)

    cve = tree.xpath(
        '//div[@class="col-md-4"][3]//a/text()')[0]

    if not cve.startswith('CVE'):
        return

    try:
        zujian = tree.xpath(
            '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')[0]
    except:
        return

    cve = cve.strip(' ')
    zujian = zujian.strip(' ')
    zujian = zujian.strip('\n')
    zujian = zujian.strip(' ')

    query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
    cursor.execute(query, (zujian, cve))
    conn.commit()

    # 打印数据
    print(cve,zujian)

for i in range(1,100000):

    try:
        onePort(i)
    except Exception as e:
        # 异常发生时的处理代码
        print("发生了异常:", e)

经过反复调试,大概测出来开四个线程,每次操作sleep(1.5)结果大概是准确的。并将所有非正常没有测出来的结果加入到err列表中,之后再测一遍

多线程版本

python 复制代码
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
from selenium.webdriver.chrome.options import Options

# 设置请求标头
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsluid_h=2a28fe38e039c4da1c96c5210fc7efe2; __jsl_clearance=1702018884.088|0|oBrD%2FgcZqnzC%2ByluySVPTWe2ppY%3D; __jsl_clearance_s=1702021353.773|0|RGEwoMzRAd4O927zrqdiZ%2BNiY0s%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


err=[]

def onePort(i):
    sleep(1.5)
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='test'
    )
    cursor = conn.cursor()

    # 发送带有请求标头的 HTTP 请求
    url = "https://www.seebug.org/vuldb/ssvid-" + str(i)
    response = requests.get(url, headers=headers)

    page_text = response.text

    tree = etree.HTML(page_text)
    #print(page_text)
    try:
        cve = tree.xpath(
            '//div[@class="col-md-4"][3]//a/text()')[0]
    except Exception as e:
    # 异常发生时的处理代码
            print("发生了异常:", e,i)
            err.append(i)
            return

    if not cve.startswith('CVE'):
        print("非CVE",i)
        return

    try:
        zujian = tree.xpath(
            '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')[0]
    except Exception as e:
        print("组件为空:", i)
        return

    cve = cve.strip(' ')
    zujian = zujian.strip(' ')
    zujian = zujian.strip('\n')
    zujian = zujian.strip(' ')

    query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
    cursor.execute(query, (zujian, cve))
    conn.commit()

    # 打印数据
    print(cve, zujian)


# 创建 ThreadPoolExecutor 对象
executor = ThreadPoolExecutor(max_workers=4)

# 提交任务给线程池执行
for i in range(1,100000):
    executor.submit(onePort, i)

# 关闭线程池
executor.shutdown()

print(err)

TODO:自动生成cookie,来对抗cookie的时效。要不然既限制速率,又限制时效,实在太伤了

应该会去扒这个项目:

https://github.com/seishinkouki/seebug_crawler

相关推荐
在放️25 分钟前
Python 爬虫 · PyQuery 模块基础
爬虫·python
数据知道3 小时前
指纹浏览器本地存储“孤岛化”:IndexedDB、LocalStorage、SessionStorage 的安全隔离
爬虫·安全·数据采集·指纹浏览器
小白学大数据1 天前
线上故障急救:依托 OpenClaw 日志排查 403 和 503 问题
爬虫·python·selenium·数据分析
有味道的男人1 天前
利用爬虫获取中国制造网商品详情:高效采集完整方案
爬虫·制造
anew___1 天前
2026年Python爬虫技术完全指南:从入门到实战
开发语言·爬虫·python
深蓝电商API1 天前
无头浏览器性能优化:内存占用从2GB降到200MB
爬虫·性能优化
深蓝电商API1 天前
CDP协议深度解析:不通过WebDriver直接操控浏览器
爬虫
北极星日淘1 天前
Python代理池动态适配日淘爬虫|解决高频抓取IP封禁终极方案(含完整源码)
爬虫·python·tcp/ip
赵大大宝1 天前
Selenium 从入门到精通:自动化测试与爬虫实战全攻略
爬虫·selenium·测试工具
北极星日淘2 天前
Python爬虫断点续爬实战|基于Redis实现日淘商品增量抓取(解决重启全量重爬问题)
redis·爬虫·python