爬虫学习日记第九篇(爬取seebug)

目标:https://www.seebug.org/vuldb/vulnerabilities

需求:爬取cve_id及影响组件

单线程

cookie 是有时效的(大约半小时后就会失效),但并不需要登录,直接使用抓包得到的请求头即可。

python 复制代码
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
# NOTE: Options (like the other selenium imports above) is not used by this script.
from selenium.webdriver.chrome.options import Options

# Request headers sent with every page fetch. Per the author's notes these were
# copied from a captured browser request; no login is required.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    # NOTE(review): "br" is advertised here; requests only decodes brotli
    # responses when a brotli package is installed -- confirm the environment.
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # Hard-coded session cookie; per the author's notes it stops working after
    # roughly half an hour.
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsl_clearance_s=1702017676.384|0|ZDiM6js7b4c8curqpLu%2FluWgrQk%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


def onePort(i):
    """Fetch one Seebug vulnerability page and store its affected component.

    Downloads ``https://www.seebug.org/vuldb/ssvid-<i>``, reads the CVE id and
    the affected component from the page, and writes the component into
    ``vule_detail.fingerprint`` for the row whose ``cve_id`` equals that CVE
    id. The pair is printed after a successful commit.

    Args:
        i: Numeric SSV id appended to the detail-page URL.

    Returns:
        None. Returns early without touching the database when the page's
        identifier does not start with ``CVE`` or when no component link is
        present.

    Raises:
        IndexError: If the third ``col-md-4`` column contains no link text.
            This is left to propagate so the calling loop can report it.
        Exception: Errors from requests, lxml or mysql.connector propagate
            unchanged.
    """
    url = "https://www.seebug.org/vuldb/ssvid-" + str(i)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    tree = etree.HTML(response.text)

    cve = tree.xpath(
        '//div[@class="col-md-4"][3]//a/text()')[0]
    # Strip before the prefix check so surrounding whitespace in the link text
    # does not make a real CVE id look like a non-CVE entry.
    cve = cve.strip()
    if not cve.startswith('CVE'):
        return

    component_nodes = tree.xpath(
        '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')
    if not component_nodes:
        return
    # strip() removes every leading/trailing space and newline in one pass.
    zujian = component_nodes[0].strip()

    # Connect only once there is something to write, and always release the
    # connection so repeated calls do not pile up open MySQL sessions.
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='test'
    )
    try:
        cursor = conn.cursor()
        try:
            query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
            cursor.execute(query, (zujian, cve))
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    print(cve, zujian)

# Walk the SSV ids one by one; a failure on a single page is reported and the
# crawl moves on to the next id.
for ssv_id in range(1, 100000):
    try:
        onePort(ssv_id)
    except Exception as exc:
        print("发生了异常:", exc)

经过反复调试,发现开四个线程、每次请求前 sleep(1.5) 秒时,结果基本准确。同时把所有因异常而没有解析出结果的编号加入 err 列表,之后再对这些编号重新爬取一遍。

多线程版本

python 复制代码
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
from selenium.webdriver.chrome.options import Options

# Request headers sent with every page fetch. Per the author's notes these were
# copied from a captured browser request; no login is required.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    # NOTE(review): "br" is advertised here; requests only decodes brotli
    # responses when a brotli package is installed -- confirm the environment.
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # Hard-coded session cookie; per the author's notes it stops working after
    # roughly half an hour.
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsluid_h=2a28fe38e039c4da1c96c5210fc7efe2; __jsl_clearance=1702018884.088|0|oBrD%2FgcZqnzC%2ByluySVPTWe2ppY%3D; __jsl_clearance_s=1702021353.773|0|RGEwoMzRAd4O927zrqdiZ%2BNiY0s%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


# SSV ids whose page could not be fetched, parsed or stored; printed at the end
# of the run so they can be crawled again.
err = []


def onePort(i):
    """Fetch one Seebug vulnerability page and store its affected component.

    Sleeps 1.5 seconds, downloads
    ``https://www.seebug.org/vuldb/ssvid-<i>``, reads the CVE id and the
    affected component from the page, and writes the component into
    ``vule_detail.fingerprint`` for the row whose ``cve_id`` equals that CVE
    id. The pair is printed after a successful commit.

    The function is submitted to a thread pool whose futures may never be
    inspected, so it does not let failures escape: any fetch, parse or
    database error is printed and ``i`` is appended to the module-level
    ``err`` list.

    Args:
        i: Numeric SSV id appended to the detail-page URL.

    Returns:
        None in every case. Nothing is written when the identifier does not
        start with ``CVE`` or when no component link is present; both cases
        are printed but not added to ``err``.
    """
    # Pause before each request (per-task rate limiting).
    sleep(1.5)

    url = "https://www.seebug.org/vuldb/ssvid-" + str(i)
    try:
        # A timeout keeps one stalled connection from blocking a worker thread
        # indefinitely.
        response = requests.get(url, headers=headers, timeout=30)
        tree = etree.HTML(response.text)
        cve = tree.xpath(
            '//div[@class="col-md-4"][3]//a/text()')[0]
    except Exception as e:
        # Covers network errors, an unparsable page and a missing CVE link;
        # all of them mean this id needs to be retried.
        print("发生了异常:", e, i)
        err.append(i)
        return

    # Strip before the prefix check so surrounding whitespace in the link text
    # does not make a real CVE id look like a non-CVE entry.
    cve = cve.strip()
    if not cve.startswith('CVE'):
        print("非CVE", i)
        return

    component_nodes = tree.xpath(
        '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')
    if not component_nodes:
        print("组件为空:", i)
        return
    # strip() removes every leading/trailing space and newline in one pass.
    zujian = component_nodes[0].strip()

    try:
        # Each task uses its own connection, opened only when there is a row
        # to write and always closed so open sessions do not accumulate.
        conn = mysql.connector.connect(
            host='127.0.0.1',
            user='root',
            password='123456',
            database='test'
        )
        try:
            cursor = conn.cursor()
            try:
                query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
                cursor.execute(query, (zujian, cve))
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()
    except mysql.connector.Error as e:
        # The value was scraped but not stored; record the id for a retry.
        print("发生了异常:", e, i)
        err.append(i)
        return

    print(cve, zujian)


# Run the crawl on a pool of four worker threads. Leaving the with-block shuts
# the pool down after every submitted task has finished.
with ThreadPoolExecutor(max_workers=4) as executor:
    # Keep each future together with its SSV id: an exception raised inside a
    # task is stored on the future and is lost unless the future is inspected.
    futures = {executor.submit(onePort, i): i for i in range(1, 100000)}

    for future in concurrent.futures.as_completed(futures):
        exc = future.exception()
        if exc is not None:
            failed_id = futures[future]
            print("发生了异常:", exc, failed_id)
            err.append(failed_id)

# Ids that need to be crawled again.
print(err)

TODO:自动生成cookie,来对抗cookie的时效。要不然既限制速率,又限制时效,实在太伤了

应该会去扒这个项目:

https://github.com/seishinkouki/seebug_crawler

相关推荐
镜花照无眠4 小时前
Python爬虫使用实例-mdrama
开发语言·爬虫·python
小白学大数据11 小时前
User-Agent在WebMagic爬虫中的重要性
开发语言·爬虫·http
大神薯条老师21 小时前
Python从入门到高手4.3节-掌握跳转控制语句
后端·爬虫·python·深度学习·机器学习·数据分析
wdxylb1 天前
Pyhton爬虫使用Selenium实现浏览器自动化操作抓取网页
爬虫·selenium·测试工具
菜鸡中的奋斗鸡→挣扎鸡2 天前
初始爬虫11
开发语言·爬虫·python
凡人的AI工具箱2 天前
15分钟学 Python 第35天 :Python 爬虫入门(一)
开发语言·数据结构·人工智能·后端·爬虫·python
新缸中之脑2 天前
ScrapeGraphAI 大模型增强的网络爬虫
爬虫
易辰君2 天前
python爬虫 - 初识爬虫
开发语言·爬虫·python
人生の三重奏2 天前
爬虫——同步与异步加载
爬虫·jsonpath·同步与异步·腾讯新闻
人生の三重奏2 天前
爬虫——爬取小音乐网站
爬虫