爬虫学习日记第九篇(爬取seebug)

目标:https://www.seebug.org/vuldb/vulnerabilities

需求:爬取cve_id及影响组件

单线程版本

cookie 是有时效的(大约半小时后就会失效),但并不需要登录:下面的请求头是直接抓包得到的。

python 复制代码
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
# Selenium Chrome options import (not referenced by the code shown below)
from selenium.webdriver.chrome.options import Options

# Request headers sent with every page fetch.
# NOTE: per the accompanying notes, these were copied from a captured browser
# request and the Cookie value stops working after roughly half an hour.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsl_clearance_s=1702017676.384|0|ZDiM6js7b4c8curqpLu%2FluWgrQk%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


def onePort(i):
    """Scrape one Seebug vulnerability page and store its affected component.

    Fetches ``https://www.seebug.org/vuldb/ssvid-<i>``, reads the CVE id and
    the affected-component name from the page, and writes the component into
    ``vule_detail.fingerprint`` for the row whose ``cve_id`` matches.

    Args:
        i: Numeric ssvid of the page to fetch.

    Returns:
        None. Pages whose identifier does not start with ``CVE`` and pages
        without a component entry are skipped without touching the database.

    Raises:
        IndexError: If the page has no link in the CVE column.
        Exception: Errors from ``requests`` or ``mysql.connector`` propagate
            unchanged to the caller.
    """
    # Send the HTTP request with the captured browser headers. The timeout
    # keeps one stalled connection from hanging the whole crawl.
    url = "https://www.seebug.org/vuldb/ssvid-" + str(i)
    response = requests.get(url, headers=headers, timeout=10)

    tree = etree.HTML(response.text)

    # A missing link raises IndexError here, exactly as the lookup always did.
    # Strip before the prefix check so surrounding whitespace cannot hide a
    # valid CVE id.
    cve = tree.xpath('//div[@class="col-md-4"][3]//a/text()')[0].strip()
    if not cve.startswith('CVE'):
        return

    components = tree.xpath(
        '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')
    if not components:
        # No affected component listed on this page: nothing to store.
        return
    zujian = components[0].strip()

    # Connect only once there is something to write, and always release the
    # connection and cursor, even when the UPDATE fails.
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='test'
    )
    try:
        cursor = conn.cursor()
        try:
            query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
            cursor.execute(query, (zujian, cve))
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Echo what was stored.
    print(cve, zujian)

# Visit every candidate ssvid in order. A failure on one page is reported
# and does not stop the crawl.
for i in range(1, 100000):
    try:
        onePort(i)
    except Exception as exc:
        # Report the failure and move on to the next id.
        print("发生了异常:", exc)

经过反复调试,大致测出:开四个线程、每次请求前 sleep(1.5) 秒时,结果基本准确。所有未能正常解析出结果的编号都会加入 err 列表,之后再重新爬取一遍。

多线程版本

python 复制代码
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
from selenium.webdriver.chrome.options import Options

# Request headers sent with every page fetch.
# NOTE: per the accompanying notes, these were copied from a captured browser
# request and the Cookie value stops working after roughly half an hour.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsluid_h=2a28fe38e039c4da1c96c5210fc7efe2; __jsl_clearance=1702018884.088|0|oBrD%2FgcZqnzC%2ByluySVPTWe2ppY%3D; __jsl_clearance_s=1702021353.773|0|RGEwoMzRAd4O927zrqdiZ%2BNiY0s%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


# ssvids whose pages could not be parsed; printed at the end of the run so
# they can be crawled again.
err=[]

def onePort(i):
    """Scrape one Seebug vulnerability page and store its affected component.

    Runs as a thread-pool task. Fetches
    ``https://www.seebug.org/vuldb/ssvid-<i>``, reads the CVE id and the
    affected-component name, and writes the component into
    ``vule_detail.fingerprint`` for the row whose ``cve_id`` matches.

    Any failure to fetch, parse, or store is printed and the ssvid is
    appended to the module-level ``err`` list. An exception that escapes a
    pool task is otherwise lost with its discarded future, so the id would
    never be retried.

    Args:
        i: Numeric ssvid of the page to fetch.

    Returns:
        None.
    """
    # Throttle each request; the accompanying notes report that this delay
    # with four workers gives mostly accurate results.
    sleep(1.5)

    # Send the HTTP request with the captured browser headers. The timeout
    # keeps a stalled connection from pinning a worker thread forever.
    url = "https://www.seebug.org/vuldb/ssvid-" + str(i)
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print("发生了异常:", e, i)
        err.append(i)
        return

    tree = etree.HTML(response.text)
    try:
        cve = tree.xpath(
            '//div[@class="col-md-4"][3]//a/text()')[0]
    except Exception as e:
        # The CVE column is missing, which is what happens when the page
        # could not be parsed as a vulnerability detail page.
        print("发生了异常:", e, i)
        err.append(i)
        return

    # Strip before the prefix check so surrounding whitespace cannot hide a
    # valid CVE id.
    cve = cve.strip()
    if not cve.startswith('CVE'):
        print("非CVE", i)
        return

    components = tree.xpath(
        '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')
    if not components:
        print("组件为空:", i)
        return
    zujian = components[0].strip()

    # Connect only once there is something to write, and always release the
    # connection and cursor, even when the UPDATE fails.
    try:
        conn = mysql.connector.connect(
            host='127.0.0.1',
            user='root',
            password='123456',
            database='test'
        )
        try:
            cursor = conn.cursor()
            try:
                query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
                cursor.execute(query, (zujian, cve))
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()
    except mysql.connector.Error as e:
        print("发生了异常:", e, i)
        err.append(i)
        return

    # Echo what was stored.
    print(cve, zujian)


# Run the crawl on a pool of four worker threads. The ``with`` block waits
# for every task and shuts the pool down even if this loop is interrupted.
with ThreadPoolExecutor(max_workers=4) as executor:
    # Keep each future together with its ssvid so failures can be attributed.
    futures = {executor.submit(onePort, i): i for i in range(1, 100000)}

    # An exception raised inside a task is stored on its future and is lost
    # if the future is discarded, so inspect every one and record the failed
    # id for the retry pass.
    for future in concurrent.futures.as_completed(futures):
        exc = future.exception()
        if exc is not None:
            failed = futures[future]
            print("发生了异常:", exc, failed)
            err.append(failed)

# Ids that need to be crawled again.
print(err)

TODO:自动生成 cookie,以应对 cookie 的时效限制。否则既受速率限制,又受时效限制,爬取起来实在太慢了

应该会去扒这个项目:

https://github.com/seishinkouki/seebug_crawler

相关推荐
遇事不決洛必達17 小时前
【Python基础】GIL 锁是什么及其对爬虫的影响
爬虫·python·线程·进程·gil锁
綝~18 小时前
爬虫数据采集工程师岗位面试题
爬虫·面试·请求
跨境数据猎手19 小时前
大数据在电商行业的应用
大数据·运维·爬虫
tang777891 天前
异步爬虫与代理IP池结合:用aiohttp提升10倍抓取效率
爬虫·网络爬虫·爬虫代理·代理ip·代理ip池
深蓝电商API1 天前
行为模拟的艺术:如何让爬虫的鼠标轨迹像真人
爬虫
嫂子的姐夫1 天前
047-MD5:飞卢网
爬虫·python·js逆向·逆向
数据知道1 天前
从Playwright到自研:构建指纹浏览器的技术栈选型与路线图
爬虫·数据采集·指纹浏览器
嫂子的姐夫1 天前
050-wx小程序合肥住房
爬虫·python·小程序·逆向
yijianace1 天前
Python爬虫学习记录—— BooksToScrape分页爬取与图片下载
爬虫·python
小白学大数据1 天前
如何自动追踪 eBay 售价?Python 爬虫实战解析
开发语言·人工智能·爬虫·python