爬虫学习日记第九篇(爬取seebug)

目标:https://www.seebug.org/vuldb/vulnerabilities

需求:爬取cve_id及影响组件

单线程

cookie是有时效的(过一段时间就不行了,大概半小时左右),但是并不需要登录(直接抓包拿到的请求头)

python 复制代码
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
#直接添加这四行代码
from selenium.webdriver.chrome.options import Options

# 设置请求标头
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsl_clearance_s=1702017676.384|0|ZDiM6js7b4c8curqpLu%2FluWgrQk%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


def onePort(i):
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='test'
    )
    cursor = conn.cursor()

    # 发送带有请求标头的 HTTP 请求
    url = "https://www.seebug.org/vuldb/ssvid-"+str(i)
    response = requests.get(url, headers=headers)

    page_text = response.text

    tree = etree.HTML(page_text)

    cve = tree.xpath(
        '//div[@class="col-md-4"][3]//a/text()')[0]

    if not cve.startswith('CVE'):
        return

    try:
        zujian = tree.xpath(
            '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')[0]
    except:
        return

    cve = cve.strip(' ')
    zujian = zujian.strip(' ')
    zujian = zujian.strip('\n')
    zujian = zujian.strip(' ')

    query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
    cursor.execute(query, (zujian, cve))
    conn.commit()

    # 打印数据
    print(cve,zujian)

for i in range(1,100000):

    try:
        onePort(i)
    except Exception as e:
        # 异常发生时的处理代码
        print("发生了异常:", e)

经过反复调试,大概测出来开四个线程,每次操作sleep(1.5)结果大概是准确的。并将所有非正常没有测出来的结果加入到err列表中,之后再测一遍

多线程版本

python 复制代码
import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import concurrent.futures
import mysql.connector
from selenium.webdriver.chrome.options import Options

# 设置请求标头
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid_s=09c4b4cdc8614c76343a202fd17a6f9f; csrftoken=jzHGJ9n7g5qGMp0JDDcUFNUH4uZMIos4; __jsluid_h=2a28fe38e039c4da1c96c5210fc7efe2; __jsl_clearance=1702018884.088|0|oBrD%2FgcZqnzC%2ByluySVPTWe2ppY%3D; __jsl_clearance_s=1702021353.773|0|RGEwoMzRAd4O927zrqdiZ%2BNiY0s%3D",
    "Host": "www.seebug.org",
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}


err=[]

def onePort(i):
    sleep(1.5)
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='test'
    )
    cursor = conn.cursor()

    # 发送带有请求标头的 HTTP 请求
    url = "https://www.seebug.org/vuldb/ssvid-" + str(i)
    response = requests.get(url, headers=headers)

    page_text = response.text

    tree = etree.HTML(page_text)
    #print(page_text)
    try:
        cve = tree.xpath(
            '//div[@class="col-md-4"][3]//a/text()')[0]
    except Exception as e:
    # 异常发生时的处理代码
            print("发生了异常:", e,i)
            err.append(i)
            return

    if not cve.startswith('CVE'):
        print("非CVE",i)
        return

    try:
        zujian = tree.xpath(
            '//div[@class="col-md-4"][2]//dd[@class="hover-scroll"]/a/text()')[0]
    except Exception as e:
        print("组件为空:", i)
        return

    cve = cve.strip(' ')
    zujian = zujian.strip(' ')
    zujian = zujian.strip('\n')
    zujian = zujian.strip(' ')

    query = "UPDATE vule_detail SET fingerprint = %s WHERE cve_id = %s"
    cursor.execute(query, (zujian, cve))
    conn.commit()

    # 打印数据
    print(cve, zujian)


# 创建 ThreadPoolExecutor 对象
executor = ThreadPoolExecutor(max_workers=4)

# 提交任务给线程池执行
for i in range(1,100000):
    executor.submit(onePort, i)

# 关闭线程池
executor.shutdown()

print(err)

TODO:自动生成cookie,来对抗cookie的时效。要不然既限制速率,又限制时效,实在太伤了

应该会去扒这个项目:

https://github.com/seishinkouki/seebug_crawler

相关推荐
ONE_Gua4 小时前
chromium魔改——CDP(Chrome DevTools Protocol)检测01
前端·后端·爬虫
ONE_Gua8 小时前
chromium魔改——navigator.webdriver 检测
前端·后端·爬虫
z_mazin13 小时前
JavaScript逆向魔法:Chrome开发者工具探秘之旅
javascript·chrome·爬虫
攻城狮7号1 天前
Python爬虫第2节-网页基础和爬虫基本原理
爬虫·python爬虫
π2701 天前
爬虫:网络请求(通信)步骤,http和https协议
网络·爬虫
叫我王富贵i1 天前
0基础入门scrapy 框架,获取豆瓣top250存入mysql
爬虫·python·scrapy
小爬虫程序猿1 天前
利用 PHP 爬虫按关键字搜索淘宝商品
开发语言·爬虫·php
小爬虫程序猿1 天前
淘宝商品信息如何存储到数据库?
数据库·爬虫·php
南玖yy1 天前
Python网络爬虫:从入门到实践
爬虫·python
莓事哒1 天前
selenium和pytessarct提取古诗文网的验证码(python爬虫)
爬虫·python·selenium·测试工具·pycharm