爬虫学习案例3

爬取美女图片

优美图库地址

一页图片

安装依赖库文件
复制代码
pip install selenium requests beautifulsoup4


import time
import requests
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# 设置Chrome选项
chrome_options = Options()
chrome_options.add_argument("--headless")  # 无头模式,不打开浏览器窗口
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# 设置ChromeDriver路径
service = Service('D:envpython3chromedriver.exe')
url = 'https://www.umei.cc/touxiangtupian/nvshengtouxiang/'
baseUrl = "https://www.umei.cc"
# 初始化WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get(url)
time.sleep(random.uniform(5, 10))  # 等待页面加载
html = driver.page_source # 原页面
soup = BeautifulSoup(html, 'html.parser')
# print(soup)
# BeautifulSoup分析提取元素
divList = soup.find_all("div",class_= "item masonry_brick")
# print(divList)
# 一个美女信息
for divItem in divList:
    linkImage = divItem.find("div",class_ = "item_t").find("div",class_ = "img").find("a")["href"]
    linkImage = baseUrl + linkImage
    # 拿去子页面的大图
    driver.get(linkImage)
    time.sleep(random.uniform(5, 10))
    html = driver.page_source
    sonSoup = BeautifulSoup(html, 'html.parser')
    imgUrl = sonSoup.find("div",class_ = "tsmaincont-main-cont-txt").find("img")["src"]
    print(f"准备下载图片{imgUrl}")
    # 下载图片
    img_response = requests.get(imgUrl)
    img_name = imgUrl.split('/')[-1]
    with open("img"+img_name, "wb") as f:
        f.write(img_response.content)
    print(f"图片{img_name}下载完成")
print("第一页图片全部下载到当前目录了.....")
driver.quit()  # 关闭浏览器
爬取多页
复制代码
import time
import requests
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# 设置Chrome选项
chrome_options = Options()
chrome_options.add_argument("--headless")  # 无头模式,不打开浏览器窗口
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# 设置ChromeDriver路径
service = Service('D:envpython3chromedriver.exe')
url = 'https://www.umei.cc/touxiangtupian/nvshengtouxiang/'
baseUrl = "https://www.umei.cc"
# 初始化WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)

def getImage(url,page):
    driver.get(url)
    print(f"正在爬取第{page}页图片资源源...")
    print(url)
    time.sleep(random.uniform(5, 10))  # 等待页面加载
    html = driver.page_source # 原页面
    soup = BeautifulSoup(html, 'html.parser')
    # BeautifulSoup分析提取元素
    divList = soup.find_all("div",class_= "item masonry_brick")
    for divItem in divList:
        linkImage = divItem.find("div",class_ = "item_t").find("div",class_ = "img").find("a")["href"]
        linkImage = baseUrl + linkImage
        # 拿取子页面的大图
        driver.get(linkImage)
        time.sleep(random.uniform(5, 10))
        html = driver.page_source
        sonSoup = BeautifulSoup(html, 'html.parser')
        imgUrl = sonSoup.find("div",class_ = "tsmaincont-main-cont-txt").find("img")["src"]
        print(f"准备下载图片{imgUrl}")
        # 下载图片
        img_response = requests.get(imgUrl)
        img_name = imgUrl.split('/')[-1]
        with open("img"+img_name, "wb") as f:
            f.write(img_response.content)
        print(f"图片{img_name}下载完成")
    print(f"第{page}页图片全部下载到当前img目录了.....")

# 爬取1-10页
# 控制爬取的页面数
for page in range(1, 11):
    if page == 1:
        getImage(url,page)
    else:
        pageUrl = f"{url}index_{page}.htm"
        getImage(pageUrl,page)
driver.quit()  # 关闭浏览器
相关推荐
花酒锄作田9 小时前
使用 pkgutil 实现动态插件系统
python
前端付豪13 小时前
LangChain链 写一篇完美推文?用SequencialChain链接不同的组件
人工智能·python·langchain
曲幽13 小时前
FastAPI实战:打造本地文生图接口,ollama+diffusers让AI绘画更听话
python·fastapi·web·cors·diffusers·lcm·ollama·dreamshaper8·txt2img
老赵全栈实战14 小时前
Pydantic配置管理最佳实践(一)
python
阿尔的代码屋19 小时前
[大模型实战 07] 基于 LlamaIndex ReAct 框架手搓全自动博客监控 Agent
人工智能·python
AI探索者2 天前
LangGraph StateGraph 实战:状态机聊天机器人构建指南
python
AI探索者2 天前
LangGraph 入门:构建带记忆功能的天气查询 Agent
python
FishCoderh2 天前
Python自动化办公实战:批量重命名文件,告别手动操作
python
躺平大鹅2 天前
Python函数入门详解(定义+调用+参数)
python
曲幽2 天前
我用FastAPI接ollama大模型,差点被asyncio整崩溃(附对话窗口实战)
python·fastapi·web·async·httpx·asyncio·ollama