基于Deep Web爬虫的当当网图书信息采集

实验八 基于 Deep Web 爬虫的当当网图书信息采集
一、实验目的

1.掌握Deep Web爬虫的基本概念。
2.综合掌握Deep Web爬虫提取当当网图书信息和图书信息保存的过程。
二、实验内容

网址为：url='http://search.dangdang.com/advsearch'，爬取当当网中"清华大学出版社"的图书信息，并进行保存。
三、程序代码及分步功能解析

Python 代码如下：
# -*- coding: utf-8 -*-

import requests

from bs4 import BeautifulSoup

import traceback

import os

import urllib.parse

from pathlib import Path

import warnings



# Silence all Python warnings: the 'ignore' action is installed without a
# category filter, so it applies to every warning raised in this process.

warnings.filterwarnings('ignore')



# Request headers passed to the requests.get() calls in this script. They
# imitate a desktop Chrome browser to lower the chance of the crawler being
# rejected by anti-crawler checks.

HEADERS = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',

    'Accept-Language': 'zh-CN,zh;q=0.9',

    'Connection': 'keep-alive'

}



def read_list(txt_path):
    """Read publisher names from a UTF-8 text file, one name per line.

    Surrounding whitespace is removed from every line and blank lines are
    skipped. Errors are reported on the console (message plus traceback)
    instead of being raised, so the caller only has to check for an empty
    result.

    Args:
        txt_path: Path of the text file containing the publisher names.

    Returns:
        A list of non-empty publisher names in file order; an empty list
        when the file cannot be read or decoded.
    """
    press_list = []
    try:
        # 'utf-8-sig' reads plain UTF-8 as well, but additionally drops a
        # leading byte-order mark. With plain 'utf-8' the BOM would stay
        # attached to the first name as '\ufeff' (strip() does not remove
        # it), and that name could not be GBK-encoded into a search URL.
        with open(txt_path, 'r', encoding='utf-8-sig') as f:
            stripped_lines = (line.strip() for line in f)
            press_list = [name for name in stripped_lines if name]
        print(f"成功读取 {len(press_list)} 个出版社名称")
    except FileNotFoundError:
        print(f"错误：未找到文件 {txt_path}")
        traceback.print_exc()
    except PermissionError:
        print(f"错误：无权限读取文件 {txt_path}")
        traceback.print_exc()
    except Exception as e:
        # Best-effort boundary: e.g. a file that is not valid UTF-8.
        print(f"读取出版社列表失败：{e}")
        traceback.print_exc()
    return press_list



def build_form(press_name):
    """Build the search-result URL that lists books of one publisher.

    The advanced-search page is downloaded and scanned for the form label
    whose caption contains '出版社'; the ``name`` attribute of the input
    inside that label becomes the query parameter carrying ``press_name``.
    When no such input is found the parameter name falls back to 'press'.

    Args:
        press_name: Publisher name to search for.

    Returns:
        The search URL with GBK percent-encoded query parameters, or an
        empty string when any step fails (the error is printed).
    """
    try:
        response = requests.get('http://search.dangdang.com/advsearch', headers=HEADERS, timeout=10)
        # The page is decoded as GBK rather than the auto-detected charset.
        response.encoding = 'gbk'
        page = BeautifulSoup(response.text, 'html.parser')

        labels = page.select('.detail_condition label')
        print(f'共找到{len(labels)}项基本条件,正在寻找出版社input标签')

        field_name = ''
        for label in labels:
            caption = label.find('span')
            if not caption or '出版社' not in caption.get_text(strip=True):
                continue
            field = label.find('input')
            if not field or not field.get('name'):
                continue
            field_name = field.get('name')
            print(f'找到出版社input标签，name: {field_name}')
            break

        if not field_name:
            # Fallback parameter name used when the form could not be parsed.
            print("警告：未找到出版社对应的input标签，使用默认值 'press'")
            field_name = 'press'

        query = {
            'medium': '01',
            field_name: press_name,
            'category_path': '01.00.00.00.00.00',
            'sort_type': 'sort_pubdate_desc',
        }
        # The query values are percent-encoded from their GBK bytes.
        query_string = urllib.parse.urlencode(query, encoding='gbk')
        url = 'http://search.dangdang.com/?' + query_string
        print(f'入口地址: {url}')
        return url
    except Exception as e:
        print(f"构建URL失败：{e}")
        traceback.print_exc()
        return ''



def _clean_pub_date(raw_text):
    """Return the date text without its leading '/' separator and padding."""
    # A fixed two-character slice on already-stripped text can cut off the
    # first digit of the date, so the separator is removed explicitly.
    # NOTE(review): assumes the span text looks like ' /2020-01-01' -
    # confirm against the live page.
    return raw_text.lstrip('/ \t\xa0').strip()


def _parse_book_item(item):
    """Extract (title, price, date, comment) from one result <li> element."""
    title_elem = item.find('a', attrs={'title': True})
    title = title_elem.get('title', '未知书名') if title_elem else '未知书名'

    price_elem = item.select_one('p.price > span.search_now_price')
    price = price_elem.get_text(strip=True) if price_elem else '未知价格'

    comment_elem = item.select_one('p.search_star_line > a')
    comment = comment_elem.get_text(strip=True) if comment_elem else '0条评论'

    # The publication date is read from the second <span> of the author line.
    date = '未知日期'
    date_elems = item.select('p.search_book_author > span')
    if len(date_elems) >= 2:
        date = _clean_pub_date(date_elems[1].get_text(strip=True))

    return title, price, date, comment


def get_info(entry_url, max_pages=1):
    """Scrape book title, price, publication date and comment count.

    The entry page is fetched once to read the total page count, then each
    selected result page is fetched and parsed. Any failure is printed with
    a traceback and whatever was collected up to that point is returned.

    Args:
        entry_url: Search-result URL produced by ``build_form``. A falsy
            value skips scraping entirely.
        max_pages: Upper bound on the number of result pages to fetch.
            Defaults to 1 (the original test setting); pass ``None`` to
            fetch every page the site reports.

    Returns:
        A dict with the keys 'title', 'price', 'date' and 'comment', each
        mapping to a list of strings. The four lists always have the same
        length because every book is parsed completely before it is stored.
    """
    books = {'title': [], 'price': [], 'date': [], 'comment': []}
    if not entry_url:
        print("入口URL为空，跳过抓取")
        return books

    try:
        res = requests.get(entry_url, headers=HEADERS, timeout=10)
        # Report HTTP errors instead of parsing an error page as "no results".
        res.raise_for_status()
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'html.parser')

        # The second span under '.data' is parsed as the page count after
        # stripping '/'; fall back to one page when it is absent or not a
        # number.
        total_pages = 1
        page_elem = soup.select('.data > span')
        if len(page_elem) >= 2:
            try:
                total_pages = max(1, int(page_elem[1].get_text(strip=True).strip('/')))
            except ValueError:
                total_pages = 1

        if max_pages is None:
            page_num = total_pages
        else:
            page_num = max(0, min(total_pages, max_pages))
        print(f'共 {total_pages} 页待抓取，测试采集{page_num}页')

        for i in range(1, page_num + 1):
            now_url = f"{entry_url}&page_index={i}"
            print(f'正在获取第{i}页, URL: {now_url}')

            res = requests.get(now_url, headers=HEADERS, timeout=10)
            res.raise_for_status()
            res.encoding = 'gbk'
            soup = BeautifulSoup(res.text, 'html.parser')

            book_items = soup.select('ul.bigimg > li[ddt-pit]')
            if not book_items:
                print("未找到图书列表项，跳过当前页")
                continue

            for item in book_items:
                # Parse first, store afterwards: a failure inside one item
                # can never leave the four lists with different lengths.
                title, price, date, comment = _parse_book_item(item)
                books['title'].append(title)
                books['price'].append(price)
                books['date'].append(date)
                books['comment'].append(comment)

        print(f"成功抓取 {len(books['title'])} 本图书信息")
    except Exception as e:
        # Top-level boundary of the scraper: report and return partial data.
        print(f"抓取信息失败：{e}")
        traceback.print_exc()

    return books



def save_info(file_dir, press_name, books_dict):
    """Write the scraped records of one publisher to ``<press_name>.txt``.

    Records are numbered from 1 and separated by an empty line. Only as
    many records as the shortest of the four lists are written. The target
    directory is created when missing. All failures are printed with a
    traceback instead of being raised, so one publisher failing does not
    stop a batch run.

    Args:
        file_dir: Directory that receives the output file.
        press_name: Publisher name; used as the file name after characters
            that are not allowed in Windows file names are replaced by '_'.
        books_dict: Dict with the keys 'title', 'price', 'date' and
            'comment', each mapping to a list of strings.

    Returns:
        None.
    """
    chunks = []
    try:
        # zip() stops at the shortest list, so unequal lengths are safe.
        rows = zip(books_dict['title'], books_dict['price'],
                   books_dict['date'], books_dict['comment'])
        for number, (title, price, date, comment) in enumerate(rows, start=1):
            chunks.append(f"{number}.书名: {title}\r\n"
                          f"价格: {price}\r\n"
                          f"出版日期: {date}\r\n"
                          f"评论数量: {comment}\r\n\r\n")
    except Exception as e:
        # Best effort: keep what was formatted so far and still write it.
        print(f"拼接数据出错：{e}")
        traceback.print_exc()

    # Replace every character Windows forbids in file names, not only the
    # path separators and the colon.
    illegal_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    safe_press_name = press_name.translate(illegal_chars)
    file_path = os.path.join(file_dir, f"{safe_press_name}.txt")

    try:
        Path(file_dir).mkdir(parents=True, exist_ok=True)
        # newline='' disables newline translation; without it the explicit
        # '\r\n' in the records would be written as '\r\r\n' on Windows.
        with open(file_path, "w", encoding="utf-8", newline="") as f:
            f.write("".join(chunks))
        print(f"数据已保存至：{file_path}")
    except (OSError, UnicodeError) as e:
        print(f"保存文件失败：{e}")
        traceback.print_exc()



def start_spider(press_path, saved_file_dir):
    """Run the crawl for every publisher listed in a text file.

    For each publisher the search URL is built, the book information is
    scraped and the result is saved to its own file. Nothing is crawled
    when the publisher list comes back empty.

    Args:
        press_path: Path of the text file with one publisher name per line.
        saved_file_dir: Directory that receives one result file per
            publisher.

    Returns:
        None.
    """
    publishers = read_list(press_path)

    if publishers:
        for publisher in publishers:
            print(f"\n------ 开始抓取 {publisher} ------")
            entry_url = build_form(publisher)
            scraped = get_info(entry_url)
            save_info(saved_file_dir, publisher, scraped)
            print(f"------- 出版社: {publisher} 抓取完毕 -------")
    else:
        print("无有效出版社列表，终止爬取")



if __name__ == '__main__':
    # Imported here because only the command-line entry point needs it.
    import sys

    # Optional positional arguments override the built-in default paths:
    #   argv[1] -> text file listing the publisher names, one per line
    #   argv[2] -> directory that receives one result file per publisher
    # Without arguments the script behaves exactly as before.
    cli_args = sys.argv[1:]

    press_txt_path = cli_args[0] if len(cli_args) >= 1 else r'C:\Users\Administrator\Desktop\press.txt'
    saved_file_dir = cli_args[1] if len(cli_args) >= 2 else r'C:\Users\Administrator\Desktop\图书信息采集'

    # Start the crawl.
    start_spider(press_txt_path, saved_file_dir)
四、程序调试结果（要求截取详细步骤）

五、实验总结

实验成功爬取目标出版社图书信息，验证了 Deep Web 爬虫通过模拟表单请求访问隐藏数据的核心逻辑。过程中需注意页面编码适配、HTML 元素定位容错性，以及反爬策略规避，加深了对爬虫请求构造、数据解析和持久化的综合应用能力。