Python基础入门：8.1 项目1：爬虫与数据分析

Python爬虫与数据分析全流程实战：从数据采集到可视化呈现

python 复制代码

# 综合案例：电商价格监控分析系统
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

# Request configuration: default HTTP headers (desktop Chrome User-Agent).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def scrape_products(url):
    """Scrape product records from a single listing page.

    Downloads ``url`` with the module-level ``HEADERS`` and a 10-second
    timeout, then reads every ``div.product-item`` element on the page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the keys ``name`` (str), ``price`` (str, the raw
        text of the price element), ``rating`` (float) and ``reviews`` (int).
        The list is empty when the request fails. Items whose markup is
        incomplete or whose numbers cannot be parsed are skipped.
    """
    products = []
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"爬取失败: {str(e)}")
        return products

    soup = BeautifulSoup(response.content, 'html.parser')

    for item in soup.select('div.product-item'):
        # Handle errors per item: one broken product card must not discard
        # the remaining products on the page.
        try:
            name = item.select_one('h2.product-title').text.strip()
            price = item.select_one('span.price').text.strip()
            rating = item.select_one('div.rating').attrs['data-score']
            reviews = item.select_one('a.reviews-count').text.split()[0]

            products.append({
                'name': name,
                'price': price,
                'rating': float(rating),
                'reviews': int(reviews.replace(',', ''))
            })
        except (AttributeError, KeyError, IndexError, ValueError) as e:
            # AttributeError: select_one() found nothing and returned None.
            # KeyError: the data-score attribute is absent.
            # IndexError: the reviews text is empty.
            # ValueError: rating/reviews are not numeric.
            print(f"解析商品失败，已跳过: {str(e)}")

    return products

def clean_data(df):
    """Return a cleaned copy of the scraped product table.

    Steps: convert the ``price`` text column to float, keep only rows with
    ``0 < price < 10000``, and derive a ``category`` column from the word
    that precedes " Pro" in ``name`` ('Other' when there is no match).

    Args:
        df: DataFrame with string columns ``name`` and ``price``.

    Returns:
        A new DataFrame; the caller's frame is left untouched. A frame with
        no rows and no columns is returned as an empty copy.
    """
    # Nothing was scraped: there is no 'price' column to work on.
    if df.empty and df.columns.empty:
        return df.copy()

    # Work on a copy so the caller keeps the raw price strings.
    df = df.copy()

    # regex=False is required: as a regular expression '$' is the
    # end-of-string anchor, so older pandas versions (regex default True)
    # would leave the dollar sign in place and the float cast would fail.
    # Thousands separators are removed as well ("$1,299.00").
    df['price'] = (
        df['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

    # Drop implausible prices; .copy() makes the result independent so the
    # column assignment below does not trigger SettingWithCopyWarning.
    df = df[(df['price'] > 0) & (df['price'] < 10000)].copy()

    # expand=False yields a Series (single capture group) rather than a
    # one-column DataFrame.
    df['category'] = (
        df['name'].str.extract(r'([A-Za-z]+) Pro', expand=False).fillna('Other')
    )

    return df

def visualize_data(df):
    """Draw a 2x2 overview figure, save it as a PNG and display it.

    Panels: price histogram, rating-vs-price scatter, bar chart of row
    counts per category, and a line chart of the mean price per rating.
    The figure is written to ``product_analysis.png`` at 300 dpi.

    Args:
        df: DataFrame with ``price``, ``rating`` and ``category`` columns.
    """
    plt.figure(figsize=(15, 8))

    # Panel 1: histogram of prices
    plt.subplot(2, 2, 1)
    prices = df['price']
    prices.plot.hist(bins=20, color='skyblue')
    plt.title('价格分布')
    plt.xlabel('价格 ($)')

    # Panel 2: rating against price
    plt.subplot(2, 2, 2)
    ratings = df['rating']
    plt.scatter(ratings, df['price'], alpha=0.6)
    plt.title('评分 vs 价格')
    plt.xlabel('评分')
    plt.ylabel('价格 ($)')

    # Panel 3: number of products in each category
    plt.subplot(2, 2, 3)
    category_counts = df['category'].value_counts()
    category_counts.plot.bar(color='salmon')
    plt.title('商品类别分布')
    plt.xticks(rotation=45)

    # Panel 4: mean price for every distinct rating value
    plt.subplot(2, 2, 4)
    by_rating = df.sort_values('rating').groupby('rating')
    mean_price = by_rating['price'].mean()
    mean_price.plot.line(marker='o', color='green')
    plt.title('不同评分的平均价格')
    plt.xlabel('评分')
    plt.ylabel('平均价格 ($)')

    plt.tight_layout()
    plt.savefig('product_analysis.png', dpi=300)
    plt.show()

# Script entry point: scrape, clean, summarize, export and plot.
if __name__ == "__main__":
    # Example store URL (replace with the real target site)
    base_url = "https://example-store.com/products?page="

    all_products = []
    for page in range(1, 6):  # pages 1 through 5
        url = f"{base_url}{page}"
        print(f"正在爬取: {url}")
        all_products.extend(scrape_products(url))

    # scrape_products() returns [] when a page fails. With nothing scraped,
    # the DataFrame would have no 'price' column and cleaning would raise
    # KeyError, so stop with a clear message instead.
    if not all_products:
        raise SystemExit("未获取到任何商品数据，请检查目标网址和选择器")

    df = pd.DataFrame(all_products)
    df = clean_data(df)

    # Every row may have been filtered out as an implausible price.
    if df.empty:
        raise SystemExit("清洗后没有剩余的有效数据")

    print("\n数据概览:")
    print(df.describe())

    print("\n保存数据到products.csv")
    df.to_csv('products.csv', index=False)

    visualize_data(df)

一、高效爬虫开发技巧

网页解析优化策略

python 复制代码

# 使用CSS选择器最佳实践
def optimized_parser(html):
    """Parse product records out of an HTML document.

    Uses the ``lxml`` parser and selects every ``div`` that carries a
    ``data-product-id`` attribute.

    Args:
        html: Markup of the page to parse.

    Returns:
        A list of dicts with the keys ``name`` (text of the element with
        class ``title``), ``price`` (``content`` attribute of the
        ``<meta itemprop="price">`` tag) and ``rating`` (``title`` attribute
        of the ``.stars`` element, or ``None`` when unavailable). Products
        without a name or price are skipped.
    """
    soup = BeautifulSoup(html, 'lxml')  # lxml is the faster parser backend

    parsed = []
    # Attribute selector: match product containers by their data attribute.
    for product in soup.select('div[data-product-id]'):
        title_node = product.find(class_='title')
        price_node = product.find('meta', {'itemprop': 'price'})

        # Name and price are required; find() returns None when the node is
        # missing, which would otherwise crash on attribute access.
        if title_node is None or price_node is None:
            continue
        price = price_node.get('content')
        if price is None:
            continue

        # Rating is optional.
        stars = product.select_one('.stars')
        rating = stars.get('title') if stars is not None else None

        parsed.append({
            'name': title_node.get_text(strip=True),
            'price': price,
            'rating': rating,
        })

    return parsed

反爬虫应对方案

python 复制代码

# Advanced request configuration: one shared Session with per-scheme proxies
session = requests.Session()
session.proxies = dict(
    http='http://10.10.1.10:3128',
    https='http://10.10.1.10:1080',
)

# 随机延迟
from random import uniform
from time import sleep

def safe_request(url, timeout=10, min_delay=1, max_delay=3):
    """Fetch ``url`` through the shared session after a random pause.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawler forever.
        min_delay: Lower bound of the random pause, in seconds.
        max_delay: Upper bound of the random pause, in seconds.

    Returns:
        The response object returned by ``session.get``.
    """
    # Random delay between requests (1-3 seconds by default)
    sleep(uniform(min_delay, max_delay))
    return session.get(url, timeout=timeout)

# 使用代理中间件示例
class ProxyMiddleware:
    """Example middleware that attaches a proxy address to each request.

    NOTE(review): the ``process_request(request, spider)`` signature looks
    like a Scrapy downloader middleware hook — confirm against the project.
    """

    def process_request(self, request, spider):
        """Store the proxy URL under the 'proxy' key of ``request.meta``."""
        # Placeholder credentials/host: replace before real use.
        proxy_url = "http://user:pass@proxy_ip:port"
        request.meta['proxy'] = proxy_url

二、数据清洗实战技巧

常见数据问题处理

python 复制代码

def advanced_cleaning(df):
    """Return a cleaned copy of the product table.

    Steps, in order: fill missing ratings with the median rating, drop
    duplicate names (keeping the last occurrence), keep prices inside the
    1st-99th percentile range, parse ``release_date`` as ``%Y-%m`` when
    that column exists, and strip punctuation from ``name``.

    Args:
        df: DataFrame with ``name``, ``price`` and ``rating`` columns and an
            optional ``release_date`` column.

    Returns:
        A new DataFrame; the caller's frame is not modified.
    """
    # Work on a copy so the caller's data is not changed in place.
    df = df.copy()

    # Missing values
    df['rating'] = df['rating'].fillna(df['rating'].median())

    # Duplicates
    df = df.drop_duplicates(subset=['name'], keep='last')

    # Outliers. The bounds are inclusive: with a strict comparison a single
    # row, or rows that all share one price, equal both quantiles and would
    # all be discarded.
    q_low = df['price'].quantile(0.01)
    q_high = df['price'].quantile(0.99)
    df = df[df['price'].between(q_low, q_high)].copy()

    # Dates: unparseable values become NaT. The column is optional because
    # the scraped records do not necessarily contain it.
    if 'release_date' in df.columns:
        df['release_date'] = pd.to_datetime(
            df['release_date'], errors='coerce', format='%Y-%m'
        )

    # Text: remove everything that is neither a word character nor whitespace
    df['name'] = df['name'].str.replace(r'[^\w\s]', '', regex=True)

    return df

数据转换技巧

python 复制代码

# Bucket prices into labelled ranges. The last edge is open-ended so that
# prices above 1000 land in '500+' instead of becoming NaN.
bins = [0, 50, 100, 200, 500, float('inf')]
labels = ['<50', '50-100', '100-200', '200-500', '500+']
df['price_range'] = pd.cut(df['price'], bins=bins, labels=labels)

# Price index: each price relative to the mean price of its category
category_mean_price = df.groupby('category')['price'].transform('mean')
df['price_index'] = (df['price'] / category_mean_price).round(2)

# Time series: total price per calendar month
# NOTE(review): assumes df has a datetime 'date' column — confirm where it
# is created. Newer pandas releases prefer the 'ME' alias over 'M'; verify
# against the installed version.
monthly_sales = df.resample('M', on='date')['price'].sum()

三、可视化进阶技巧

交互式可视化（使用Plotly）

python 复制代码

import plotly.express as px
import plotly.graph_objects as go

# Interactive scatter plot of rating against price, colored by category
fig = px.scatter(df, x='rating', y='price', color='category',
                 hover_data=['name'], title='商品分布分析')
fig.show()

# Sankey diagram: flow from product category to price range.
# plotly.express has no sankey() function; the trace lives in
# plotly.graph_objects and expects integer node indices for source/target.
category_flow = (
    df.groupby(['category', 'price_range'], observed=True)
    .size()
    .reset_index(name='count')
)

# Nodes: categories first, then the price-range labels. Separate lookup
# tables keep a category named like a price label from colliding with it.
categories = list(df['category'].unique())
category_index = {name: i for i, name in enumerate(categories)}
range_index = {label: len(categories) + i for i, label in enumerate(labels)}

fig = go.Figure(go.Sankey(
    node=dict(label=categories + list(labels)),
    link=dict(
        source=category_flow['category'].map(category_index).tolist(),
        # price_range is categorical; cast to str before the dict lookup
        target=category_flow['price_range'].astype(str).map(range_index).tolist(),
        value=category_flow['count'].tolist(),
    ),
))
fig.show()

自动化报告生成

python 复制代码

from pandas_profiling import ProfileReport

# Build the profiling report and write it to a standalone HTML file.
# NOTE(review): pandas_profiling appears to have been renamed
# ydata-profiling upstream — confirm which package is installed.
profile = ProfileReport(
    df,
    title="商品数据分析报告",
)
profile.to_file("product_report.html")

# Jupyter Notebook integration: render the same report inline
from IPython.display import HTML
report_html = profile.to_html()
HTML(report_html)

性能优化指南：

使用lxml解析器替代默认的html.parser
批量处理数据时使用pandas向量化操作
避免在循环中多次访问DataFrame
使用Dask处理超大规模数据
缓存已爬取的页面内容
使用异步请求（aiohttp）提升爬虫效率
对取值重复较多的字符串列（如商品类别）使用category类型节省内存；数值列可向下转换为更小的数值类型（如float32、int32）
使用内存映射文件处理超大数据集

流程图：开始 → 发送HTTP请求 → 成功？ —是→ 解析HTML内容 → 提取数据 → 存储原始数据 → 数据清洗 → 数据分析 → 可视化呈现 → 生成报告 → 结束；—否→ 错误处理

项目扩展方向：

增加自动化邮件报警功能
集成数据库存储（MySQL/MongoDB）
开发Web仪表盘（Flask/Django）
添加机器学习价格预测模块
实现分布式爬虫架构
构建RESTful API数据接口
开发浏览器扩展程序
制作自动化日报系统

避坑指南：

遵守robots.txt协议
设置合理的请求间隔（>2秒）
处理SSL证书验证问题
注意网站内容的版权限制
使用try-except处理网络异常
定期检查选择器有效性
监控数据质量异常
做好数据备份机制