python爬取当日水果价格并制作网页(flask)

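首先创建两个 Python 文件(例如 1.py、2.py)以及模板文件 templates/fruits.html。我的程序命名为 3.py、4.py,Python 文件名无所谓,可以自己命名;但模板文件必须放在 templates 目录下并命名为 fruits.html,因为后面的 Flask 程序是按这个名字加载模板的。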

爬取的网站是

复制代码
https://www.guo68.com

网站页面如下,读者可以自行了解

下面是爬取代码(完整版)

复制代码
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import threading
from tqdm import tqdm
import pandas as pd
import time
from urllib.parse import quote
# Fruit price scraper.
# Search-results URL template: the keyword fills the placeholder and the
# page number is appended after "page=".
base_url_template = "https://www.guo68.com/sell?kw={}&page="

# Fruits that can be scraped. Keys are the menu numbers the user types;
# each value is a (pinyin name, Chinese name) pair.
FRUIT_MAPPING = {
    str(number): names
    for number, names in enumerate(
        (
            ('xigua', '西瓜'),
            ('pingguo', '苹果'),
            ('chengzi', '橙子'),
            ('xiangjiao', '香蕉'),
            ('putao', '葡萄'),
        ),
        start=1,
    )
}

# Request headers sent with every page request.
headers = dict(
    [
        (
            "User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
        ),
        ("Referer", "https://www.guo68.com/"),
    ]
)

# Rows collected by the worker threads, and the lock that guards the list.
data_lock = threading.Lock()
data = []


def get_fruit_choice():
    """Prompt until the user selects at least one supported fruit.

    The menu is printed from FRUIT_MAPPING. The answer is a list of menu
    numbers separated by commas; both the ASCII comma and the full-width
    comma are accepted, and whitespace around each number is ignored.
    Entries that are not menu numbers are skipped, and a number typed more
    than once is only counted once so the same fruit is not scraped twice.

    Returns:
        A non-empty list of (pinyin name, Chinese name) tuples taken from
        FRUIT_MAPPING, in the order the user typed them.
    """
    print("请选择要爬取的水果(可多选,用逗号分隔):")
    for num, (_, chs) in FRUIT_MAPPING.items():
        print(f"{num}. {chs}")

    while True:
        raw = input("请输入编号(例如 1,3): ")
        # Treat the full-width comma (U+FF0C) like an ASCII comma and trim
        # blanks, so answers such as "1, 3" are understood.
        tokens = [token.strip() for token in raw.replace('\uff0c', ',').split(',')]
        # dict.fromkeys drops repeats while keeping the order typed.
        valid_choices = [c for c in dict.fromkeys(tokens) if c in FRUIT_MAPPING]
        if valid_choices:
            return [FRUIT_MAPPING[c] for c in valid_choices]
        print("输入无效,请重新选择!")


def fetch_page(fruit_info, page, retries=3):
    """Download one search-results page for a fruit and parse it.

    Network and HTTP errors are retried; each failure is reported together
    with its cause. A failure while parsing the downloaded page is reported
    once and not retried, because downloading the same page again cannot
    fix a parser problem (for example a missing lxml installation).

    Args:
        fruit_info: (pinyin name, Chinese name) tuple; the Chinese name is
            used as the search keyword.
        page: Page number appended to the search URL.
        retries: Maximum number of download attempts.

    Returns:
        The list of records produced by parse_html, or an empty list when
        every download attempt failed or the page could not be parsed.
    """
    _, chs_name = fruit_info
    encoded_name = quote(chs_name)  # percent-encode the Chinese keyword
    url = base_url_template.format(encoded_name) + str(page)

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=20)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"请求失败: {chs_name}第{page}页 ({exc}),尝试重试 ({attempt}/{retries})...")
            # Waiting is only useful when another attempt follows.
            if attempt < retries:
                time.sleep(2)
            continue

        try:
            return parse_html(response.text, chs_name)
        except Exception as exc:
            # Best effort: one unparsable page must not abort the whole run,
            # but the real cause is shown instead of a generic message.
            print(f"解析失败: {chs_name}第{page}页 ({type(exc).__name__}: {exc})")
            return []
    return []


def parse_html(html, fruit_type):
    """Extract one record per complete listing from a results page.

    Args:
        html: Page markup as text.
        fruit_type: Value stored under the '种类' key of every record.

    Returns:
        A list of dicts with the keys '种类', '价格', '名称', '描述', '地址'
        and '认证'. A listing in which any of the looked-up elements is
        missing is left out entirely.
    """
    # (output key, tag name, CSS class), listed in output key order
    selectors = (
        ('价格', 'span', 'price'),
        ('名称', 'span', 'name'),
        ('描述', 'p', 'describe'),
        ('地址', 'p', 'address'),
        ('认证', 'span', 'simin'),
    )

    records = []
    for entry in BeautifulSoup(html, 'lxml').find_all('li', class_='fruit'):
        record = {'种类': fruit_type}
        try:
            for key, tag, css_class in selectors:
                record[key] = entry.find(tag, class_=css_class).get_text(strip=True)
        except AttributeError:
            # find() returned None for a missing element: skip this listing
            continue
        records.append(record)
    return records


def worker(params):
    """Thread-pool task: scrape one page and add its rows to the shared list.

    Args:
        params: (fruit_info, page) pair that is passed on to fetch_page.
    """
    fruit_info, page = params
    rows = fetch_page(fruit_info, page)
    # `data` is a module-level list, so the lock is held while it is extended.
    data_lock.acquire()
    try:
        data.extend(rows)
    finally:
        data_lock.release()


def _ask_page_count():
    """Prompt until the user enters a positive whole number of pages."""
    while True:
        raw = input("请输入每类水果要爬取的页数:")
        try:
            pages = int(raw)
        except ValueError:
            pages = 0  # non-numeric input is handled like any other invalid value
        if pages > 0:
            return pages
        print("输入无效,请输入正整数!")


def main():
    """Run the scraper interactively and save what was collected.

    Asks which fruits and how many pages per fruit to scrape, downloads the
    pages on a pool of 8 threads with a progress bar, then writes the
    collected rows to a CSV file and an Excel file whose names carry a
    minute-resolution timestamp, and prints a per-fruit row count.
    """
    # User input
    selected_fruits = get_fruit_choice()
    pages = _ask_page_count()

    # One task per (fruit, page) combination; pages are numbered from 1
    task_params = [
        (fruit, page)
        for fruit in selected_fruits
        for page in range(1, pages + 1)
    ]

    # Scrape concurrently; list() drains the iterator so the bar advances
    # and the call only returns once every task has finished
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        list(tqdm(executor.map(worker, task_params),
                  total=len(task_params),
                  desc="爬取进度",
                  unit="页"))

    if not data:
        print("没有爬取到有效数据")
        return

    # Save the results
    df = pd.DataFrame(data)
    timestamp = time.strftime("%Y%m%d%H%M", time.localtime())

    # CSV is written with a UTF-8 byte-order mark (utf_8_sig)
    csv_file = f'水果数据_{timestamp}.csv'
    df.to_csv(csv_file, index=False, encoding='utf_8_sig')

    # Excel copy of the same rows
    excel_file = f'水果数据_{timestamp}.xlsx'
    df.to_excel(excel_file, index=False)

    print(f"数据已保存: {csv_file} 和 {excel_file}")
    print(f"共爬取 {len(df)} 条记录,种类分布:")
    print(df['种类'].value_counts())


if __name__ == "__main__":
    main()

需要安装库:pip install requests beautifulsoup4 lxml pandas tqdm openpyxl flask(爬虫里 BeautifulSoup 使用的是 lxml 解析器,所以必须安装 lxml;后面的网页部分需要 flask)

运行之后会生成两个文件:水果数据_<时间戳>.csv 和 水果数据_<时间戳>.xlsx(时间戳格式为"年月日时分",例如 水果数据_202504201530.csv)

读者如果想换成自己的存储路径,可以修改代码中的文件名(不会改的话可以问 AI);程序默认把文件保存在运行程序时的当前目录下(一般就是程序所在的文件夹)

下面是 Flask 后端主程序,用来构建网页。注意:代码里的 DATA_DIR 是我电脑上的路径,读者需要改成自己存放爬虫生成的 CSV 文件的目录

复制代码
from flask import Flask, render_template, request
import pandas as pd
import os
from datetime import datetime

app = Flask(__name__)

# Directory that is searched for the scraper's CSV output. It can be set
# without editing this file through the FRUIT_DATA_DIR environment variable;
# when the variable is not set, the original hard-coded Windows path is used
# (raw string so the backslashes are kept literally).
DATA_DIR = os.environ.get(
    "FRUIT_DATA_DIR",
    r"C:\Users\林\Desktop\FLASK与mysql\新建文件夹 (8)",
)
# File-name prefix of the CSV files to look for.
CSV_PREFIX = "水果数据_"


def get_latest_data(data_dir=None, prefix=None):
    """Locate the most recently modified CSV file in the data directory.

    Args:
        data_dir: Directory to search. Defaults to the module-level DATA_DIR.
        prefix: Required file-name prefix. Defaults to the module-level
            CSV_PREFIX.

    Returns:
        (file_path, modify_time) for the newest file whose name starts with
        the prefix and ends with '.csv', where modify_time is a datetime
        built from the file's modification timestamp. Returns (None, None)
        when the directory cannot be listed or holds no matching file.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    if prefix is None:
        prefix = CSV_PREFIX

    try:
        names = os.listdir(data_dir)
    except OSError:
        # A missing or unreadable directory is reported the same way as
        # "no data yet" so the caller can show its normal message.
        return None, None

    newest_path = None
    newest_mtime = None
    for name in names:
        if not (name.startswith(prefix) and name.endswith('.csv')):
            continue
        path = os.path.join(data_dir, name)
        try:
            mtime = os.path.getmtime(path)
        except OSError:
            # The file disappeared between listing and stat: ignore it.
            continue
        # Strict comparison keeps the first file seen when timestamps tie.
        if newest_mtime is None or mtime > newest_mtime:
            newest_path, newest_mtime = path, mtime

    if newest_path is None:
        return None, None
    return newest_path, datetime.fromtimestamp(newest_mtime)


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the dashboard, optionally filtered by a submitted search term.

    The newest CSV file is loaded on every request. When the posted form
    contains a non-empty 'search' field, only rows in which at least one
    column contains that text (case-insensitive, matched literally) are
    shown.

    Returns:
        The rendered 'fruits.html' page, or a plain-text message when no
        data file exists or the file cannot be read.
    """
    # Locate the newest data file
    data_path, modify_time = get_latest_data()

    if not data_path:
        return "未找到数据文件,请先运行爬虫程序"

    # Load the data. keep_default_na=False keeps empty cells as empty strings
    # instead of NaN, so they do not show up (or match a search) as "nan".
    try:
        df = pd.read_csv(data_path, encoding='utf-8-sig', keep_default_na=False)
    except Exception as e:
        return f"读取数据失败: {str(e)}"

    # Apply the search filter
    search_term = request.form.get('search', '').strip().lower()
    if search_term:
        mask = pd.Series(False, index=df.index)
        for column in df.columns:
            # regex=False: the user's text is matched literally, so input
            # such as "(" or "+" cannot raise a regex error.
            mask |= (
                df[column].astype(str).str.lower()
                .str.contains(search_term, regex=False)
            )
        results = df[mask]
    else:
        results = df

    return render_template('fruits.html',
                           data=results.to_dict(orient='records'),
                           search_term=search_term,
                           update_time=modify_time.strftime('%Y-%m-%d %H:%M:%S'),
                           total=len(df),
                           showing=len(results))


if __name__ == '__main__':
    app.run(debug=True, port=5000)

fruits.html代码

复制代码
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>多品种水果数据看板</title>
    <link href="https://cdn.bootcdn.net/ajax/libs/twitter-bootstrap/5.1.3/css/bootstrap.min.css" rel="stylesheet">
    <link rel="stylesheet" href="https://cdn.datatables.net/1.13.6/css/dataTables.bootstrap5.min.css">
    <style>
        .dashboard-header {
            background: linear-gradient(45deg, #4CAF50, #8BC34A);
            color: white;
            padding: 2rem;
            margin-bottom: 2rem;
            border-radius: 8px;
        }
        .stats-card {
            background: white;
            border-radius: 8px;
            padding: 1rem;
            margin-bottom: 1rem;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .highlight {
            color: #4CAF50;
            font-weight: bold;
        }
    </style>
</head>
<body>
    <div class="container-fluid">
        <!-- Dashboard header: page title and the update_time value -->
        <div class="dashboard-header">
            <h1>多品种水果市场实时数据</h1>
            <div class="update-time">最后更新: {{ update_time }}</div>
        </div>

        <!-- Statistics cards: the total and showing counts -->
        <div class="row">
            <div class="col-md-4">
                <div class="stats-card">
                    总数据量 <span class="highlight">{{ total }}</span> 条
                </div>
            </div>
            <div class="col-md-4">
                <div class="stats-card">
                    当前显示 <span class="highlight">{{ showing }}</span> 条
                </div>
            </div>
        </div>

        <!-- Search box: posts the "search" field back to this page -->
        <div class="card mb-4">
            <div class="card-body">
                <form method="post">
                    <div class="input-group">
                        <input type="text" class="form-control" name="search"
                               placeholder="输入水果名称、种类、价格范围或地址搜索..."
                               value="{{ search_term }}">
                        <button type="submit" class="btn btn-success">智能搜索</button>
                    </div>
                    <small class="form-text text-muted mt-2">
                        示例搜索:苹果 | 5元 | 北京 | 已认证
                    </small>
                </form>
            </div>
        </div>

        <!-- Data table: one row per entry in data -->
        <div class="card">
            <div class="card-body">
                <table class="table table-hover" id="dataTable">
                    <thead>
                        <tr>
                            <th>种类</th>
                            <th>名称</th>
                            <th>价格</th>
                            <th>地址</th>
                            <th>认证状态</th>
                            <th>详细描述</th>
                        </tr>
                    </thead>
                    <tbody>
                        {% for item in data %}
                        <tr>
                            <td>{{ item.种类 }}</td>
                            <td>{{ item.名称 }}</td>
                            <td class="text-success">{{ item.价格 }}</td>
                            <td>{{ item.地址 }}</td>
                            <td>
                                <span class="badge rounded-pill bg-{% if item.认证 == '已认证' %}success{% else %}warning text-dark{% endif %}">
                                    {{ item.认证 }}
                                </span>
                            </td>
                            <td>{{ item.描述 }}</td>
                        </tr>
                        {% endfor %}
                    </tbody>
                </table>
            </div>
        </div>
    </div>

    <!-- Scripts: jQuery and DataTables, then the table initialisation -->
    <script src="https://code.jquery.com/jquery-3.7.0.min.js"></script>
    <script src="https://cdn.datatables.net/1.13.6/js/jquery.dataTables.min.js"></script>
    <script src="https://cdn.datatables.net/1.13.6/js/dataTables.bootstrap5.min.js"></script>
    <script>
    $(document).ready(function() {
        $('#dataTable').DataTable({
            "language": {
                "url": "//cdn.datatables.net/plug-ins/1.13.6/i18n/zh-CN.json"
            },
            "dom": '<"row"<"col-sm-12 col-md-6"l><"col-sm-12 col-md-6"f>>rt<"row"<"col-sm-12 col-md-5"i><"col-sm-12 col-md-7"p>>',
            "pageLength": 25,
            "order": [[2, 'asc']]
        });
    });
    </script>
</body>
</html>
相关推荐
why1517 分钟前
字节头条golang二面
开发语言·后端·golang
南玖yy7 分钟前
C 语言的未来:在变革中坚守与前行
c语言·开发语言
浪费笔墨10 分钟前
一种简洁的python指令处理脚本
开发语言·python
Bug-Free生活11 分钟前
Go语言入门到入土——三、处理并返回异常
开发语言·后端·golang
背太阳的牧羊人19 分钟前
python中MongoDB 的两个驱动
开发语言·python·mongodb
珹洺19 分钟前
Jsp技术入门指南【九】详细讲解JSTL
java·linux·开发语言·前端·jsp
独行soc23 分钟前
2025年渗透测试面试题总结-拷打题库06(题目+回答)
java·开发语言·前端·中间件·数据挖掘·php·xss
向哆哆25 分钟前
Java 并发性能优化:线程池的最佳实践
java·开发语言·性能优化
末央&28 分钟前
【C++】深入浅出之多态
开发语言·c++
AI量化投资实验室29 分钟前
年化26.9%的稳健策略|polars重构因子计算引擎(python策略下载)
开发语言·python·重构