Python Data Analysis in Practice: Scraping Douban's Top 250 Movies and Writing the Results to Excel (Source Code and Results Included)

Functionality

On Windows 10, using the Python 3.10 interpreter, the scripts below scrape information about the movies on Douban's Top 250 list and write it to an Excel spreadsheet.
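
The implementation is split into three modules: a scraper (scraper.py), an Excel writer (writer.py), and an entry point (main.py). They rely on the third-party packages requests, beautifulsoup4, pandas, and openpyxl, which can be installed with: pip install requests beautifulsoup4 pandas openpyxl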

Implementation

Scraping module: scraper.py

import requests
from bs4 import BeautifulSoup
from typing import List
import re

class Movie:
    def __init__(self, detail_link: str, image_link: str, chinese_name: str,
                 foreign_name: str, rating: float, review_count: int,
                 overview: str, director: str, actors: str, year: int,
                 region: str, category: str):
        self.detail_link = detail_link
        self.image_link = image_link
        self.chinese_name = chinese_name
        self.foreign_name = foreign_name
        self.rating = rating
        self.review_count = review_count
        self.overview = overview
        self.director = director
        self.actors = actors
        self.year = year
        self.region = region
        self.category = category

class Scraper:
    def __init__(self, base_url: str):
        self.base_url = base_url
        self.movies = []

    def scrape(self) -> List[Movie]:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Cookie": "bid=m9sDMeuTWp4; ap_v=0,6.0; _pk_id.100001.4cf6=d6615bd2530852c6.1700447648.; _pk_ses.100001.4cf6=1; __utma=30149280.633232779.1700447649.1700447649.1700447649.1; __utmb=30149280.0.10.1700447649; __utmc=30149280; __utmz=30149280.1700447649.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1435231277.1700447649.1700447649.1700447649.1; __utmb=223695111.0.10.1700447649; __utmc=223695111; __utmz=223695111.1700447649.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _cc_id=748927837a892b664c1f1ab42fbe510a; panoramaId_expiry=1700534054317; panoramaId=18a92c0e9b136f927d0f0871ae33a9fb927a9d987bb8aa39557c58077684bc2c; panoramaIdType=panoDevice; _pbjs_userid_consent_data=3524755945110770; __gads=ID=7617c807b66fd695:T=1700447653:RT=1700448285:S=ALNI_MY0jxMNVX0GooLXe8dtdh74vfdLvQ; __gpi=UID=00000cdbaaf33934:T=1700447653:RT=1700448285:S=ALNI_MYekZkuVr46VHfZjhuhdX2kpLxOkw; cto_bundle=xIP-n181MjZFSVBGdlMlMkJEY3hvY3dycER1QjhISjdGU2dzOWxWZUFSMmNZd25VQ1Y0REdtaXZPdTh2aEJGUCUyQlo3WjVETzVNc2VUSFR3dHFXQVRRZU1ZejdOMXk5RDM4VjV1WkJsRWVXd1dQdjRvRE1JQjhEVkJQUVEyV0M1dlgzVkFBclZDTnJWM1g3MWZERDltRFR1UDZZNXp3JTNEJTNE; cto_bidid=vr7nBV8lMkZGJTJCOGVQWjhWREJUelpJYm1UdFBWaWd5bk9WT1JCdyUyRjlpN1duSWFZd3JPR2dkdmh1Q2tNa3NJa25rQTExSFlPM1p2YzdpT1U2cDE5UUowU3p1VHk3YkhVWWw4aFBmUExiZmtZdWtPS3U4byUzRA; cto_dna_bundle=14GGU181MjZFSVBGdlMlMkJEY3hvY3dycER1QiUyQmxhTVFwSEdNWHZ6OE5MZ2olMkJQbjlyODR2SWtIJTJCUGZmYm40Z3p5b1AxbSUyRkJKVDBVUVlXbGE1ZWRQeVUlMkJmeTR5dyUzRCUzRA",
        }

        for i in range(0, 10):  # half-open range: 10 pages in total
            url = self.base_url + str(i * 25)  # each page lists 25 movies, so the start offset is i * 25
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            movie_elements = soup.find_all('div', class_='item')

            for movie_element in movie_elements:
                detail_link = movie_element.find('a')['href']
                image_link = movie_element.find('img')['src']
                title_element = movie_element.find('div', class_='hd')
                chinese_name = title_element.find('span', class_='title').text
                foreign_name = title_element.find('span', class_='other').text.strip()[2:]  # drop the leading "/ "
                rating = float(movie_element.find('span', class_='rating_num').text)

                # The count is rendered as e.g. "<span>2868332人评价</span>"; capture the digits and cast to int
                review_count = int(re.search(r'<span>(\d+)人评价</span>', str(movie_element)).group(1))

                overview = movie_element.find('span', class_='inq').text if movie_element.find('span', class_='inq') else ''
                info_text = movie_element.find('div', class_='bd').find('p').text
                # Keep only the first whitespace-separated token (the Chinese name) of each credit
                director = info_text.split('导演: ')[1].split(' ')[0] if '导演: ' in info_text else ''
                actors = info_text.split('主演: ')[1].split(' ')[0] if '主演: ' in info_text else ''
                # The second info line looks like "1994 / 美国 / 犯罪 剧情"
                year_region_category = info_text.split('\n')[-2].strip().split('/')
                try:
                    year = int(year_region_category[0].strip())
                except ValueError as e:
                    # A few entries carry extra text in the year field, e.g. "1961(中国大陆)"
                    print(e)
                    year = None
                region = year_region_category[-2].strip()
                category = year_region_category[-1].strip()

                movie = Movie(detail_link, image_link, chinese_name, foreign_name, rating, review_count, overview, director, actors, year, region, category)
                self.movies.append(movie)

        return self.movies
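
To sanity-check the scraper on its own, a minimal sketch like the following can be run (it reuses the pagination URL that main.py passes in below):

from scraper import Scraper

scraper = Scraper("https://movie.douban.com/top250?start=")
movies = scraper.scrape()  # fetches all 10 pages (250 movies)
print(movies[0].chinese_name, movies[0].rating, movies[0].year)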

Excel writer module: writer.py

import pandas as pd
from typing import List
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from scraper import Movie  # Import the Movie class

class Writer:
    def __init__(self, file_path: str):
        self.file_path = file_path

    def write(self, movies: List[Movie]):  # Specify the type of objects in the list
        data = {
            'Detail Link': [movie.detail_link for movie in movies],
            'Image Link': [movie.image_link for movie in movies],
            'Chinese Name': [movie.chinese_name for movie in movies],
            'Foreign Name': [movie.foreign_name for movie in movies],
            'Rating': [movie.rating for movie in movies],
            'Review Count': [movie.review_count for movie in movies],
            'Overview': [movie.overview for movie in movies],
            'Director': [movie.director for movie in movies],
            'Actors': [movie.actors for movie in movies],
            'Year': [movie.year for movie in movies],
            'Region': [movie.region for movie in movies],
            'Category': [movie.category for movie in movies]
        }
        df = pd.DataFrame(data)

        wb = Workbook()
        ws = wb.active

        for r in dataframe_to_rows(df, index=False, header=True):
            ws.append(r)

        wb.save(self.file_path)
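
As a design note, the Workbook/dataframe_to_rows loop rebuilds by hand what pandas can do directly: openpyxl is pandas' default engine for .xlsx files, so the last four statements of write() could be collapsed into a single call, sketched below.

        # Sketch: equivalent to the Workbook + dataframe_to_rows loop above
        df.to_excel(self.file_path, index=False)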

Main module: main.py

from scraper import Scraper, Movie
from writer import Writer

def main():
    # base_url = 'https://movie.douban.com/top250'
    base_url = "https://movie.douban.com/top250?start="
    file_path = 'douban_movies.xlsx'

    # Initialize scraper and scrape data
    scraper = Scraper(base_url)
    movies = scraper.scrape()

    # Initialize writer and write data to file
    writer = Writer(file_path)
    writer.write(movies)

if __name__ == '__main__':
    main()

Results
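
Running python main.py produces douban_movies.xlsx in the working directory, one row per movie. A minimal sketch to verify the output is to read the workbook back with pandas:

import pandas as pd

df = pd.read_excel('douban_movies.xlsx')
print(df.shape)   # expected: (250, 12)
print(df.head())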

Closing Notes

During my graduate studies I published five SCI-indexed papers on data mining, and I now work on data algorithms at a research institute. I have a solid grasp of Python and will occasionally share fundamentals and applied case studies on Python, machine learning, and deep learning, drawing on my own research experience.

I am committed to publishing only original content and to explaining things in the simplest way possible. Follow me and let's learn and grow together.

1. Invite three friends to follow this account, or share/"in-view" (在看) any three of its articles, then message me through the account backend to get the related datasets and source code.

2. Follow the 数据杂坛 official account and click 领资料 ("Get materials") to receive free books and learning materials.

3. If you have questions about this article, or need guidance on a paper, click 联系我 ("Contact me") to add the author on WeChat.
