Assignment 2

Assignment Requirements

1. Assignment Content

Assignment ①

  • Requirement: Scrape the 7-day weather forecast for a given set of cities from the China Weather Network (http://www.weather.com.cn) and save it to a database.
  • Code and screenshots

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

class WeatherDB:
    def openDB(self):
        self.con=sqlite3.connect("weathers.db")
        self.cursor=self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except Exception:
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()
    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0"}

        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "廊坊": "101090601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "廊坊"])
print("completed")
  • Reflection
    This assignment was mainly a reproduction of existing code, so the difficulty was manageable.

Assignment ②

  • Requirement: Use the requests and BeautifulSoup libraries to scrape stock information from a target site and store it in a database.
  • Candidate websites
  • Tip: Open the F12 developer tools in Chrome and capture the network traffic to find the URL that loads the stock list, then inspect the values the API returns and adjust the request parameters to the fields you need. From the URL you can see that parameters such as f1 and f2 select different values, and parameters that are not needed can be removed (see the sketch after this list).
  • Reference link: https://zhuanlan.zhihu.com/p/50099084
  • Code and screenshots
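
Following the tip above, the stock list itself is served by a separate "clist" endpoint. The sketch below is purely illustrative and is not part of the submitted script; it assumes the list API is http://push2.eastmoney.com/api/qt/clist/get and that the f-numbered fields f12/f14/f2/f3 correspond to code, name, latest price, and change percent (field numbers observed through F12 and subject to change):

import requests

# Hypothetical parameter set captured with F12; unneeded f-parameters can simply
# be dropped from the "fields" string, as the tip above suggests.
url = "http://push2.eastmoney.com/api/qt/clist/get"
params = {
    "pn": 1,                     # page number
    "pz": 20,                    # page size
    "po": 1,
    "np": 1,
    "fltt": 2,
    "invt": 2,
    "fid": "f3",                 # sort field (change percent, assumption)
    "fs": "m:1+t:2",             # market filter for Shanghai A-shares (assumption)
    "fields": "f2,f3,f12,f14",   # price, change %, code, name (assumption)
}
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get(url, params=params, headers=headers, timeout=10)
data = resp.json().get("data") or {}
for item in data.get("diff", []):
    print(item.get("f12"), item.get("f14"), item.get("f2"), item.get("f3"))

The full script submitted for this assignment follows.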

import requests
import pandas as pd
import json
import re
import time


class SimpleStockSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
        }

    def get_stock_data(self):
        """获取股票数据"""
        stocks_data = []

        # 主要股票代码
        stock_codes = [
            ('1.000001', '上证指数'),
            ('0.399001', '深证成指'),
            ('0.399006', '创业板指'),
            ('1.600036', '招商银行'),
            ('0.000858', '五粮液')
        ]

        for code, name in stock_codes:
            try:
                url = 'http://push2.eastmoney.com/api/qt/stock/get'
                params = {
                    'ut': 'fa5fd1943c7b386f172d6893dbfba10b',
                    'invt': '2',
                    'fltt': '2',
                    'fields': 'f43,f44,f45,f46,f60,f84,f85,f86,f169,f170,f171',
                    'secid': code,
                    'cb': f'jQuery1124_{int(time.time() * 1000)}',
                    '_': str(int(time.time() * 1000))
                }

                response = requests.get(url, params=params, headers=self.headers, timeout=10)

                if response.status_code == 200:
                    content = response.text
                    json_str = re.search(r'\{.*\}', content)
                    if json_str:
                        data = json.loads(json_str.group())
                        if data.get('data'):
                            stock = data['data']
                            stock_info = {
                                '股票代码': code.split('.')[-1],
                                '股票名称': name,
                                '最新报价': stock.get('f43', 0) / 100 if stock.get('f43') else 0,
                                '涨跌幅': round(stock.get('f170', 0) / 100, 2),
                                '涨跌额': round(stock.get('f169', 0) / 100, 2),
                                '成交量': f"{stock.get('f84', 0) / 10000:.2f}万",
                                '成交额': f"{stock.get('f86', 0) / 100000000:.2f}亿",
                                '振幅': round(stock.get('f171', 0) / 100, 2),
                                '最高': stock.get('f44', 0) / 100 if stock.get('f44') else 0,
                                '最低': stock.get('f45', 0) / 100 if stock.get('f45') else 0,
                                '今开': stock.get('f46', 0) / 100 if stock.get('f46') else 0,
                                '昨收': stock.get('f60', 0) / 100 if stock.get('f60') else 0
                            }
                            stocks_data.append(stock_info)

                time.sleep(0.5)

            except Exception as e:
                print(f"获取股票 {name} 数据失败: {e}")
                continue

        return stocks_data

    def save_to_csv(self, stocks_data):
        """保存到CSV文件"""
        if stocks_data:
            df = pd.DataFrame(stocks_data)
            df.to_csv('stock_data.csv', index=False, encoding='utf-8-sig')
            print(f"数据已保存到 stock_data.csv,共 {len(stocks_data)} 条记录")

            # 显示数据
            print("\n爬取的股票数据:")
            print(df.to_string(index=False))
        else:
            print("没有获取到数据")

    def run(self):
        """运行爬虫"""
        print("开始爬取股票数据...")
        stocks_data = self.get_stock_data()
        self.save_to_csv(stocks_data)



if __name__ == "__main__":
    spider = SimpleStockSpider()
    spider.run()
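
The requirement asks for the data to be stored in a database, while the script above only writes a CSV file. As a complement, here is a minimal sketch of my own (assuming the stocks_data list returned by get_stock_data() above) that inserts the same rows into a local SQLite file:

import sqlite3

def save_to_sqlite(stocks_data, db_path="stock_data.db"):
    # Mirror the keys built in get_stock_data() into a simple SQLite table.
    con = sqlite3.connect(db_path)
    con.execute(
        "create table if not exists stocks ("
        "code text, name text, price real, change_pct real, change_amt real, "
        "volume text, turnover text, amplitude real, high real, low real, "
        "open real, prev_close real)"
    )
    for s in stocks_data:
        con.execute(
            "insert into stocks values (?,?,?,?,?,?,?,?,?,?,?,?)",
            (s['股票代码'], s['股票名称'], s['最新报价'], s['涨跌幅'], s['涨跌额'],
             s['成交量'], s['成交额'], s['振幅'], s['最高'], s['最低'], s['今开'], s['昨收']),
        )
    con.commit()
    con.close()

# Example: save_to_sqlite(SimpleStockSpider().get_stock_data())
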
  • Reflection
  1. The tutorial is fairly old and the URL in its code no longer works, so I had to capture the request myself; it took quite a while to find the right one.

Assignment ③

import requests
import json
import pymysql
from sqlalchemy import create_engine, text
import pandas as pd


class UniversityRankingSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0',
            'Referer': 'https://www.shanghairanking.cn/'
        }
        self.api_url = 'https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021'
        self.db_engine = None
        self.init_database()

    def init_database(self):
        """初始化数据库连接"""
        try:
            # MySQL数据库配置 - 请根据你的环境修改
            db_config = {
                'host': 'localhost',
                'port': 3306,
                'user': 'root',
                'password': 'your_password',
                'database': 'university_ranking'
            }

            self.db_engine = create_engine(
                f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
            )

            # 创建数据表
            self.create_table()
            print("数据库连接成功!")

        except Exception as e:
            print(f"数据库连接失败: {e}")
            print("将数据保存到CSV文件")

    def create_table(self):
        """创建大学排名数据表"""
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS university_ranking_2021 (
            id INT AUTO_INCREMENT PRIMARY KEY,
            排名 INT NOT NULL,
            学校名称 VARCHAR(100) NOT NULL,
            省市 VARCHAR(50),
            类型 VARCHAR(20),
            总分 DECIMAL(8,2),
            更新时间 TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE KEY unique_rank_school (排名, 学校名称)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """

        try:
            with self.db_engine.connect() as conn:
                conn.execute(text(create_table_sql))
        except Exception as e:
            print(f"创建表失败: {e}")

    def get_university_data(self):
        """获取大学排名数据"""
        try:
            print("正在从API获取数据...")
            response = requests.get(self.api_url, headers=self.headers, timeout=30)
            response.raise_for_status()

            # 解析JSON数据
            data = json.loads(response.text)
            rankings = data['data']['rankings']

            universities = []
            for item in rankings:
                university_info = {
                    '排名': item['rankOverall'],
                    '学校名称': item['univNameCn'],
                    '省市': item['province'],
                    '类型': item['univCategory'],
                    '总分': float(item['score']) if item['score'] else 0
                }
                universities.append(university_info)

            print(f"成功获取 {len(universities)} 所大学的数据")
            return universities

        except Exception as e:
            print(f"获取数据失败: {e}")
            return []

    def save_to_database(self, universities):
        """保存数据到数据库"""
        if not universities:
            print("没有数据可保存")
            return False

        try:
            df = pd.DataFrame(universities)

            if self.db_engine:
                # 保存到MySQL数据库
                df.to_sql('university_ranking_2021', self.db_engine, if_exists='replace', index=False)
                print(f"成功保存 {len(universities)} 条数据到数据库")
            else:
                # 保存到CSV文件
                df.to_csv('university_ranking_2021.csv', index=False, encoding='utf-8-sig')
                print(f"成功保存 {len(universities)} 条数据到CSV文件")

            return True

        except Exception as e:
            print(f"保存数据失败: {e}")
            return False

    def display_data(self, universities, num=20):
        """显示前N条数据"""
        if not universities:
            print("没有数据可显示")
            return

        print("\n" + "=" * 80)
        print("2021年中国大学排名(主榜)前{}名".format(num))
        print("=" * 80)
        print(f"{'排名':<8} {'学校名称':<20} {'省市':<10} {'类型':<8} {'总分':<8}")
        print("-" * 80)

        for uni in universities[:num]:
            print(f"{uni['排名']:<8} {uni['学校名称']:<20} {uni['省市']:<10} {uni['类型']:<8} {uni['总分']:<8.1f}")

        print("=" * 80)

    def run(self):
        """运行爬虫"""
        print("开始爬取2021年中国大学排名数据...")

        # 获取数据
        universities = self.get_university_data()

        if universities:
            # 显示前20条数据
            self.display_data(universities, 20)

            # 保存数据
            self.save_to_database(universities)

            print(f"\n爬取完成!共获取 {len(universities)} 所大学的数据")
        else:
            print("未能获取到数据")


# 简化版本(无需数据库配置)
class SimpleUniversitySpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
        }
        self.api_url = 'https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021'

    def get_and_save_data(self):
        """获取并保存数据"""
        try:
            print("正在获取2021年中国大学排名数据...")
            response = requests.get(self.api_url, headers=self.headers, timeout=30)
            response.raise_for_status()

            data = json.loads(response.text)
            rankings = data['data']['rankings']

            universities = []
            for item in rankings:
                universities.append([
                    item['rankOverall'],
                    item['univNameCn'],
                    item['province'],
                    item['univCategory'],
                    float(item['score']) if item['score'] else 0
                ])

            # 保存到CSV文件
            df = pd.DataFrame(universities, columns=['排名', '学校名称', '省市', '类型', '总分'])
            df.to_csv('中国大学排名2021.csv', index=False, encoding='utf-8-sig')

            # 显示前20名
            print("\n前20名大学排名:")
            print("排名\t学校名称\t\t省市\t类型\t总分")
            print("-" * 60)
            for i in range(min(20, len(universities))):
                uni = universities[i]
                print(f"{uni[0]}\t{uni[1]}\t{uni[2]}\t{uni[3]}\t{uni[4]:.1f}")

            print(f"\n成功获取并保存 {len(universities)} 所大学的数据到 '中国大学排名2021.csv'")

        except Exception as e:
            print(f"获取数据失败: {e}")


# 运行示例
if __name__ == "__main__":

    spider = SimpleUniversitySpider()
    spider.get_and_save_data()

    # 如果需要使用数据库版本,取消注释下面的代码
    # spider = UniversityRankingSpider()
    # spider.run()
  • Reflection
    This site looks simple but is actually quite challenging. When I found that the URL does not change after paging, I knew something was off (it means the traditional page-by-page approach would not work). Inspecting with F12 showed that the site's data is deployed dynamically by JavaScript: all the elements live in payload.js, and the accompanying script extracts, executes, and renders them.
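
For reference, the idea described above could be sketched roughly as follows. This is only an illustration, not what the submitted script does (the script calls the public API instead). It assumes the ranking page is https://www.shanghairanking.cn/rankings/bcur/2021 and that its HTML references a payload.js file by name; the regex is only a sanity check of what that payload contains, since fully parsing it would require executing the accompanying JavaScript:

import re
import requests
from urllib.parse import urljoin

headers = {"User-Agent": "Mozilla/5.0"}
page_url = "https://www.shanghairanking.cn/rankings/bcur/2021"
html = requests.get(page_url, headers=headers, timeout=30).text

# Assumption: the page HTML references the payload.js script by name.
m = re.search(r'["\']([^"\']*payload\.js)["\']', html)
if m:
    payload_url = urljoin(page_url, m.group(1))
    payload = requests.get(payload_url, headers=headers, timeout=30).text
    # Quick sanity check for literal univNameCn values; depending on how the
    # payload is packed, proper parsing may require executing the JS itself.
    names = re.findall(r'univNameCn:"([^"]+)"', payload)
    print(len(names), names[:10])
else:
    print("payload.js reference not found in the page HTML")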

GITEE code repository