动手学深度学习(PyTorch版)代码实践 - 深度学习基础 - 12 Kaggle竞赛:预测房价

12 Kaggle竞赛:预测房价

python 复制代码
import numpy as np
import pandas as pd
import torch
import hashlib
import os
import tarfile
import zipfile
import requests
from torch import nn
from d2l import torch as d2l

# Kaggle competition page:
# https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview
# Registry mapping a dataset name to its (url, sha1_hash) pair.
#@save
DATA_HUB = {}
# Base URL that the dataset file names in this script are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local path.

    Args:
        name: Key into DATA_HUB; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory the file is stored in (created if missing).

    Returns:
        Path of the local file inside ``cache_dir``.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        # Hash the cached file in 1 MiB chunks so large files are never
        # loaded into memory at once.
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Cache hit.
    print(f'正在从{url}下载{fname}...')
    # The timeout bounds each connect/read wait so a stalled server cannot
    # hang the script forever; the context manager releases the connection.
    with requests.get(url, stream=True, verify=True, timeout=60) as r:
        # Fail before touching the target file, so an HTTP error page is
        # never written to disk as if it were the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Stream the body to disk instead of buffering it via r.content.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname

def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive and extract it next to the archive file.

    Args:
        name: Dataset key passed to ``download``.
        folder: Optional sub-directory name to return instead of the
            archive path without its last extension.

    Returns:
        ``os.path.join(base_dir, folder)`` when ``folder`` is given, otherwise
        the archive path with its last extension removed (for ``x.tar.gz``
        that is ``x.tar``, because only the final suffix is split off).

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        archive = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        archive = tarfile.open(fname, 'r')
    else:
        # Raised explicitly (rather than `assert False`) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # NOTE(review): extractall trusts member paths inside the archive; only
    # use this with archives from a trusted source.
    with archive:  # Close the archive handle even if extraction fails.
        archive.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Fetch every dataset that is currently registered in DATA_HUB."""
    for dataset_name in DATA_HUB:
        download(dataset_name)

# Access and read the dataset.
# Register both Kaggle house-price CSV files with their expected SHA-1 hashes.
DATA_HUB.update({  #@save
    'kaggle_house_train': (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce',
    ),
    'kaggle_house_test': (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90',
    ),
})

# Download each CSV (or reuse the verified cached copy) and load it.
train_data = pd.read_csv(download(name='kaggle_house_train'))
test_data = pd.read_csv(download(name='kaggle_house_test'))

# Number of samples and columns in each dataset (recorded from a prior run):
#   train_data.shape -> (1460, 81)
#   test_data.shape  -> (1459, 80)
#
# To inspect the first four and last two features together with the label
# (the house price):
#   print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

# The first column is the ID, which carries no information for prediction,
# so it is dropped from both frames; the training frame also drops its last
# column, which holds the label. The two frames are then stacked row-wise.
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]],
    axis=0,
)
print(all_features.shape)

# Data preprocessing:
# standardize the numeric features, fill their missing values with 0, then
# one-hot encode the remaining (categorical) columns.

# Column names of all numeric features. Selecting by "is a number" instead of
# "is not object" keeps text columns out even when pandas stores them with a
# dedicated string dtype rather than `object`.
# NOTE(review): unlike the previous `!= 'object'` test this also skips
# bool/datetime columns; presumably the dataset has none -- confirm.
numeric_features = all_features.select_dtypes(include='number').columns

# Rescale every numeric column to zero mean and unit standard deviation.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std())
)

# After standardization each column has mean 0, so replacing missing values
# (NaN) with 0 is the same as imputing the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the categorical columns.
# pd.get_dummies converts categorical variables into indicator columns;
# dummy_na=True additionally encodes a missing value (NaN) as its own
# indicator column, and dtype=int emits 0/1 integers instead of booleans.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=int)
print(all_features.shape)

# The first n_train rows of all_features belong to the training set, the
# remaining rows to the test set (same order as the earlier concatenation).
n_train = len(train_data)
train_features = torch.tensor(
    all_features.iloc[:n_train].to_numpy(), dtype=torch.float32)
test_features = torch.tensor(
    all_features.iloc[n_train:].to_numpy(), dtype=torch.float32)

# Labels: take the SalePrice column, convert it to a NumPy array, reshape it
# into a 2-D array with n rows and 1 column, then wrap it in a float32 tensor.
train_labels = torch.tensor(
    train_data['SalePrice'].to_numpy().reshape(-1, 1),
    dtype=torch.float32,
)

# Training setup.
# Mean squared error (averaged over elements) is the training criterion.
loss = nn.MSELoss(reduction='mean')
# Width of the feature matrix = number of inputs of the first layer.
in_features = train_features.size(1)

# Disabled baseline: a single linear layer.
# def get_net():
#     return nn.Sequential(nn.Linear(in_features, 1))

def get_net():  # Tunable architecture.
    """Return a fresh MLP: in_features -> 64 (ReLU) -> 1.

    The input width is read from the module-level ``in_features``.
    """
    hidden_units = 64
    layers = [
        nn.Flatten(),
        nn.Linear(in_features, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
    ]
    return nn.Sequential(*layers)

def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Predictions are clamped to a lower bound of 1 before the logarithm is
    taken, so a zero or negative prediction cannot produce an undefined or
    numerically unstable value.

    Args:
        net: Callable model mapping ``features`` to predictions.
        features: Input tensor fed to ``net``.
        labels: Target tensor with the same shape as the predictions.

    Returns:
        The metric as a Python float.
    """
    # This is evaluation only: skip autograd bookkeeping so no computation
    # graph is built for the (potentially full-dataset) forward pass.
    with torch.no_grad():
        clipped_preds = torch.clamp(net(features), min=1)
        # Use the functional MSE directly so the metric does not depend on
        # whichever training criterion is bound at module level.
        mse = nn.functional.mse_loss(torch.log(clipped_preds),
                                     torch.log(labels))
        rmse = torch.sqrt(mse)

    # Convert the 0-d tensor into a Python scalar.
    return rmse.item()

# Training uses the Adam optimizer, which is less sensitive to the initial
# learning rate.
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimize in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Held-out tensors; pass ``None`` for
            ``test_labels`` to skip held-out evaluation.
        num_epochs: Number of passes over the training iterator.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay (L2 regularization).
        batch_size: Mini-batch size handed to ``d2l.load_array``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists; the second list
        stays empty when ``test_labels`` is ``None``.
    """
    train_history, test_history = [], []
    # Mini-batch iterator over the training data.
    batches = d2l.load_array((train_features, train_labels), batch_size)
    optimizer = torch.optim.Adam(
        net.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
    )
    evaluate_held_out = test_labels is not None

    for _ in range(num_epochs):
        for X, y in batches:
            optimizer.zero_grad()         # Reset accumulated gradients.
            batch_loss = loss(net(X), y)  # Forward pass and loss.
            batch_loss.backward()         # Backpropagation.
            optimizer.step()              # Parameter update.

        # Log-RMSE on the full training set after this epoch.
        train_history.append(log_rmse(net, train_features, train_labels))
        if evaluate_held_out:
            # Log-RMSE on the held-out set after this epoch.
            test_history.append(log_rmse(net, test_features, test_labels))

    return train_history, test_history

# K-fold cross-validation:
# the i-th slice becomes the validation data, the rest the training data.
def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into the training and validation parts of fold ``i``.

    The rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows
    each; rows left over when the row count is not divisible by ``k`` are not
    part of any fold.

    Args:
        k: Number of folds, must be greater than 1.
        i: Zero-based index of the fold used for validation.
        X: 2-D feature tensor, one sample per row.
        y: Label tensor whose first dimension matches ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        AssertionError: If ``k`` is not greater than 1.
        ValueError: If ``i`` is outside ``[0, k)``.
    """
    assert k > 1
    if not 0 <= i < k:
        # Without this check an out-of-range index would only surface as an
        # unrelated-looking unbound-variable error at the return statement.
        raise ValueError(f'fold index i={i} must satisfy 0 <= i < k={k}')
    fold_size = X.shape[0] // k
    folds = [slice(j * fold_size, (j + 1) * fold_size) for j in range(k)]

    X_valid, y_valid = X[folds[i], :], y[folds[i]]
    # Concatenate all training folds in one call instead of growing the
    # tensors fold by fold, which re-copies the accumulated data each time.
    train_folds = [idx for j, idx in enumerate(folds) if j != i]
    X_train = torch.cat([X[idx, :] for idx in train_folds], 0)
    y_train = torch.cat([y[idx] for idx in train_folds], 0)
    return X_train, y_train, X_valid, y_valid

# Train k times, once per fold, and return the training and validation
# errors averaged over the folds.
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation and average the final-epoch log-RMSE.

    A fresh network from ``get_net`` is trained for every fold. The learning
    curves of the first fold are plotted, and the final-epoch errors of every
    fold are printed.

    Returns:
        ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the ``k`` folds.
    """
    train_total = 0
    valid_total = 0
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        # Only the error after the last epoch counts towards the average.
        last_train, last_valid = train_ls[-1], valid_ls[-1]
        train_total += last_train
        valid_total += last_valid
        if fold == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'折{fold + 1},训练log rmse{float(last_train):f}, '
              f'验证log rmse{float(last_valid):f}')
    return train_total / k, valid_total / k

# Hyperparameters for cross-validation and the final training run.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 128

train_l, valid_l = k_fold(
    k=k,
    X_train=train_features,
    y_train=train_labels,
    num_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')

# Display the learning-curve figure produced during cross-validation.
d2l.plt.show()

# Produce the Kaggle submission.
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set and write predictions to submission.csv.

    Args:
        train_features, train_labels: Full training tensors.
        test_features: Test tensor, one row per row of ``test_data``.
        test_data: Test DataFrame with an ``Id`` column; a ``SalePrice``
            column holding the predictions is added to it in place.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train``.

    Side effects:
        Plots the training curve, prints the final training log-RMSE, mutates
        ``test_data`` and writes ``submission.csv`` to the working directory.
    """
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse:{float(train_ls[-1]):f}')
    # Apply the network to the test set; no gradients are needed here.
    with torch.no_grad():
        preds = net(test_features).cpu().numpy()
    # Assign the flat array positionally. Wrapping it in a pd.Series would
    # align on index labels and yield NaN wherever test_data does not carry
    # a default 0..n-1 index.
    test_data['SalePrice'] = preds.reshape(-1)
    # Reformat for export to Kaggle: only the Id and the prediction.
    submission = test_data[['Id', 'SalePrice']]
    submission.to_csv('submission.csv', index=False)

# Retrain on all training data with the hyperparameters chosen above and
# write the Kaggle submission file.
train_and_pred(
    train_features=train_features,
    test_features=test_features,
    train_labels=train_labels,
    test_data=test_data,
    num_epochs=num_epochs,
    lr=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)

# Display the training-curve figure.
d2l.plt.show()

运行结果:

python 复制代码
折1,训练log rmse0.076504, 验证log rmse0.150307
折2,训练log rmse0.074193, 验证log rmse0.170535
折3,训练log rmse0.068504, 验证log rmse0.182418
折4,训练log rmse0.067275, 验证log rmse0.133314
折5,训练log rmse0.105472, 验证log rmse0.197220
5-折验证: 平均训练log rmse: 0.078390, 平均验证log rmse: 0.166759
训练log rmse:0.079534
相关推荐
七牛云行业应用几秒前
iOS 19.3 突发崩溃!Gemini 3 导致 JSON 解析失败的紧急修复
人工智能·ios·swift·json解析·大模型应用
2301_800256114 分钟前
【人工智能引论期末复习】第6章 深度学习3-CNN
人工智能·深度学习·cnn
易晨 微盛·企微管家4 分钟前
2026企业微信社群管理:智能质检如何助力企业高效服务与合规运营
人工智能
NimoXie10 分钟前
Windows CUDA + cuDNN + TensorFlow + PyTorch 识别 GPU 的简单整合
pytorch·windows·tensorflow
لا معنى له18 分钟前
学习笔记:少样本学习
人工智能·笔记·深度学习·学习·机器学习
一见19 分钟前
Skills、Rules和KnowledgeBase的概念和区别
人工智能·ai编程
Deepoch20 分钟前
从“机械执行”到“意图理解”:Deepoc如何重塑人机交互新范式
人工智能·机器人·开发板·具身模型·deepoc
小郭团队20 分钟前
1_1_七段式SVPWM (传统算法反正切)算法理论与 MATLAB 实现详解
人工智能·stm32·嵌入式硬件·算法·dsp开发
民乐团扒谱机21 分钟前
【微实验】多目标背包问题的整数规划解法对比(MATLAB 实现)
人工智能·多目标优化·01背包问题·蒙特卡罗·帕累托前沿
love530love22 分钟前
突破 ComfyUI 环境枷锁:RTX 3090 强行开启 comfy-kitchen 官方全后端加速库实战
人工智能·windows·python·cuda·comfyui·triton·comfy-kitchen