数据挖掘目标(价格预测挑战)

复制代码
import time
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Ridge

from sklearn.pipeline import FeatureUnion

In [3]:

复制代码
# Load the tab-separated training and test sets.
train_data = pd.read_csv('../data/4/train.csv', sep="\t")
test_data = pd.read_csv('../data/4/test.csv',sep='\t')
# NOTE(review): disabled experiment that stacked five copies of each frame; the
# *_1 names it refers to are not defined anywhere in the visible code.
# train_data = pd.concat([train_data_1, train_data_1, train_data_1, train_data_1, train_data_1], axis=0)
# pre_data = pd.concat([pre_data_1, pre_data_1, pre_data_1, pre_data_1, pre_data_1], axis=0)

In [5]:

复制代码
# Print the column names, non-null counts and dtypes of the training frame.
train_data.info()
# Column reference:
# train_id – training row id              name – item name
# item_condition_id – current item condition    brand_name – brand name
# shipping – whether shipping is free     item_description – item description
# category_name – item category           price – item price
复制代码
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474710 entries, 0 to 474709
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   train_id           474710 non-null  int64  
 1   name               474710 non-null  object 
 2   item_condition_id  474710 non-null  int64  
 3   category_name      472655 non-null  object 
 4   brand_name         272297 non-null  object 
 5   price              474710 non-null  float64
 6   shipping           474710 non-null  int64  
 7   item_description   474708 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 29.0+ MB

In [7]:

复制代码
# Stack the training rows on top of the test rows so that both are vectorised
# together later on. The two frames do not share all columns (train_id vs.
# test_id), so sort=False is passed explicitly: the column order is kept as-is
# on every pandas version and older releases do not emit the
# "non-concatenation axis is not aligned" FutureWarning.
df = pd.concat([train_data, test_data], axis=0, sort=False)

In [6]:

复制代码
# The columns with missing values are all string-typed: fill the gaps with a marker token, and cast the integer columns to str.
def featureProcessing(df):
    """Prepare a raw item frame for text vectorisation.

    Drops the target and id columns, replaces missing text values with
    placeholder tokens and casts the integer-coded columns to ``str`` so that
    every remaining column holds strings.

    Args:
        df: DataFrame containing at least ``category_name``, ``brand_name``,
            ``item_description``, ``shipping`` and ``item_condition_id``.
            ``price``, ``test_id`` and ``train_id`` are removed when present.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    # Remove columns that are not used as features. errors='ignore' lets the
    # function also run on a train-only or test-only frame, which lacks some
    # of these columns.
    df = df.drop(['price', 'test_id', 'train_id'], axis=1, errors='ignore')

    # Replace missing values with a placeholder token and force str dtype.
    df['category_name'] = df['category_name'].fillna('MISS').astype(str)
    df['brand_name'] = df['brand_name'].fillna('missing').astype(str)
    df['item_description'] = df['item_description'].fillna('No').astype(str)

    # Integer-coded columns: int -> str.
    df['shipping'] = df['shipping'].astype(str)
    df['item_condition_id'] = df['item_condition_id'].astype(str)

    return df

In [4]:

复制代码
# df = pd.concat([train_data, test_data], axis=0)
复制代码
c:\users\skd621\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.

In [8]:

复制代码
# Drop the id/target columns, fill missing text values and cast the integer
# columns to str on the combined frame.
df = featureProcessing(df)

In [10]:

复制代码
# Regression target: log(1 + price). Predictions are mapped back to prices
# with np.expm1 after the model has been fitted.
y_train = np.log1p(train_data['price'])

In [11]:

复制代码
# The preprocessing callable a default-configured CountVectorizer would apply;
# reused by the per-column preprocessors built below.
default_preprocessor = CountVectorizer().build_preprocessor()

In [12]:

复制代码
def build_preprocessor_1(field):
    """Build a preprocessor that extracts one column from a row and cleans it.

    The position of ``field`` is looked up once, at build time, in the columns
    of the module-level ``df``. The returned callable takes a row (indexable by
    position), picks the value at that position and passes it through
    ``default_preprocessor``.

    Raises:
        ValueError: if ``field`` is not one of ``df``'s columns.
    """
    column_names = list(df.columns)
    position = column_names.index(field)

    def preprocess_row(row):
        return default_preprocessor(row[position])

    return preprocess_row

In [13]:

复制代码
# One vectorizer per column; FeatureUnion concatenates their outputs side by
# side. Each vectorizer receives the whole row and its preprocessor selects
# the column it is responsible for.
# The regex token patterns are raw strings: '\d' inside a normal string
# literal is an invalid escape sequence and triggers a warning on current
# Python versions.
vectorizer = FeatureUnion([
    # Word uni- and bigrams of the item name, capped at 50k features.
    ('name', CountVectorizer(ngram_range=(1, 2), max_features=50000, preprocessor=build_preprocessor_1('name'))),
    # '.+' keeps the field value as one token instead of splitting it into words.
    ('category_name', CountVectorizer(token_pattern=r'.+', preprocessor=build_preprocessor_1('category_name'))),
    ('brand_name', CountVectorizer(token_pattern=r'.+', preprocessor=build_preprocessor_1('brand_name'))),
    # Numeric codes that were cast to str: tokenise on runs of digits.
    ('shipping', CountVectorizer(token_pattern=r'\d+', preprocessor=build_preprocessor_1('shipping'))),
    ('item_condition_id', CountVectorizer(token_pattern=r'\d+', preprocessor=build_preprocessor_1('item_condition_id'))),
    # TF-IDF over word 1- to 3-grams of the description, capped at 100k features.
    ('item_description', TfidfVectorizer(ngram_range=(1, 3), max_features=100000, preprocessor=build_preprocessor_1('item_description'))),
])

In [19]:

复制代码
# Vectorise the combined train + test rows in a single pass.
X = vectorizer.fit_transform(df.values)
# Number of training rows; they come first in the combined matrix.
nrow_train = len(train_data)
# Split the feature matrix back into its training and test parts.
X_train, X_test = X[:nrow_train], X[nrow_train:]

In [22]:

复制代码
def ridgeClassify(train_data, train_label):
    """Fit a ridge regression model and return the fitted estimator.

    Args:
        train_data: feature matrix passed to ``Ridge.fit``.
        train_label: target values passed to ``Ridge.fit``.

    Returns:
        The ``Ridge`` instance after fitting.
    """
    ridge_params = {
        'solver': 'auto',
        'fit_intercept': True,
        'alpha': 0.5,
        'max_iter': 500,
        'tol': 0.05,
    }
    model = Ridge(**ridge_params)
    # Train on the supplied data.
    model.fit(train_data, train_label)
    return model

In [24]:

复制代码
# Fit the ridge model on the training features and the log1p-transformed prices.
ridgeClf = ridgeClassify(X_train, y_train)
# Predict on the test features and undo the log1p transform to get prices.
test_price = np.expm1(ridgeClf.predict(X_test))

In [25]:

复制代码
# Reference prices for the test rows: the 'price' column of a tab-separated
# file, as a plain list.
true_price = pd.read_csv("../data/4/label_test.csv", sep="\t").price.tolist()

In [26]:

复制代码
from sklearn.metrics import mean_squared_log_error

In [27]:

复制代码
# Mean squared logarithmic error of the predicted prices against the reference prices.
mean_squared_log_error(true_price, test_price)

Out[27]:

复制代码
0.2398692547251235

In [28]:

复制代码
def score(predict_label, true_label):
    """Return the mean squared logarithmic error of the predictions.

    Computes ``mean((log(1 + p) - log(1 + t)) ** 2)`` over paired elements of
    ``predict_label`` and ``true_label``.

    Args:
        predict_label: sequence of predicted values.
        true_label: sequence of reference values, same length as
            ``predict_label``.

    Returns:
        The mean squared log error as a NumPy float.

    Raises:
        ValueError: if the inputs are empty or differ in length. Pairing the
            elements with ``zip`` would silently drop the surplus entries
            while still dividing by the full prediction count.
    """
    predicted = np.asarray(predict_label, dtype=float)
    actual = np.asarray(true_label, dtype=float)

    if predicted.shape != actual.shape:
        raise ValueError(
            "predict_label and true_label must have the same length, got "
            f"{predicted.shape} and {actual.shape}"
        )
    if predicted.size == 0:
        raise ValueError("cannot score empty inputs")

    # log1p is more accurate than log(x + 1) for values close to zero.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.mean(np.square(log_diff))
相关推荐
闪电麦坤9519 分钟前
数据结构:二维数组(2D Arrays)
数据结构·算法
之歆21 分钟前
Python-封装和解构-set及操作-字典及操作-解析式生成器-内建函数迭代器-学习笔记
笔记·python·学习
凌肖战31 分钟前
力扣网C语言编程题:快慢指针来解决 “寻找重复数”
c语言·算法·leetcode
麻雀无能为力38 分钟前
CAU数据挖掘 支持向量机
人工智能·支持向量机·数据挖掘·中国农业大学计算机
智能汽车人1 小时前
Robot---能打羽毛球的机器人
人工智能·机器人·强化学习
埃菲尔铁塔_CV算法1 小时前
基于 TOF 图像高频信息恢复 RGB 图像的原理、应用与实现
人工智能·深度学习·数码相机·算法·目标检测·计算机视觉
ζั͡山 ั͡有扶苏 ั͡✾1 小时前
AI辅助编程工具对比分析:Cursor、Copilot及其他主流选择
人工智能·copilot·cursor
东临碣石821 小时前
【AI论文】数学推理能否提升大型语言模型(LLM)的通用能力?——探究大型语言模型推理能力的可迁移性
人工智能·语言模型·自然语言处理
天天爱吃肉82181 小时前
ZigBee通信技术全解析:从协议栈到底层实现,全方位解读物联网核心无线技术
python·嵌入式硬件·物联网·servlet
IT古董1 小时前
【第二章:机器学习与神经网络概述】04.回归算法理论与实践 -(3)决策树回归模型(Decision Tree Regression)
神经网络·机器学习·回归