lightgbm做分类

python 复制代码
```python
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
import json#用于读取和写入json数据格式

#model lgb分类模型,日志评估,早停防止过拟合
from  lightgbm import LGBMClassifier,log_evaluation,early_stopping
#metric
from sklearn.metrics import roc_auc_score#导入roc_auc曲线
#KFold是直接分成k折,StratifiedKFold还要考虑每种类别的占比
from sklearn.model_selection import StratifiedKFold

#config
class Config:
    """Constants shared by the rest of the script."""
    # Random seed.
    seed = 2024
    # Number of folds for K-fold cross-validation.
    num_folds = 10
    # Name of the label column.
    TARGET_NAME = 'label'
import random#提供了一些用于生成随机数的函数
#设置随机种子,保证模型可以复现
def seed_everything(seed):
    """Seed the global random generators so that runs can be reproduced.

    Args:
        seed: Value passed to both Python's ``random`` module and NumPy's
            legacy global generator.
    """
    # The two generators are independent, so the order does not matter.
    random.seed(seed)
    np.random.seed(seed)
seed_everything(Config.seed)


path='/kaggle/input/'
# Directory that holds all JSON files read below.
data_dir=path+"whoiswho-ind-kdd-2024/IND-WhoIsWho/"


def load_json(file_name):
    """Read one JSON file from ``data_dir`` and return the parsed object.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default encoding; the result file written at the end
    of the script is UTF-8 as well.

    Args:
        file_name: File name relative to ``data_dir``.

    Returns:
        Whatever ``json.load`` produces for the file.
    """
    with open(data_dir+file_name,encoding='utf-8') as f:
        return json.load(f)


# sample: Iki037dt dict_keys(['name', 'normal_data', 'outliers'])
train_author=load_json("train_author.json")
# sample: 6IsfnuWU dict_keys(['id', 'title', 'authors', 'abstract', 'keywords', 'venue', 'year'])
pid_to_info=load_json("pid_to_info_all.json")
# sample: efQ8FQ1i dict_keys(['name', 'papers'])
valid_author=load_json("ind_valid_author.json")
# Submission template; its values are overwritten with predictions later on.
submission=load_json("ind_valid_author_submit.json")

def paper_features(paper):
    """Turn one paper record into the six numeric baseline features.

    The features follow the field order
    ['title', 'abstract', 'keywords', 'authors', 'venue', 'year']:
    the length of each of the first five fields, then the year as an int.

    The fifth feature previously repeated ``len(paper['keywords'])``; it now
    uses ``venue`` as the field list indicates.
    NOTE(review): assumes ``venue`` is a string or missing/None - confirm
    against the data; a missing or empty venue counts as length 0.

    Args:
        paper: Dict for one paper, as stored in ``pid_to_info``.

    Returns:
        A list of six numbers.
    """
    try:
        year=int(paper['year'])
    except (KeyError,TypeError,ValueError):
        # Missing, empty or non-numeric year: use the fixed placeholder 2000.
        year=2000
    return [
        len(paper['title']),
        len(paper['abstract']),
        len(paper['keywords']),
        len(paper['authors']),
        len(paper.get('venue') or ''),
        year,
    ]


# Training set: papers under 'normal_data' are positives (label 1) and papers
# under 'outliers' are negatives (label 0).
train_feats=[]
labels=[]
for person_info in train_author.values():
    for group,label in (('normal_data',1),('outliers',0)):
        for text_id in person_info[group]:
            train_feats.append(paper_features(pid_to_info[text_id]))
            labels.append(label)
train_feats=np.array(train_feats)
labels=np.array(labels)
print(f"train_feats.shape:{train_feats.shape},labels.shape:{labels.shape}")
print(f"np.mean(labels):{np.mean(labels)}")
train_feats=pd.DataFrame(train_feats)
# Use the configured column name so it always matches what the training
# function reads back.
train_feats[Config.TARGET_NAME]=labels
train_feats.head()

# Validation set: one row per paper, in the iteration order of valid_author.
valid_feats=[
    paper_features(pid_to_info[text_id])
    for person_info in valid_author.values()
    for text_id in person_info['papers']
]
valid_feats=np.array(valid_feats)
print(f"valid_feats.shape:{valid_feats.shape}")
valid_feats=pd.DataFrame(valid_feats)
valid_feats.head()

choose_cols=list(valid_feats.columns)
def fit_and_predict(model,train_feats=train_feats,test_feats=valid_feats,name=0):
    """Cross-validate ``model`` and collect out-of-fold and test probabilities.

    Args:
        model: Estimator whose ``fit`` accepts ``eval_set`` and ``callbacks``
            and whose ``predict_proba`` returns two columns. The same
            instance is refitted on every fold.
        train_feats: DataFrame holding the ``choose_cols`` feature columns
            and the ``Config.TARGET_NAME`` label column.
        test_feats: DataFrame holding the ``choose_cols`` feature columns.
        name: Tag that is only used in the progress output.

    Returns:
        A tuple ``(oof_pred_pro, test_pred_pro)``: out-of-fold probabilities
        of shape ``(len(train_feats), 2)`` and per-fold test probabilities of
        shape ``(Config.num_folds, len(test_feats), 2)``.
    """
    features=train_feats[choose_cols].copy()
    target=train_feats[Config.TARGET_NAME].copy()
    test_features=test_feats[choose_cols].copy()

    n_folds=Config.num_folds
    oof_pred_pro=np.zeros((len(features),2))
    test_pred_pro=np.zeros((n_folds,len(test_features),2))

    # Stratified, shuffled K-fold split with a fixed seed.
    splitter=StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=Config.seed)
    fold_indices=splitter.split(features,target.astype(str))

    for fold,(fit_rows,holdout_rows) in enumerate(fold_indices):
        print(f"name:{name},fold:{fold}")

        holdout_X=features.iloc[holdout_rows]
        # The held-out fold is also the evaluation set that drives logging
        # (every 100 rounds) and early stopping (patience of 100 rounds).
        model.fit(
            features.iloc[fit_rows],
            target.iloc[fit_rows],
            eval_set=[(holdout_X,target.iloc[holdout_rows])],
            callbacks=[log_evaluation(100),early_stopping(100)],
        )

        oof_pred_pro[holdout_rows]=model.predict_proba(holdout_X)
        # Every fold's model scores the complete test set.
        test_pred_pro[fold]=model.predict_proba(test_features)

    auc=roc_auc_score(target.values,oof_pred_pro[:,1])
    print(f"roc_auc:{auc}")

    return oof_pred_pro,test_pred_pro
# Parameters taken from: https://www.kaggle.com/code/daviddirethucus/home-credit-risk-lightgbm
# The dict is unpacked into LGBMClassifier(**lgb_params) further down.
lgb_params={
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 12,
    "learning_rate": 0.05,
    # Upper bound on boosting rounds; training uses an early-stopping callback.
    "n_estimators":3072,
    "colsample_bytree": 0.9,
    "colsample_bynode": 0.9,
    # Listed once: the original literal repeated this key with the same value.
    "verbose": -1,
    "random_state": Config.seed,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "max_bin":255,
    }


# Cross-validate a LightGBM classifier built from lgb_params.
lgb_model=LGBMClassifier(**lgb_params)
lgb_oof_pred_pro,lgb_test_pred_pro=fit_and_predict(model=lgb_model,name='lgb')
# Average the per-fold test probabilities, then keep the second column
# (the one the script also scores with roc_auc against the labels).
fold_averaged=lgb_test_pred_pro.mean(axis=0)
test_preds=fold_averaged[:,1]


# Fill the submission template with the predictions and write it to disk.
# Predictions are matched to template entries purely by position, so first
# check that the template has exactly as many slots as there are predictions;
# otherwise a misaligned file could be written without any error.
# NOTE(review): the template presumably maps author id -> {paper id: score}
# in the same order in which the validation features were built - confirm.
n_slots=sum(len(entries) for entries in submission.values())
if n_slots!=len(test_preds):
    raise ValueError(
        f"submission template has {n_slots} entries but there are "
        f"{len(test_preds)} predictions"
    )

scores=iter(test_preds)
for entries in submission.values():
    for entry_key in entries:
        # Convert to a built-in float so JSON serialisation does not depend
        # on the NumPy scalar type.
        entries[entry_key]=float(next(scores))

with open('baseline.json', 'w', encoding='utf-8') as f:
    json.dump(submission, f, ensure_ascii=False, indent=4)
```
相关推荐
染指11101 小时前
26.RAG进阶(Advanced RAG)-假设性问题索引
人工智能·windows·agent·rag·advanced rag
闵孚龙1 小时前
动态图机制:为什么 PyTorch 调试起来更舒服
人工智能·pytorch·python
甲维斯2 小时前
还要啥Codex!DeepSeek接入Zcode远程连接!
人工智能
百胜软件@百胜软件2 小时前
百胜软件亮相“AI消费新生活”主题日活动,AI智能运营平台入选市级案例征集
人工智能·生活·零售数字化·数智中台·珠宝行业
专注搞钱3 小时前
GPT-4o写设备Recipe:从3小时到10分钟
数据库·人工智能·gpt·半导体
闻道参看3 小时前
贝芯宠AI灵兽 ELFVET 大模型聚焦临床应用,强化宠物诊疗综合能力
人工智能·宠物
MartinYeung53 小时前
[论文学习]重新思考大型语言模型忘却目标:梯度视角与超越
人工智能·学习·语言模型
财经资讯数据_灵砚智能3 小时前
基于全球经济类多源新闻的NLP情感分析与数据可视化(夜间-次晨)2026年6月14日
大数据·人工智能·python·ai·信息可视化·自然语言处理·灵砚智能
m0_380167144 小时前
加密货币价格 API、市场数据 API 与 分析 API 有什么区别?
人工智能·ai·区块链
zyplayer-doc4 小时前
企业知识库安全与权限管理完全指南:从加密到审计的六层防护
人工智能·安全·pdf·编辑器·创业创新