- 逻辑回归模型参数详解

dual:是否求解对偶问题,默认为 False。对偶形式仅在 penalty='l2' 且 solver='liblinear' 时可用;如果数据量小但特征很多(样本数少于特征数)可以改成 True,样本数多于特征数时建议保持 False
max_iter、tol:一般可以把 max_iter 设置得大一些、把 tol 设置得小一些,让优化算法有足够的迭代次数收敛到更精确的解
class_weight:输入 {0:1, 1:3} 代表 1 类样本的每条数据在计算损失函数时权重都会乘以 3;当输入 'balanced' 时,各类别的权重会调整为真实样本比例的反比,以达到平衡,但实际情况中不常用
multi_class:默认为 'auto',模型会根据数据是否为二分类以及所用的 solver 自动选择 OVR(一对多)还是 multinomial(即 MvM,多对多):数据为二分类或 solver='liblinear' 时选择 'ovr',否则选择 'multinomial'
Solver:


- 逻辑回归调参实验
首先介绍 PolynomialFeatures 转换器,其参数有 degree(最高阶数)、interaction_only(是否只保留交叉项,即不生成单个特征的高次幂)、include_bias(是否包含 0 阶项,即取值全为 1 的偏置列)
具体实验代码如下:
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
# In[2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# In[3]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
# In[4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# In[5]:
# Draw 1000 two-dimensional standard-normal samples with a fixed seed.
np.random.seed(24)
x = np.random.normal(0, 1, size=(1000, 2))

# In[6]:
# Label a sample 1 when x0 + x1**2 < 1.5, otherwise 0.
y = np.array(x[:, 0] + x[:, 1] ** 2 < 1.5, int)

# In[7]:
plt.scatter(x[:, 0], x[:, 1], c=y)  # c sets the colour of each point

# In[8]:
# Add label noise: 200 times, overwrite one randomly chosen label with 1
# and another randomly chosen label with 0.
np.random.seed(24)
for _ in range(200):
    y[np.random.randint(1000)] = 1
    y[np.random.randint(1000)] = 0

# In[9]:
plt.scatter(x[:, 0], x[:, 1], c=y)

# In[10]:
# 70% of the samples go to the training set; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=42)
# In[26]:
def plr(degree=1, penalty='none', C=1.0):
    """Build an unfitted polynomial logistic-regression pipeline.

    The pipeline chains three steps:
    ``PolynomialFeatures(degree, include_bias=False)`` ->
    ``StandardScaler()`` -> ``LogisticRegression(...)``.

    Parameters
    ----------
    degree : int, default 1
        Highest polynomial degree passed to ``PolynomialFeatures``.
    penalty : str, default 'none'
        Regularisation type forwarded to ``LogisticRegression``.
        NOTE(review): scikit-learn >= 1.2 deprecates the string ``'none'``
        in favour of ``None`` -- confirm against the installed version.
    C : float, default 1.0
        Regularisation parameter forwarded to ``LogisticRegression``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The assembled, not yet fitted, pipeline.
    """
    # tol is the optimiser's convergence tolerance; max_iter caps the
    # optimiser at 10_000 iterations.
    pipe = make_pipeline(
        PolynomialFeatures(degree=degree, include_bias=False),
        StandardScaler(),
        LogisticRegression(penalty=penalty, tol=1e-4, C=C, max_iter=int(1e4)),
    )
    return pipe
# In[27]:
# Baseline model: all defaults (degree 1, penalty 'none', C 1.0).
pl1 = plr()

# In[28]:
pl1.fit(x_train, y_train)

# In[29]:
# (train accuracy, test accuracy)
pl1.score(x_train, y_train), pl1.score(x_test, y_test)
# In[37]:
def plot_decision_boundary(x, y, model, grid_size=1000):
    """Draw a fitted classifier's decision regions with the samples on top.

    Parameters
    ----------
    x : numpy.ndarray
        Sample matrix; columns 0 and 1 are used as the two plot axes.
    y : numpy.ndarray
        Labels; samples with label 0 and label 1 are drawn separately.
    model : object
        Fitted estimator exposing ``predict`` for two-column inputs.
    grid_size : int, default 1000
        Number of grid points per axis; ``grid_size ** 2`` points are
        passed to ``model.predict``.

    Returns
    -------
    None
        The function only draws on the current matplotlib axes.
    """
    # Imported locally, as in the original cell, so the function does not
    # depend on a file-level import of ListedColormap.
    from matplotlib.colors import ListedColormap

    # Extend the plotting window one unit beyond the data on every side.
    axis_0 = np.linspace(x[:, 0].min() - 1, x[:, 0].max() + 1, grid_size)
    axis_1 = np.linspace(x[:, 1].min() - 1, x[:, 1].max() + 1, grid_size)
    x1, x2 = np.meshgrid(axis_0, axis_1)

    # Predict every grid point, then fold the predictions back into the
    # grid shape expected by contourf.
    x_temp = np.column_stack([x1.ravel(), x2.ravel()])
    yhat = model.predict(x_temp).reshape(x1.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#90CAF9'])
    plt.contourf(x1, x2, yhat, cmap=custom_cmap)

    # Draw each class in its own colour; the original drew both classes in
    # red, which made them indistinguishable.
    is_class_0 = (y == 0).flatten()
    is_class_1 = (y == 1).flatten()
    plt.scatter(x[is_class_0, 0], x[is_class_0, 1], color='red')
    plt.scatter(x[is_class_1, 0], x[is_class_1, 1], color='blue')
# In[38]:
plot_decision_boundary(x, y, pl1)

# In[39]:
# Next, model with degree-2 polynomial features.
pr2 = plr(degree=2)

# In[40]:
pr2.fit(x_train, y_train)

# In[41]:
pr2.score(x_train, y_train), pr2.score(x_test, y_test)
# Recorded output: (0.7914285714285715, 0.7866666666666666);
# the original note reports this as a 10% improvement in score.

# In[43]:
plot_decision_boundary(x, y, pr2)

# In[45]:
# How to inspect the fitted coefficients: look the estimator up by its
# pipeline step name.
pr2.named_steps['logisticregression'].coef_
# Recorded output:
# array([[-0.81012988, 0.04384694, -0.48583038, 0.02977868, -1.12352417]])

# In[46]:
# Over-fitting experiment: a degree-10 model.
pr3 = plr(degree=10)
pr3.fit(x_train, y_train)
pr3.score(x_train, y_train), pr3.score(x_test, y_test)
# Recorded output: (0.8314285714285714, 0.78)

# In[47]:
plot_decision_boundary(x, y, pr3)
# In[48]:
# Accuracy for every polynomial degree from 1 to 20 (no penalty).
score_l = []
for degree in range(1, 21):
    pr_temp = plr(degree=degree)
    pr_temp.fit(x_train, y_train)
    # (train accuracy, test accuracy) for this degree
    score_temp = pr_temp.score(x_train, y_train), pr_temp.score(x_test, y_test)
    score_l.append(score_temp)

# In[49]:
np.array(score_l)

# In[54]:
# Plot how the accuracies change with the degree.
plt.plot(list(range(1, 21)), np.array(score_l)[:, 0], label='train_acc')
plt.plot(list(range(1, 21)), np.array(score_l)[:, 1], label='test_acc')
plt.legend(loc=4)  # loc sets the legend position
# In[55]:
# Manual tuning: try L1 regularisation.
pl1 = plr(degree=10, penalty='l1', C=1.0)

# In[57]:
# Fitting directly raised an error in the original run, so the solver is
# switched to 'saga' before fitting.
pl1.set_params(logisticregression__solver='saga')
pl1.fit(x_train, y_train)

# In[58]:
pl1.score(x_train, y_train), pl1.score(x_test, y_test)
# In[62]:
# Enumerated search; the parameters to search are degree, C and the
# regularisation term. This cell sweeps the degree with an L1 penalty.
score_l1 = []
for degree in range(1, 21):
    pr_temp = plr(degree=degree, penalty='l1')
    pr_temp.set_params(logisticregression__solver='saga')
    pr_temp.fit(x_train, y_train)
    # (train accuracy, test accuracy) for this degree
    score_temp = pr_temp.score(x_train, y_train), pr_temp.score(x_test, y_test)
    score_l1.append(score_temp)
plt.plot(list(range(1, 21)), np.array(score_l1)[:, 0], label='train_acc')
plt.plot(list(range(1, 21)), np.array(score_l1)[:, 1], label='test_acc')
plt.legend(loc=4)  # loc sets the legend position

# In[63]:
# Printing the scores showed degree=3 to be the best; it is used as the
# degree for the search that follows.
score_l1
# In[64]:
# Same degree sweep as for L1, this time with an L2 penalty.
score_l2 = []
for degree in range(1, 21):
    pr_temp = plr(degree=degree, penalty='l2')
    pr_temp.set_params(logisticregression__solver='saga')
    pr_temp.fit(x_train, y_train)
    # (train accuracy, test accuracy) for this degree
    score_temp = pr_temp.score(x_train, y_train), pr_temp.score(x_test, y_test)
    score_l2.append(score_temp)
plt.plot(list(range(1, 21)), np.array(score_l2)[:, 0], label='train_acc')
plt.plot(list(range(1, 21)), np.array(score_l2)[:, 1], label='test_acc')
plt.legend(loc=4)  # loc sets the legend position

# In[66]:
# Printing the scores showed degree=15 to be the best; it is used as the
# degree for the search that follows.
score_l2
# In[72]:
# Try different values of C (0.5 up to, but excluding, 2 in steps of 0.1)
# for the degree-3 model with an L1 penalty.
score_l1_3 = []
for c in np.arange(0.5, 2, 0.1):
    pr_temp = plr(degree=3, penalty='l1', C=c)
    pr_temp.set_params(logisticregression__solver='saga')
    pr_temp.fit(x_train, y_train)
    # (train accuracy, test accuracy) for this value of C
    score_temp = pr_temp.score(x_train, y_train), pr_temp.score(x_test, y_test)
    score_l1_3.append(score_temp)
plt.plot(list(np.arange(0.5, 2, 0.1)), np.array(score_l1_3)[:, 0], label='train_acc')
plt.plot(list(np.arange(0.5, 2, 0.1)), np.array(score_l1_3)[:, 1], label='test_acc')
plt.legend(loc=4)  # loc sets the legend position

# In[73]:
# The original run concluded that the best accuracy is around 0.8.
score_l1_3