集成学习
生活中的集成学习:
买东西找别人推荐
data:image/s3,"s3://crabby-images/b08dd/b08dd3c124fc06cda73a3660f0a2dc0056f24d78" alt=""
python
import numpy as np
import matplotlib.pyplot as plt
python
from sklearn import datasets
# Generate the make_moons toy data set: 500 samples with noise=0.3.
# random_state=42 pins the sample so every run sees the same points.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
python
# Draw the two label groups (y == 0 and y == 1) as separate scatter series.
plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1])
plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1])
plt.show()
data:image/s3,"s3://crabby-images/1264e/1264e98843cbd319e80db92857cca43bf843b5b2" alt=""
python
from sklearn.model_selection import train_test_split
# Hold out a test set; no test_size is passed, so the library default split
# applies. random_state=42 keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
python
from sklearn.linear_model import LogisticRegression

# Baseline 1: logistic regression with default hyper-parameters.
# fit() returns the estimator itself, so construction and training are chained.
log_clf = LogisticRegression().fit(X_train, y_train)
# Accuracy on the held-out test set (shown as the cell output).
log_clf.score(X_test, y_test)
data:image/s3,"s3://crabby-images/abccd/abccd93edd5c7c171064a1158e7c17722ace0591" alt=""
python
from sklearn.svm import SVC

# Baseline 2: support vector classifier with default hyper-parameters.
# fit() returns the estimator itself, so construction and training are chained.
svm_clf = SVC().fit(X_train, y_train)
# Accuracy on the held-out test set (shown as the cell output).
svm_clf.score(X_test, y_test)
data:image/s3,"s3://crabby-images/0f886/0f8862c99c229edd23ad661b0016886419c331a7" alt=""
python
from sklearn.tree import DecisionTreeClassifier

# Baseline 3: decision tree; random_state=666 pins the tree that gets built.
# fit() returns the estimator itself, so construction and training are chained.
dt_clf = DecisionTreeClassifier(random_state=666).fit(X_train, y_train)
# Accuracy on the held-out test set (shown as the cell output).
dt_clf.score(X_test, y_test)
data:image/s3,"s3://crabby-images/b7ad4/b7ad4ed853692d4c57239535840b1fb5957e3023" alt=""
python
# Collect each base model's test-set predictions, in the order
# logistic regression, SVM, decision tree.
y_predict1, y_predict2, y_predict3 = (
    clf.predict(X_test) for clf in (log_clf, svm_clf, dt_clf)
)
python
# Manual hard vote over three 0/1 predictions: the summed votes are >= 2
# exactly when at least two of the three models predicted class 1.
y_predict = ((y_predict1 + y_predict2 + y_predict3) >= 2).astype('int')
python
# Peek at the first ten majority-vote predictions.
y_predict[:10]
data:image/s3,"s3://crabby-images/e9903/e9903e46fe5e1a509d581cb6b054a61491bb98ca" alt=""
python
from sklearn.metrics import accuracy_score
# Accuracy of the hand-rolled majority vote against the true test labels.
accuracy_score(y_test, y_predict)
data:image/s3,"s3://crabby-images/fb95e/fb95e8136900fc7e43fb0c967a35e6485d1d70a7" alt=""
使用Voting Classifier
python
from sklearn.ensemble import VotingClassifier

# Library equivalent of the manual vote above: the same three base models,
# combined with voting='hard'.
voting_clf = VotingClassifier(
    estimators=[
        ('log_clf', LogisticRegression()),
        ('svm_clf', SVC()),
        ('dt_clf', DecisionTreeClassifier(random_state=666)),
    ],
    voting='hard',
)
python
# Train the ensemble, then report its test accuracy; fit() returns the
# fitted estimator, so the two steps chain into one expression.
voting_clf.fit(X_train, y_train).score(X_test, y_test)
data:image/s3,"s3://crabby-images/87c73/87c73f1725914b5576852aa4f204912e471610c6" alt=""
Soft Voting
Voting Classifier
更合理的投票,应该有权值
data:image/s3,"s3://crabby-images/8efac/8efac2c3da95d167a01820f987b311095f105d60" alt=""
data:image/s3,"s3://crabby-images/bcb60/bcb600a74606ec19bc7a87511903599ca2c59ce9" alt=""
要求集合的每一个模型都能估计概率
python
import numpy as np
import matplotlib.pyplot as plt
python
from sklearn import datasets
# Regenerate the make_moons toy data set (500 samples, noise=0.3) for the
# soft-voting section; random_state=42 keeps it identical between runs.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
python
# Plot label 0 and label 1 as two separate scatter series.
plt.scatter(*X[y == 0].T)
plt.scatter(*X[y == 1].T)
plt.show()
data:image/s3,"s3://crabby-images/b9220/b92200e434230bf9e226ed96d8b54c20b86e6c98" alt=""
python
from sklearn.model_selection import train_test_split
# Train/test partition with the library default split size (no test_size
# argument is passed); random_state=42 makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
使用 Hard Voting Classifier
python
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Hard-voting reference ensemble: three differently structured base models
# combined with voting='hard'.
voting_clf = VotingClassifier(
    estimators=[
        ('log_clf', LogisticRegression()),
        ('svm_clf', SVC()),
        ('dt_clf', DecisionTreeClassifier(random_state=666)),
    ],
    voting='hard',
)
python
# Fit the hard-voting ensemble and report its test accuracy in one expression
# (fit() hands back the fitted estimator).
voting_clf.fit(X_train, y_train).score(X_test, y_test)
data:image/s3,"s3://crabby-images/947f3/947f368aa58e5c245585a7079ae7f84ff8b28254" alt=""
使用 Soft Voting Classifier
python
# Soft voting combines the class probabilities of the base models, so every
# member must be able to estimate probabilities. SVC only does so when
# probability=True is set.
# The SVC probability estimates come from an internal, shuffled
# cross-validation, so the SVC is seeded as well; otherwise the soft-vote
# score can change from one run to the next.
voting_clf2 = VotingClassifier(
    estimators=[
        ('log_clf', LogisticRegression()),
        ('svm_clf', SVC(probability=True, random_state=666)),
        ('dt_clf', DecisionTreeClassifier(random_state=666)),
    ],
    voting='soft',
)
python
# Fit the soft-voting ensemble and report its test accuracy in one expression
# (fit() hands back the fitted estimator).
voting_clf2.fit(X_train, y_train).score(X_test, y_test)
data:image/s3,"s3://crabby-images/a3a46/a3a463819ad7fdc22dc39eead5b0fef4e206010d" alt=""
集成学习
虽然有很多机器学习方法,但是从投票的角度看,仍然不够多
创建更多的子模型!集成更多的子模型的意见。
子模型之间不能一致!子模型之间要有差异性
如何创建差异性?
每个子模型只看样本数据的一部分。
例如:一共有500个样本数据;每个子模型只看100个样本数据。每个子模型不需要太高的准确率。
data:image/s3,"s3://crabby-images/bbe83/bbe8386af41c842c42584a71c608e006bc8903f4" alt=""
data:image/s3,"s3://crabby-images/f420f/f420fea156537ad467d1c8e2f948e2a66c7f2e12" alt=""
Bagging 和 Pasting
取样:放回取样,不放回取样
放回取样:Bagging;不放回取样:Pasting
Bagging 更常用
python
import numpy as np
import matplotlib.pyplot as plt
python
from sklearn import datasets
# Regenerate the make_moons toy data set (500 samples, noise=0.3) for the
# bagging section; random_state=42 keeps it identical between runs.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
python
# Visualise the data: one scatter series per class label.
plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1])
plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1])
plt.show()
data:image/s3,"s3://crabby-images/c7bff/c7bff2d07cb2e28bb91864167081d33f89d3e002" alt=""
python
from sklearn.model_selection import train_test_split
# Train/test partition with the library default split size (no test_size
# argument is passed); random_state=42 makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
使用 Bagging
python
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Bagging: 500 decision trees, each trained on 100 samples drawn with
# replacement (bootstrap=True) from the training set.
# random_state is pinned so the sampling, and therefore the reported score,
# is reproducible, like the other seeded steps in this notebook.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    random_state=666,
)
bagging_clf.fit(X_train, y_train)
# Test-set accuracy (shown as the cell output).
bagging_clf.score(X_test, y_test)
data:image/s3,"s3://crabby-images/fd5af/fd5af7d2207a5f4c7f22ccbb5ee262a7eaf86181" alt=""
python
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Same bagging setup scaled up to 5000 trees, to see the effect of a larger
# ensemble. Each tree still sees 100 samples drawn with replacement.
# random_state is pinned so the comparison with the 500-tree run is not
# blurred by run-to-run sampling noise.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=5000,
    max_samples=100,
    bootstrap=True,
    random_state=666,
)
bagging_clf.fit(X_train, y_train)
# Test-set accuracy (shown as the cell output).
bagging_clf.score(X_test, y_test)
data:image/s3,"s3://crabby-images/12808/1280870683a1ea64f34607abe10e772d42e048ef" alt=""
OOB Out-of-Bag
放回取样导致一部分样本很有可能没有取到
平均大约有37%的样本没有取到。
不使用测试数据集,而使用这部分没有取到的样本做测试/验证
生成数据
python
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Build the make_moons toy data set (500 samples, noise=0.3, fixed seed 42)
# and show the two label groups as separate scatter series.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
plt.scatter(*X[y == 0].T)
plt.scatter(*X[y == 1].T)
plt.show()
data:image/s3,"s3://crabby-images/651eb/651ebd8e6b6063ea8666ec5a5439d70de9126d8c" alt=""
oob
python
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Out-of-bag evaluation: with oob_score=True the ensemble is validated on the
# samples each tree did not draw, so no separate test split is needed and the
# model is fit on the full data set.
# random_state is pinned so the bootstrap draws, and hence oob_score_, are
# reproducible.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    random_state=666,
)
bagging_clf.fit(X, y)
python
# Accuracy estimated from the out-of-bag samples of the fitted ensemble.
bagging_clf.oob_score_
data:image/s3,"s3://crabby-images/4e0b4/4e0b47d1be618a14093328bd8715cf5a0cdc7c45" alt=""
Bagging的思路极易并行化处理
python
%%time
# Timing baseline: no n_jobs argument is passed here. Compare the reported
# time with the n_jobs=-1 run of the same configuration.
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
n_estimators=500, max_samples=100,
bootstrap=True, oob_score=True)
bagging_clf.fit(X, y)
data:image/s3,"s3://crabby-images/8bf9b/8bf9b76e169cef19848830f4e40e93c027f9c042" alt=""
python
%%time
# Same configuration as the timing baseline, but with n_jobs=-1 so the
# independent trees can be trained in parallel.
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
n_estimators=500, max_samples=100,
bootstrap=True, oob_score=True,
n_jobs=-1)
bagging_clf.fit(X, y)
data:image/s3,"s3://crabby-images/8996a/8996ae708f065a11884b4b30c87be06f2602091c" alt=""
data:image/s3,"s3://crabby-images/5d912/5d912971d024517e6edb05546f1089207a27978c" alt=""
bootstrap_features
python
# Random subspaces: the diversity comes from feature sampling. Each tree gets
# max_features=1 feature, drawn with bootstrap_features=True.
# max_samples=500 matches the size of the data set. bootstrap=True is kept
# because the out-of-bag score relies on bootstrap sampling.
# random_state is pinned so the sample and feature draws, and hence
# oob_score_, are reproducible.
random_subspaces_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=500,
    bootstrap=True,
    oob_score=True,
    max_features=1,
    bootstrap_features=True,
    random_state=666,
)
random_subspaces_clf.fit(X, y)
# Out-of-bag accuracy estimate (shown as the cell output).
random_subspaces_clf.oob_score_
data:image/s3,"s3://crabby-images/37030/370305b49136264aafed4bc08abb3ee48681e55a" alt=""
python
# Random patches: sample both rows and columns. Each tree is trained on 100
# bootstrap samples (max_samples=100) and on max_features=1 feature drawn with
# bootstrap_features=True.
# random_state is pinned so the sample and feature draws, and hence
# oob_score_, are reproducible.
random_patches_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    max_features=1,
    bootstrap_features=True,
    random_state=666,
)
random_patches_clf.fit(X, y)
# Out-of-bag accuracy estimate (shown as the cell output).
random_patches_clf.oob_score_
data:image/s3,"s3://crabby-images/5f1ba/5f1baa2eb2387685088fe5205b8adc4019387277" alt=""
随机森林
Bagging
Base Estimator: Decision Tree
决策树在节点划分上,在随机的特征子集上寻找最优划分特征
python
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Data for the random-forest section: make_moons with 500 samples, noise=0.3
# and seed 666, plotted with one scatter series per class label.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1])
plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1])
plt.show()
data:image/s3,"s3://crabby-images/e686c/e686cf2c3c18f78e53dddc7096c98b45c0043713" alt=""
随机森林
python
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, validated out-of-bag (oob_score=True), seeded
# with 666 and trained with n_jobs=-1.
rf_clf = RandomForestClassifier(
    n_estimators=500,
    oob_score=True,
    random_state=666,
    n_jobs=-1,
)
rf_clf.fit(X, y)
python
# Out-of-bag accuracy estimate of the fitted random forest.
rf_clf.oob_score_
data:image/s3,"s3://crabby-images/2dacf/2dacf6e37cf6cc4823a81e254c795e30f10cbf50" alt=""
python
# Same forest, but each tree is limited to max_leaf_nodes=16. Tree-level
# hyper-parameters are passed straight to the forest.
# fit() returns the estimator, so construction and training are chained.
rf_clf2 = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    oob_score=True,
    random_state=666,
    n_jobs=-1,
).fit(X, y)
# Out-of-bag accuracy estimate (shown as the cell output).
rf_clf2.oob_score_
data:image/s3,"s3://crabby-images/be2d0/be2d0d9bfa9b031047de2bb591a99d2b4f08cb01" alt=""
Extra-Trees
Bagging
Base Estimator: Decision Tree
决策树在节点划分上,使用随机的特征和随机的阈值
提供额外的随机性,抑制过拟合,但增大了bias
更快的训练速度
python
from sklearn.ensemble import ExtraTreesClassifier

# Extra-Trees with 500 trees. bootstrap=True is passed explicitly together
# with oob_score=True so an out-of-bag estimate is available.
et_clf = ExtraTreesClassifier(
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=666,
    n_jobs=-1,
)
et_clf.fit(X, y)
python
# Out-of-bag accuracy estimate of the fitted Extra-Trees ensemble.
et_clf.oob_score_
data:image/s3,"s3://crabby-images/a87a5/a87a5d1956dd2c528499b895e5072aff28d90539" alt=""
Boosting
集成多个模型
每个模型都在尝试增强(Boosting)整体的效果
Ada Boosting
data:image/s3,"s3://crabby-images/676fa/676fa7bd64b79d148596eea46aec79699bff7890" alt=""
python
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Data for the boosting section: make_moons with 500 samples, noise=0.3 and
# seed 666, plotted with one scatter series per class label.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
plt.scatter(*X[y == 0].T)
plt.scatter(*X[y == 1].T)
plt.show()
data:image/s3,"s3://crabby-images/62efa/62efa065760ba6fa7839d22467e58abb4f2b9703" alt=""
python
from sklearn.model_selection import train_test_split
# Boosting has no out-of-bag samples to validate on, so hold out a test set.
# No test_size is passed (library default split); random_state=666 makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
AdaBoosting
python
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over shallow trees (max_depth=2), with 500 boosting rounds.
# random_state is pinned so the fitted ensemble and its test score are
# reproducible, matching the seeded data generation and split in this section.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=500,
    random_state=666,
)
ada_clf.fit(X_train, y_train)
python
# Test-set accuracy of the fitted AdaBoost ensemble.
ada_clf.score(X_test, y_test)
data:image/s3,"s3://crabby-images/4d34d/4d34dee0cc8445f6fe931a7abe63ca4c471bcde4" alt=""
Gradient Boosting
训练一个模型m1,产生错误e1
针对e1训练第二个模型m2,产生错误e2
针对e2训练第三个模型m3,产生错误e3...
最终预测结果是:m1+m2+m3+...
data:image/s3,"s3://crabby-images/0a324/0a324753a22e218334fb0817c138e6127e252fd1" alt=""
python
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 30 sequential trees of depth 2, each fit to the errors
# left by the trees before it. The base learner is always a decision tree, so
# the tree hyper-parameters are passed directly.
# random_state is pinned so any randomness inside the trees is fixed and the
# score is reproducible.
gb_clf = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=30,
    random_state=666,
)
gb_clf.fit(X_train, y_train)
python
# Test-set accuracy of the fitted gradient-boosting ensemble.
gb_clf.score(X_test, y_test)
data:image/s3,"s3://crabby-images/d6c58/d6c58c5a57c1e333e80c7d547f43c35c7627eb44" alt=""
Stacking
data:image/s3,"s3://crabby-images/0d3ad/0d3ad0f6c26c6900a8ce09f54a3362fc2a549c51" alt=""
data:image/s3,"s3://crabby-images/62690/62690ddda2795468b72b5fc4a30cec8a49e9b3e1" alt=""
data:image/s3,"s3://crabby-images/9aaad/9aaadddae831fb4e1fdb48b4e4134be8c332188f" alt=""