机器学习中的工作流机制

在项目开发的时候，经常需要我们选择使用哪一种模型。同样的数据，可能决策树效果不错，朴素贝叶斯也不错，SVM也挺好。有没有一种方法能够让我们用一份数据，同时训练多个模型，并用某种直观的方式（包括模型得分），观察到模型在既有数据上的效果？有的，管线工作流pipeline就是专门干这个的，再配上决策边界，所有模型只用一眼，就能确定优劣，选择你的梦中情模。上效果图。

分为两行，上面是sklearn自带数据集中的数据，分两类。从第二列开始，每一列是某种模型在当前数据集中的拟合效果。如何查看某种模型效果好坏？从两个方面：左上角的模型得分，和图中颜色深浅，两种颜色的分界代表模型的决策边界。

下面是笔者自己的数据，分为4类。同样，不同颜色的分界代表不同类别之间的判别边界。如果只看模型得分，那得分为100%的模型有5个，此时可以再根据决策边界进一步选出更优秀的模型，为工程所用。这里贴出笔者所用代码供各位修改，也可以直接取官方代码修改。

python 复制代码

def loadTrainData(path='./your/dataset/path/data.csv'):
    """Load a labelled two-feature dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the placeholder path this
            script shipped with, so existing zero-argument calls still work.

    Returns:
        A tuple ``(X, y)``: ``X`` is the first two columns of every row and
        ``y`` is the last column, both sliced from ``DataFrame.values``.
    """
    # Imported locally because this file has no module-level pandas import.
    import pandas as pd

    samples = pd.read_csv(path).values
    return samples[:, :2], samples[:, -1]

def _plotSplitPoints(ax, X_train, X_test, y_train, y_test, cmap, limits):
    """Scatter the train/test samples on *ax* and pin the axes to *limits*.

    Training points are drawn opaque, test points at 60% opacity, and the
    tick marks are hidden. *limits* is ``(x_min, x_max, y_min, y_max)``.
    """
    x_min, x_max, y_min, y_max = limits
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap, edgecolors="k")
    ax.scatter(
        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap, alpha=0.6, edgecolors="k"
    )
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())


def trainAnalySave(showBoundary=False):
    """Train several classifiers on each dataset, plot them, and save a few.

    One figure row is drawn per dataset: the first column shows the raw
    train/test points, every following column shows one classifier (wrapped
    in a ``StandardScaler`` pipeline) with its test-set score printed in the
    panel. The datasets are a synthetic ``make_circles`` set and the data
    returned by ``loadTrainData()``.

    The pipelines named in ``savedNames`` that were fitted on the
    ``loadTrainData()`` dataset are written to ``../models/sklearn/<name>.pkl``
    with joblib. The figure is shown with ``plt.show()``.

    Args:
        showBoundary: When true, shade each classifier panel with its decision
            regions using ``DecisionBoundaryDisplay.from_estimator`` (requires
            a scikit-learn release that provides that class). Defaults to
            False, which draws only the points and the score.
    """
    from pathlib import Path

    import joblib
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    from sklearn.datasets import make_circles
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.gaussian_process.kernels import RBF
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    if showBoundary:
        # Imported lazily so the default path does not need this class.
        from sklearn.inspection import DecisionBoundaryDisplay

    # Display name -> estimator; insertion order is the column order.
    classifiers = {
        "Nearest Neighbors": KNeighborsClassifier(3),
        "Linear SVM": SVC(kernel="linear", C=0.025),
        "RBF SVM": SVC(gamma=2, C=1),
        "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
        "Decision Tree": DecisionTreeClassifier(max_depth=5),
        "Random Forest": RandomForestClassifier(
            max_depth=5, n_estimators=10, max_features=1
        ),
        "Neural Net": MLPClassifier(alpha=1, max_iter=1000),
        "AdaBoost": AdaBoostClassifier(),
        "Naive Bayes": GaussianNB(),
        "QDA": QuadraticDiscriminantAnalysis(),
    }

    datasets = [
        make_circles(noise=0.2, factor=0.5, random_state=1),
        loadTrainData(),
    ]
    # Only pipelines fitted on the user's own data (the last entry) are kept
    # on disk; dumping the synthetic-data fits first would just be overwritten.
    userDataIdx = len(datasets) - 1
    savedNames = {"Nearest Neighbors", "RBF SVM", "Neural Net"}
    saveDir = Path("..") / "models" / "sklearn"

    cm = plt.cm.RdBu
    cm_bright = ListedColormap(["#FF0000", "#00FF00", "#FFFF00", "#0000FF"])
    nRows, nCols = len(datasets), len(classifiers) + 1

    plt.figure(figsize=(15, 4))
    i = 1  # 1-based subplot index, advanced row by row
    for ds_cnt, (X, y) in enumerate(datasets):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=42
        )
        limits = (
            X[:, 0].min() - 0.5,
            X[:, 0].max() + 0.5,
            X[:, 1].min() - 0.5,
            X[:, 1].max() + 0.5,
        )
        x_min, _, _, y_max = limits

        # First column: the raw dataset.
        ax = plt.subplot(nRows, nCols, i)
        if ds_cnt == 0:
            ax.set_title("Input data")
        _plotSplitPoints(ax, X_train, X_test, y_train, y_test, cm_bright, limits)
        i += 1

        # Remaining columns: one panel per classifier.
        for name, estimator in classifiers.items():
            ax = plt.subplot(nRows, nCols, i)

            model = make_pipeline(StandardScaler(), estimator)
            model.fit(X_train, y_train)
            score = model.score(X_test, y_test)

            if showBoundary:
                # Drawn before the points so the samples stay visible on top.
                DecisionBoundaryDisplay.from_estimator(
                    model, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
                )

            if ds_cnt == userDataIdx and name in savedNames:
                # joblib.dump does not create missing directories itself.
                saveDir.mkdir(parents=True, exist_ok=True)
                joblib.dump(model, saveDir / f"{name}.pkl")

            _plotSplitPoints(ax, X_train, X_test, y_train, y_test, cm_bright, limits)
            if ds_cnt == 0:
                ax.set_title(name)
            # The vertical offset shifts down by one unit per dataset row,
            # as in the original layout.
            ax.text(
                x_min + 0.4,
                y_max - 0.4 - ds_cnt,
                f"{score:.2f}",
                size=10,
                horizontalalignment="left",
            )
            i += 1

    plt.tight_layout()
    plt.show()
# Script entry point: run the comparison only when executed directly.
if __name__ == "__main__":
    trainAnalySave()

注意，这里用到的 DecisionBoundaryDisplay 类是 scikit-learn 1.1 起才新增的，需要安装较新版本的 sklearn（1.1 及以上），相应地 Python 也需要较高版本（3.8 及以上）。

最后打个广告，如果有想进修服务器开发相关的技能，这里是可以让你秒变大神的时光隧道。 enjoy~~