论文复现:Active Learning by Learning

这篇文章说实在的,写的差强人意。

实质性内容是两个现有方法的拼凑!

讲的模模糊糊!对于复现代码不太友好!

撸一点,通读一遍 ,再撸一点,通读一遍~~~

python 复制代码
"""
注意:使用了训练集索引。
"""
import xlwt
import xlrd
import numpy as np
import pandas as pd
from pathlib import Path
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
from time import time
from sklearn.metrics.pairwise import pairwise_distances
from numpy.linalg import inv
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score, mutual_info_score
from sklearn.neighbors import NearestNeighbors
np.seterr(divide='ignore',invalid='ignore')


class AL_ALBL():
    def __init__(self,X, y, labeled, budget, X_test, y_test):
        self.X = X
        self.y = y
        self.nSample, self.nDim = X.shape
        self.labels = sorted(np.unique(self.y))
        self.nClass = len(self.labels)
        self.M = np.array([[(i - j) ** 2 for i in range(self.nClass)] for j in range(self.nClass)])
        self.X_test = X_test
        self.y_test = y_test
        self.labeled = list(deepcopy(labeled))

        self.dist_matrix = pairwise_distances(X=self.X, metric='euclidean')
        self.K = -1.0 * self.dist_matrix  # 无标记样本池对应的核矩阵
        self.K_test_pool = -1.0 * pairwise_distances(X=self.X_test, Y=self.X, metric='euclidean')
        # -------------------------------------------------
        self.budgetLeft = deepcopy(budget)
        self.c = 0.01
        self.unlabeled = [i for i in range(self.nSample)]
        self.model_initial()
        # ------------------------------
        self._nstrategies = 2
        self._delta = 0.1
        self._w = np.ones(self._nstrategies)
        self._pmin = 1.0 / (self._nstrategies * 10.0)
        self._start = True
        self._aw = np.zeros(self.nSample)
        self._aw[self.labeled] = 1.0
        self._s_idx = None
        # -------------------------------
        self._pmin = 1.0 / (self._nstrategies * 10.0)
        self.PS = np.array([0.5,0.5])
        self.phi = np.zeros((2, self.nSample))
        self.Q = np.zeros(self.nSample)
        self.T = deepcopy(budget)
        self.hat_y_matrix = np.zeros((self.T, self.nSample))
        self.Wt = np.zeros((self.T, self.nSample))
        self.reward = 0.0
        self._nT = 1/ (self.nSample * self.T)



        # -------------------------------
        self.ACClist = []
        self.MZElist = []
        self.MAElist = []
        self.F1list = []
        self.MIlist = []
        self.ALC_ACC = 0.0
        self.ALC_MZE = 0.0
        self.ALC_MAE = 0.0
        self.ALC_F1 = 0.0
        self.ALC_MI = 0.0
        self.Redundancy = 0.0
        # -------------------------------

    def model_initial(self):
        self.T_labeled = self.M[self.y[self.labeled],:]
        self.K_labeled = self.K[np.ix_(self.labeled, self.labeled)]
        self.K_labeled_inv = inv(self.c * np.eye(len(self.labeled)) + self.K_labeled)
        self.Beta = self.K_labeled_inv @ self.T_labeled
        # -----------------------------
        for idx in self.labeled:
            self.unlabeled.remove(idx)
        return self

    def Block_Matrix_Inverse(self, A11_inv, A12, A21, A22):
        n = A11_inv.shape[0]
        m = A22.shape[0]
        M = np.zeros((m+n, m+n))
        B22 = inv(A22 - A21 @ A11_inv @ A12)
        B12 = -A11_inv @ A12 @ B22
        M[n:,n:] = B22
        M[:n,:n] = A11_inv - B12 @ (A21 @ A11_inv)
        M[:n,n:] = B12
        M[n:,:n] = -B22 @ A21 @ A11_inv
        return M

    def model_incremental_train(self, new_ids):
        A12 = self.K[np.ix_(self.labeled, new_ids)]
        A22 = self.K[np.ix_(new_ids, new_ids)] + self.c * np.eye(len(new_ids))
        K_bar_inv = self.Block_Matrix_Inverse(A11_inv=self.K_labeled_inv, A12=A12, A21=A12.T, A22=A22)
        T_bar = np.vstack((self.T_labeled, self.M[self.y[new_ids],:]))
        Beta_bar = K_bar_inv @ T_bar
        # --------------------------
        self.K_labeled_inv = K_bar_inv
        self.T_labeled = T_bar
        self.Beta = Beta_bar
        return self

    def tmp_incremental_train(self, tmp_idx):
        A12 = self.K[np.ix_(self.labeled, [tmp_idx])]
        A22 = self.K[np.ix_([tmp_idx], [tmp_idx])] + self.c * np.eye(1)
        K_bar_inv = self.Block_Matrix_Inverse(A11_inv=self.K_labeled_inv, A12=A12, A21=A12.T, A22=A22)
        return K_bar_inv

    def predict_proba(self, ids):
        K_test_labeled = self.K[np.ix_(ids, self.labeled)]
        output = K_test_labeled @ self.Beta
        predictions = np.linalg.norm(output[:, None] - self.M, axis=2, ord=1)
        predictions = -predictions
        predictions = np.exp(predictions)
        predictions_sum = np.sum(predictions, axis=1, keepdims=True)
        proba_matrix = predictions / predictions_sum
        return proba_matrix

    def predict(self, ids):
        K_test_labeled = self.K[np.ix_(ids, self.labeled)]
        output = K_test_labeled @ self.Beta
        predictions = np.argmin(np.linalg.norm(output[:, None] - self.M, axis=2, ord=1), axis=1)
        return predictions

    def get_EC(self, ids):
        K_test_labeled = self.K[np.ix_(ids, self.labeled)]
        output = K_test_labeled @ self.Beta
        ids_norm1 = np.linalg.norm(output[:, None] - self.M, axis=2, ord=1)
        predictions = -1.0 * deepcopy(ids_norm1)
        predictions = np.exp(predictions)
        predictions_sum = np.sum(predictions, axis=1, keepdims=True)
        proba_matrix = predictions / predictions_sum
        return np.sum(ids_norm1 * proba_matrix, axis=1)



    def evaluation(self):
        output = self.K_test_pool[:,self.labeled] @ self.Beta
        y_hat = np.argmin(np.linalg.norm(output[:, None] - self.M, axis=2, ord=1), axis=1)
        self.ACClist.append(accuracy_score(self.y_test, y_hat))
        self.MZElist.append(1-accuracy_score(self.y_test, y_hat))
        self.MAElist.append(mean_absolute_error(self.y_test, y_hat))
        self.F1list.append(f1_score(self.y_test, y_hat, average='macro'))
        self.MIlist.append(mutual_info_score(labels_true=self.y_test, labels_pred=y_hat))
        self.ALC_ACC += self.ACClist[-1]
        self.ALC_MZE += self.MZElist[-1]
        self.ALC_MAE += self.MAElist[-1]
        self.ALC_F1 += self.F1list[-1]
        self.ALC_MI += self.MIlist[-1]

    def SoftMax(self,value_list):
        exp_values = np.exp(value_list)
        sum_exp_values = np.sum(exp_values)
        return exp_values / sum_exp_values

    def select(self):
        self.evaluation()
        t = 0  #迭代次数
        while self.budgetLeft > 0:
            if not self._start:
                self._w[self._s_idx] *= np.exp(self._pmin / 2.0 * (self.reward + 1.0 / self.last_p * np.sqrt(np.log(self._nstrategies / self._delta) / self._nstrategies)))
            self._start = False
            W = self._w.sum()
            p = (1.0 - self._nstrategies * self._pmin) * self._w / W + self._pmin

            s_idx = np.random.choice(np.arange(self._nstrategies), p=p)
            tar_idx = None
            if s_idx == 0:
                print("Div")
                # ----------------Diversity sampling criterion
                dist_D_L = self.dist_matrix[np.ix_(np.arange(self.nSample), self.labeled)]
                Div = np.min(dist_D_L, axis=1)
                tar_idx = np.argmax(Div)
            elif s_idx == 1:
                print("Expected misclassification cost")
                # ----------------Least Confidence criterion
                EC = self.get_EC(ids=np.arange(self.nSample))
                tar_idx = np.argmax(EC)
                # proba_matrix = self.predict_proba(ids=np.arange(self.nSample))
                # proba_max = np.max(proba_matrix, axis=1)
                # tar_idx = np.argmin(proba_max)
            # ==========================================
            self.last_p = p[s_idx]
            # ==========================================
            if tar_idx in self.labeled:
                """不用更新模型"""
                """计算奖励"""
                hat_y = self.predict(ids=[tar_idx])
                if hat_y == self.y[tar_idx]:
                    self.reward += self._nT / p[s_idx]
            elif tar_idx not in self.labeled:
                """更新模型"""
                self.model_incremental_train(new_ids=[tar_idx])
                self.unlabeled.remove(tar_idx)
                self.labeled.append(tar_idx)
                self.budgetLeft -= 1
                self.evaluation()
                """计算奖励"""
                hat_y = self.predict(ids=[tar_idx])
                if hat_y == self.y[tar_idx]:
                    self.reward += self._nT / p[s_idx] # the IW-ACC

            # -----------------------------------------------
            t += 1  #迭代次数加一
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(X=self.X[self.labeled])
        self.Redundancy = (1/np.mean(neigh.kneighbors()[0].flatten()))

if __name__ == '__main__':

    # name_list = ["Balance-scale","Toy","Cleveland","Knowledge","Glass",
    #              "Melanoma","Housing-5bin","Housing-10bin","Car"]
    name_list = ["Student","Balance-scale","Newthyroid","CTGs","Knowledge","Car","Nursery",
                 "Toy","Melanoma","Eucalyptus","Glass","Obesity1","stock-10bin","Computer-10bin"]

    class results():
        def __init__(self):
            self.ACCList = []
            self.MZEList = []
            self.MAEList = []
            self.F1List = []
            self.MIList = []
            self.ALC_ACC = []
            self.ALC_MZE = []
            self.ALC_MAE = []
            self.ALC_F1 = []
            self.ALC_MI = []
            self.Redun = []

    class stores():
        def __init__(self):
            self.num_labeled_mean = []
            self.num_labeled_std = []
            #-----------------------
            self.ACCList_mean = []
            self.ACCList_std = []
            #-----------------------
            self.MZEList_mean = []
            self.MZEList_std = []
            # -----------------
            self.MAEList_mean = []
            self.MAEList_std = []
            # -----------------
            self.F1List_mean = []
            self.F1List_std = []
            # -----------------
            self.MIList_mean = []
            self.MIList_std = []
            # -----------------
            self.ALC_ACC_mean = []
            self.ALC_ACC_std = []
            # -----------------
            self.ALC_MZE_mean = []
            self.ALC_MZE_std = []
            # -----------------
            self.ALC_MAE_mean = []
            self.ALC_MAE_std = []
            # -----------------
            self.ALC_F1_mean = []
            self.ALC_F1_std = []
            # -----------------
            self.ALC_MI_mean = []
            self.ALC_MI_std = []
            # -----------------
            self.ALC_ACC_list = []
            self.ALC_MZE_list = []
            self.ALC_MAE_list = []
            self.ALC_F1_list = []
            self.ALC_MI_list = []
            # -----------------
            self.Redun_list = []#TODO
            self.Redun_mean = []#TODO
            self.Redun_std = []#TODO

    for name in name_list:
        print("########################{}".format(name))
        data_path = Path("D:\Chapter1\DataSet")
        partition_path = Path(r"D:\Chapter1\Partition")
        """--------------read the whole data--------------------"""
        read_data_path = data_path.joinpath(name + ".csv")
        data = np.array(pd.read_csv(read_data_path, header=None))
        X = np.asarray(data[:, :-1], np.float64)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        y = data[:, -1]
        y -= y.min()
        dist_matrix = pairwise_distances(X=X, metric="euclidean")
        nClass = len(np.unique(y))
        nSample = len(y)
        Budget = 25 * nClass
        """--------read the partitions--------"""
        read_partition_path = str(partition_path.joinpath(name + ".xls"))
        book_partition = xlrd.open_workbook(read_partition_path)
        workbook = xlwt.Workbook()
        count = 0
        # --------------------------------------
        RESULT = results()
        STORE = stores()
        # --------------------------------------
        for SN in book_partition.sheet_names():
            print("================{}".format(SN))
            S_Time = time()
            train_ids = []
            test_ids = []
            labeled = []
            table_partition = book_partition.sheet_by_name(SN)
            for idx in table_partition.col_values(0):
                if isinstance(idx,float):
                    train_ids.append(int(idx))
            for idx in table_partition.col_values(1):
                if isinstance(idx,float):
                    test_ids.append(int(idx))
            for idx in table_partition.col_values(2):
                if isinstance(idx,float):
                    labeled.append(int(idx))
            X_train = X[train_ids]
            y_train = y[train_ids].astype(np.int32)
            X_test = X[test_ids]
            y_test = y[test_ids]

            model = AL_ALBL(X=X_train, y=y_train, labeled=labeled, budget=Budget, X_test=X_test, y_test=y_test)
            model.select()

            RESULT.ACCList.append(model.ACClist)
            RESULT.MZEList.append(model.MZElist)
            RESULT.MAEList.append(model.MAElist)
            RESULT.F1List.append(model.F1list)
            RESULT.MIList.append(model.MIlist)
            RESULT.ALC_ACC.append(model.ALC_ACC)
            RESULT.ALC_MZE.append(model.ALC_MZE)
            RESULT.ALC_MAE.append(model.ALC_MAE)
            RESULT.ALC_F1.append(model.ALC_F1)
            RESULT.ALC_MI.append(model.ALC_MI)
            RESULT.Redun.append(model.Redundancy) # TODO
            print("SN===",SN, "time:",time()-S_Time)

        STORE.ACCList_mean = np.mean(RESULT.ACCList, axis=0)
        STORE.ACCList_std = np.std(RESULT.ACCList, axis=0)
        STORE.MZEList_mean = np.mean(RESULT.MZEList, axis=0)
        STORE.MZEList_std = np.std(RESULT.MZEList, axis=0)
        STORE.MAEList_mean = np.mean(RESULT.MAEList, axis=0)
        STORE.MAEList_std = np.std(RESULT.MAEList, axis=0)
        STORE.F1List_mean = np.mean(RESULT.F1List, axis=0)
        STORE.F1List_std = np.std(RESULT.F1List, axis=0)
        STORE.MIList_mean = np.mean(RESULT.MIList, axis=0)
        STORE.MIList_std = np.std(RESULT.MIList, axis=0)
        STORE.ALC_ACC_mean = np.mean(RESULT.ALC_ACC)
        STORE.ALC_ACC_std = np.std(RESULT.ALC_ACC)
        STORE.ALC_MZE_mean = np.mean(RESULT.ALC_MZE)
        STORE.ALC_MZE_std = np.std(RESULT.ALC_MZE)
        STORE.ALC_MAE_mean = np.mean(RESULT.ALC_MAE)
        STORE.ALC_MAE_std = np.std(RESULT.ALC_MAE)
        STORE.ALC_F1_mean = np.mean(RESULT.ALC_F1)
        STORE.ALC_F1_std = np.std(RESULT.ALC_F1)
        STORE.ALC_MI_mean = np.mean(RESULT.ALC_MI)
        STORE.ALC_MI_std = np.std(RESULT.ALC_MI)
        STORE.ALC_ACC_list = RESULT.ALC_ACC
        STORE.ALC_MZE_list = RESULT.ALC_MZE
        STORE.ALC_MAE_list = RESULT.ALC_MAE
        STORE.ALC_F1_list = RESULT.ALC_F1
        STORE.ALC_MI_list = RESULT.ALC_MI
        STORE.Redun_list = RESULT.Redun # TODO
        STORE.Redun_mean = np.mean(RESULT.Redun)# TODO
        STORE.Redun_std = np.std(RESULT.Redun)# TODO

        sheet_names = ["ACC","MZE","MAE","F1","MI",
                       "ALC_ACC_list","ALC_MZE_list","ALC_MAE_list","ALC_F1_list","ALC_MI_list",
                       "ALC_ACC", "ALC_MZE","ALC_MAE", "ALC_F1", "ALC_MI",
                       "Redun_list","Redun"]
        workbook = xlwt.Workbook()

        for sn in sheet_names:
            print("sn::",sn)
            sheet = workbook.add_sheet(sn)
            n_col = len(STORE.MZEList_mean)
            if sn == "ACC":
                sheet.write(0, 0, sn)
                for j in range(1,n_col + 1):
                    sheet.write(j,0,STORE.ACCList_mean[j - 1])
                    sheet.write(j,1,STORE.ACCList_std[j - 1])
            elif sn == "MZE":
                sheet.write(0, 0, sn)
                for j in range(1,n_col + 1):
                    sheet.write(j,0,STORE.MZEList_mean[j - 1])
                    sheet.write(j,1,STORE.MZEList_std[j - 1])
            elif sn == "MAE":
                sheet.write(0, 0, sn)
                for j in range(1,n_col + 1):
                    sheet.write(j,0,STORE.MAEList_mean[j - 1])
                    sheet.write(j,1,STORE.MAEList_std[j - 1])
            elif sn == "F1":
                sheet.write(0, 0, sn)
                for j in range(1,n_col + 1):
                    sheet.write(j,0,STORE.F1List_mean[j - 1])
                    sheet.write(j,1,STORE.F1List_std[j - 1])
            elif sn == "MI":
                sheet.write(0, 0, sn)
                for j in range(1,n_col + 1):
                    sheet.write(j,0,STORE.MIList_mean[j - 1])
                    sheet.write(j,1,STORE.MIList_std[j - 1])

            # ---------------------------------------------------
            elif sn == "ALC_ACC_list":
                sheet.write(0, 0, sn)
                for j in range(1,len(STORE.ALC_ACC_list) + 1):
                    sheet.write(j,0,STORE.ALC_ACC_list[j - 1])
            elif sn == "ALC_MZE_list":
                sheet.write(0, 0, sn)
                for j in range(1,len(STORE.ALC_MZE_list) + 1):
                    sheet.write(j,0,STORE.ALC_MZE_list[j - 1])
            elif sn == "ALC_MAE_list":
                sheet.write(0, 0, sn)
                for j in range(1,len(STORE.ALC_MAE_list) + 1):
                    sheet.write(j,0,STORE.ALC_MAE_list[j - 1])
            elif sn == "ALC_F1_list":
                sheet.write(0, 0, sn)
                for j in range(1,len(STORE.ALC_F1_list) + 1):
                    sheet.write(j,0,STORE.ALC_F1_list[j - 1])
            elif sn == "ALC_MI_list":
                sheet.write(0, 0, sn)
                for j in range(1,len(STORE.ALC_MI_list) + 1):
                    sheet.write(j,0,STORE.ALC_MI_list[j - 1])

            # -----------------
            elif sn == "ALC_ACC":
                sheet.write(0, 0, sn)
                sheet.write(1, 0, STORE.ALC_ACC_mean)
                sheet.write(2, 0, STORE.ALC_ACC_std)
            elif sn == "ALC_MZE":
                sheet.write(0, 0, sn)
                sheet.write(1, 0, STORE.ALC_MZE_mean)
                sheet.write(2, 0, STORE.ALC_MZE_std)
            elif sn == "ALC_MAE":
                sheet.write(0, 0, sn)
                sheet.write(1, 0, STORE.ALC_MAE_mean)
                sheet.write(2, 0, STORE.ALC_MAE_std)
            elif sn == "ALC_F1":
                sheet.write(0, 0, sn)
                sheet.write(1, 0, STORE.ALC_F1_mean)
                sheet.write(2, 0, STORE.ALC_F1_std)
            elif sn == "ALC_MI":
                sheet.write(0, 0, sn)
                sheet.write(1, 0, STORE.ALC_MI_mean)
                sheet.write(2, 0, STORE.ALC_MI_std)
            elif sn == "Redun_list":
                sheet.write(0, 0, sn)
                for j in range(1,len(STORE.Redun_list) + 1):
                    sheet.write(j,0,STORE.Redun_list[j - 1])
            elif sn == "Redun":
                sheet.write(0, 0, sn)
                sheet.write(1, 0, STORE.Redun_mean)
                sheet.write(2, 0, STORE.Redun_std)

        save_path = Path(r"D:\Chapter1\ALresult\ALBL")
        save_path = str(save_path.joinpath(name + ".xls"))
        workbook.save(save_path)
相关推荐
m0_748554813 小时前
golang如何实现用户订阅偏好管理_golang用户订阅偏好管理实现总结
jvm·数据库·python
smj2302_796826524 小时前
解决leetcode第3911题.移除子数组元素后第k小偶数
数据结构·python·算法·leetcode
阿正呀5 小时前
Redis怎样实现本地缓存的高效失效通知
jvm·数据库·python
2501_901200535 小时前
mysql如何设置InnoDB引擎参数_优化innodb_buffer_pool
jvm·数据库·python
_.Switch5 小时前
东方财富股票数据JS逆向:secids字段和AES加密实战
开发语言·前端·javascript·网络·爬虫·python·ecmascript
Mr_sst5 小时前
Claude Code 部署与使用保姆级教程(2026 最新)
python·ai
瞎某某Blinder5 小时前
DFT学习记录[6]基于 HES06的能带计算+有效质量计算
python·学习·程序人生·数据挖掘·云计算·学习方法
m0_495496416 小时前
mysql处理复杂SQL性能_InnoDB优化器与MyISAM差异
jvm·数据库·python
forEverPlume7 小时前
PHP怎么使用Eloquent Attribute Composition属性组合_Laravel通过组合构建复杂属性【方法】
jvm·数据库·python
Aleeeeex7 小时前
RAG 那点事:从 8 份企业文档到能用的问答系统,全过程拆给你看
人工智能·python·ai编程