Questions to cover:
- The usual steps of deep learning training
- A few directions for thinking about model fine-tuning
- Building your own dataset for fine-tuning a large model
- Implementations of various fine-tuning approaches
## Writing the Code
Let's start with a simple deep learning example, Chinese handwritten character recognition, to walk through the training process. It is implemented in both PyTorch and TensorFlow so the different styles of the two libraries can be compared.
Talk is cheap. Show me the code.
Data location: 15,000 images in total, which can be downloaded from kaggle.com: www.kaggle.com/datasets/gp...
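Before going further, a quick sanity check can confirm that the download matches what the code below expects: 15,000 files of 64x64 images. A minimal sketch, assuming the archive has been extracted into a local `data/chinese_handwritting/images` folder (the same layout as the path enum defined later):

```python
import os
from PIL import Image

# Hypothetical extraction path; adjust to wherever the Kaggle archive was unpacked
image_dir = os.path.join(os.getcwd(), "data", "chinese_handwritting", "images")

files = os.listdir(image_dir)
print(len(files))  # expected: 15000

# Open one sample and check its dimensions; the models below assume 64x64, single-channel images
sample = Image.open(os.path.join(image_dir, files[0]))
print(sample.size, sample.mode)  # expected size: (64, 64)
```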

The last number in each image file name maps to the character shown in the image as follows:

`"零一二三四五六七八九十百千万亿"[index]` -> file-name number `index + 1`

As usual, define the data paths first:

```python
import os
from enum import Enum

cur_path = os.getcwd()

class DataPathEnum(str, Enum):
    ZH_HANDWRITTING_IMG_DIR = "chinese_handwritting/images"
    GPT2XL_TRAIN_DATA_DIR = "gpt2xl"
    MODEL_CHECKPOINT_DIR = "checkpoints"

    def __str__(self):
        return os.path.join(cur_path, "data", self.value)
```

Then define a data base class that both libraries can share:

```python
import matplotlib.pyplot as plt
from PIL import Image

IMAGE_DIR = str(DataPathEnum.ZH_HANDWRITTING_IMG_DIR)

# image = 64 * 64
class HWData():
    def __init__(self) -> None:
        self.image_files = os.listdir(IMAGE_DIR)
        self.character_str = "零一二三四五六七八九十百千万亿"
        self.image_folder: str = IMAGE_DIR

    # Get the image path and its label
    def get_image_path_vs_lable(self, index):
        image_file = self.image_files[index]
        image_path = os.path.join(self.image_folder, image_file)
        label = int(image_file.split(".")[0].split("_")[-1]) - 1
        return label, image_path

    # Show the image and its label in the ipynb file, to compare against the predicted result
    def plot_image(self, index):
        label, image_path = self.get_image_path_vs_lable(index)
        image = Image.open(image_path)
        plt.title("label: " + str(label) + "/" + self.character_str[label])
        plt.imshow(image)
```

Start with the PyTorch side: subclass torch.utils.data.Dataset to implement a custom dataset, and bring in the torchvision library for image processing.

```python
from random import randint
from typing import Any

import torch
import torchvision
from torch.utils.data import Dataset

class HWDataset(HWData, Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.transform = torchvision.transforms.ToTensor()

    # Required Dataset method
    def __len__(self):
        return len(self.image_files)

    # Required Dataset method
    def __getitem__(self, index) -> Any:
        label, image_path = self.get_image_path_vs_lable(index)
        image_tensor = self.transform(Image.open(image_path))
        # One-hot label vector
        target = torch.zeros((15))
        target[label] = 1.0
        return image_tensor, target, self.character_str[label]

    # Pick a random image, for validation and comparison
    def get_random_item(self):
        index = randint(0, self.__len__() - 1)
        return self.__getitem__(index), index
```

Subclass torch.nn.Module to implement the model class:

```python
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader

class TorchClassifier(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Only fully connected layers here, to compare later against a model
        # that adds convolution and pooling layers
        self.model = nn.Sequential(
            nn.Linear(64*64, 1000),  # image size is 64x64
            nn.LeakyReLU(0.02),
            nn.Linear(1000, 100),
            nn.LeakyReLU(0.02),
            nn.LayerNorm(100),
            nn.Linear(100, 15),
            nn.Softmax(dim=1)
        )
        # Optimizer
        self.optimiser = torch.optim.Adam(self.parameters(), lr=0.001)
        # Loss function
        self.loss = nn.BCELoss()
        # Dataset
        self.dataset = HWDataset()
        # Bookkeeping for the training process
        self.counter = 0
        self.progress = []
        # Path for saving the model parameters
        self.checkpoint_file = os.path.join(str(DataPathEnum.MODEL_CHECKPOINT_DIR), "zh_hw_torch.pth")

    # Required nn.Module method
    def forward(self, x):
        # The fully connected layers expect a flattened batch
        x = x.view(x.size(0), -1)  # equivalent to nn.Flatten()(x)
        return self.model(x)

    # One training step
    def _train(self, x, y):
        outputs = self.forward(x)
        loss = self.loss(outputs, y)
        # Optimization
        self.optimiser.zero_grad()  # reset, otherwise gradients accumulate across batches
        loss.backward()             # backpropagation
        self.optimiser.step()       # adjust the parameters of every layer

        # Record the loss value during training
        self.counter += 1
        if (self.counter % 10 == 0):
            self.progress.append(loss.item())

    # Train for the given number of epochs
    def train(self, epochs: int):
        print('start train model...')
        for epoch in range(epochs):
            data_loader = DataLoader(self.dataset, batch_size=100, shuffle=True)
            for index, (data, target, target_char) in enumerate(data_loader):
                self._train(data, target)
        self._plot_progress()  # plot the loss values recorded during training

    # Pick a random image and inspect the prediction
    def random_eval_model(self):
        (data, target, _), index = self.dataset.get_random_item()
        self.dataset.plot_image(index)
        with torch.no_grad():
            output = self.forward(data)
        df = pd.DataFrame(output.detach().numpy()).T
        df.plot.barh(rot=0, legend=False, ylim=(0, 15), xlim=(0, 1))

    def _plot_progress(self):
        df = pd.DataFrame(self.progress, columns=["loss"])
        df.plot(title="counter:" + str(self.counter), ylim=(0, 1.0), figsize=(16, 8),
                alpha=0.5, marker=".", grid=True, yticks=(0, 0.25, 0.5))

    # Save the model parameters
    def save_model_state(self):
        torch.save(self.model.state_dict(), self.checkpoint_file)
```
Write an ipynb file to train the model:

```python
from aitrain.classifier_torch import HWDataset, TorchClassifier

# Check the dataset
dataset = HWDataset()
dataset.plot_image(2)

# Train the model and save its parameters
classifier = TorchClassifier()
classifier.train(3)
classifier.save_model_state()
```

```python
# Randomly pick an image to validate and compare the prediction
classifier.random_eval_model()
```

Now implement the same modules and functionality with TensorFlow 2, to compare how working with the two libraries differs.

Subclass tensorflow.keras.utils.Sequence to implement the dataset:

```python
import math
import random

import numpy as np
import tensorflow as tf

class TFDataset(HWData, tf.keras.utils.Sequence):
    def __init__(self, batch_size) -> None:
        super().__init__()
        self.batch_size = batch_size

    # Required Sequence method; returns the number of batches
    def __len__(self):
        return math.ceil(len(self.image_files) / self.batch_size)

    # Required Sequence method; returns one batch of data
    def __getitem__(self, idx):
        low = idx * self.batch_size
        high = min(low + self.batch_size, len(self.image_files))
        image_list = []
        label_list = []
        for index in range(low, high):
            label, image_path = self.get_image_path_vs_lable(index)
            image_array = np.array(list(Image.open(image_path).getdata()))
            image_list.append(image_array)
            # One-hot label
            target = [0 if label != i else 1 for i in range(15)]
            label_list.append(target)
        # Returns a batch; note that the shape must match the model's input
        return np.array(image_list).reshape(-1, 64, 64, 1), np.array(label_list)

    # Required Sequence method, called at the end of each training epoch
    def on_epoch_end(self):
        random.shuffle(self.image_files)
```

Build the TF model with tensorflow.keras.Sequential. Convolution and pooling layers are added here simply to compare the prediction accuracy of the two models.

```python
class TFClassifier():
    def __init__(self, data: TFDataset) -> None:
        model = tf.keras.Sequential()
        layers = tf.keras.layers
        model.add(layers.Rescaling(1./255))  # scale pixel values to 0 ~ 1
        model.add(layers.Conv2D(filters=32, kernel_size=(3, 3), strides=(1, 1),
                                activation='relu', input_shape=(64, 64, 1)))
        model.add(layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
        model.add(layers.Conv2D(filters=64, kernel_size=(3, 3), strides=(1, 1), activation='relu'))
        model.add(layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
        model.add(layers.Conv2D(filters=128, kernel_size=(3, 3), strides=(1, 1), activation='relu'))
        model.add(layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
        model.add(layers.Flatten())  # the flatten layer bridges the conv layers and the dense layers
        model.add(layers.Dense(units=1024, activation='relu'))
        model.add(layers.Dropout(0.2))  # drop some features
        model.add(layers.Dense(units=256, activation='relu'))
        model.add(layers.Dense(units=15, activation='softmax'))
        model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
        self.model = model
        self.data = data
        self.checkpoint_file = os.path.join(str(DataPathEnum.MODEL_CHECKPOINT_DIR), "zh_hw_tf.h5")

    # Train the model
    def train(self, epochs):
        his = self.model.fit(self.data, epochs=epochs, verbose=2).history
        # self.__plot_history(his)
        self.model.summary()  # model summary

    # Pick a random image and compare the prediction
    def random_eval_model(self):
        index = random.randint(0, len(self.data.image_files) - 1)
        self.data.plot_image(index)
        label, image_path = self.data.get_image_path_vs_lable(index)
        image_array = np.array(list(Image.open(image_path).getdata())).reshape(-1, 64, 64, 1)
        prediction = self.model.predict(image_array)
        df = pd.DataFrame(prediction[0])
        df.plot.barh(rot=0, legend=False, ylim=(0, 15), xlim=(0, 1))

    # Save the model weights
    def save_model_weights(self):
        self.model.save_weights(self.checkpoint_file)

    def load_model_weights(self):
        self.model.load_weights(self.checkpoint_file)

    # Plot the training history; no validation set is loaded here, so there are no validation metrics
    def __plot_history(self, history):
        fig, axs = plt.subplots(1, 2, figsize=(10, 5))
        axs[0].plot(history['loss'], label='training loss')
        # axs[0].plot(history['val_loss'], label='validation loss')
        axs[0].legend(loc='upper left')
        axs[0].set_title('training data vs validation data')
        axs[1].plot(history['accuracy'], label='training accuracy')
        # axs[1].plot(history['val_accuracy'], label='validation accuracy')
        axs[1].set_ylim([0, 1])
        axs[1].legend(loc='upper left')
        axs[1].set_title('accuracy')
        axs.flat[0].set(xlabel='epochs', ylabel='loss')
        axs.flat[1].set(xlabel='epochs', ylabel='accuracy')
        plt.show()
```
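The commented-out val_loss and val_accuracy curves in __plot_history would have data to plot if a separate validation Sequence were passed to fit. A small sketch of that variation, holding out a hypothetical 10% of the image files (this split is not part of the original code):

```python
import random

# Hypothetical split: hold out 10% of the image files for validation
train_data = TFDataset(100)
val_data = TFDataset(100)
random.shuffle(train_data.image_files)
split = int(len(train_data.image_files) * 0.9)
val_data.image_files = train_data.image_files[split:]
train_data.image_files = train_data.image_files[:split]

classifier = TFClassifier(train_data)
# Keras records val_loss / val_accuracy once validation_data is supplied
history = classifier.model.fit(train_data, validation_data=val_data, epochs=3, verbose=2).history
```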
Write an ipynb file to train the model:

```python
from aitrain.classifier_tensorflow import TFDataset, TFClassifier

# Check the dataset
data = TFDataset(100)
data.plot_image(5)

# Train the model
classifier = TFClassifier(data)
classifier.train(3)
classifier.save_model_weights()
```

```python
classifier.random_eval_model()
```

From the way the datasets and models are built, it is clear that TensorFlow and PyTorch wrap things differently, yet both make it easy to turn your own ideas into working code. The transformers library discussed later integrates well with both deep learning frameworks.

From a problem-solving point of view, a model whose complexity matches the problem strikes a good balance between training time and prediction accuracy, which is why better optimization methods and ways of thinking have always been the driving force behind improving AI capability.

The next article will look at language models with huge parameter counts: how loading them at reduced parameter precision and adding low-rank A/B tuning matrices can cut the computing resources that full fine-tuning would otherwise require.

## Full Source Code