Hand-rolling a GPT without softmax
Code
```python
import pandas as pd
from tqdm import tqdm
import numpy as np
import paddle
# Feed-forward block
class FeedForward(paddle.nn.Layer):
    def __init__(self, hidden_dim):
        super(FeedForward, self).__init__()
        # Down-project to hidden_dim // 2, apply GELU, then project back up.
        self.fc_one = paddle.nn.Linear(hidden_dim, hidden_dim // 2, bias_attr=False)
        self.fc_two = paddle.nn.Linear(hidden_dim // 2, hidden_dim, bias_attr=False)
        self.gre = paddle.nn.GELU()

    def forward(self, feed_x):
        feed_x = self.fc_one(feed_x)
        feed_x = self.gre(feed_x)
        feed_x = self.fc_two(feed_x)
        return feed_x
# Attention layer (the softmax-free part)
class Attention(paddle.nn.Layer):
    def __init__(self, hidden_dim, heads):
        super(Attention, self).__init__()
        self.q = paddle.nn.Linear(hidden_dim, heads * hidden_dim, bias_attr=False)
        self.k = paddle.nn.Linear(hidden_dim, heads * hidden_dim, bias_attr=False)
        self.v = paddle.nn.Linear(hidden_dim, heads * hidden_dim, bias_attr=False)
        self.heads = heads

    def forward(self, sx):
        b, s, h = sx.shape
        # ReLU on q and k keeps the attention scores non-negative.
        q = paddle.nn.functional.relu(self.q(sx))
        k = paddle.nn.functional.relu(self.k(sx))
        v = self.v(sx)
        qk = q.reshape([b, s, self.heads, h]).transpose([0, 2, 1, 3]) @ k.reshape([b, s, self.heads, h]).transpose(
            [0, 2, 3, 1])
        # Upper-triangular mask; after the transpose below, each position only sees itself and earlier positions.
        mask = paddle.triu(paddle.ones([s, s]))
        # mask[mask == 0] = -np.inf
        qk_mask = qk * mask
        # Normalize by the masked sum instead of applying softmax.
        qk = qk_mask / (paddle.sum(qk_mask, -2).unsqueeze([-2]) + 0.00000000000001)
        qkv = qk.transpose([0, 1, 3, 2]) @ v.reshape([b, s, self.heads, h]).transpose([0, 2, 1, 3])
        # Merge the heads with a max-pool instead of an output projection.
        qkv = qkv.transpose([0, 2, 3, 1])
        qkv = paddle.nn.functional.max_pool1d(qkv.reshape([b, -1, self.heads]), self.heads).reshape([b, s, h])
        return qkv
class GPT(paddle.nn.Layer):
    def __init__(self, voc_size, hidden_dim, row_layers, lora=False):
        super(GPT, self).__init__()
        self.em = paddle.nn.Embedding(voc_size, hidden_dim)
        self.cv = Attention(hidden_dim, row_layers)
        self.feed = FeedForward(hidden_dim)
        # Extra feed-forward branch that plays the role of a LoRA-style adapter.
        self.lora = FeedForward(hidden_dim)
        self.lora_flag = lora
        self.out_layer = paddle.nn.Linear(hidden_dim, voc_size, bias_attr=False)
        self.layer_nor = paddle.nn.LayerNorm(hidden_dim, bias_attr=False)
        # self.p_next = paddle.to_tensor(list(range(voc_size))).astype("int64").reshape([1, -1])

    def forward(self, sx):
        if self.lora_flag:
            # Base layers run without gradients; only the lora branch is trained.
            with paddle.no_grad():
                sx = self.em(sx)
                sx += self.cv(sx)
                sx = self.layer_nor(sx)
                sx += self.feed(sx)
                sx = self.layer_nor(sx)
            sx += self.lora(sx)
            with paddle.no_grad():
                out = self.out_layer(sx)
        else:
            sx = self.em(sx)
            sx += self.cv(sx)
            sx = self.layer_nor(sx)
            sx += self.feed(sx)
            sx = self.layer_nor(sx)
            sx += self.lora(sx)
            out = self.out_layer(sx)
        return out

    def load_lora(self, lora_name):
        self.lora.load_dict(paddle.load(lora_name))

    def save_lora(self, lora_name):
        paddle.save(self.lora.state_dict(), lora_name)
def gen_basic_data():
    seq_len = 32
    with open("fixed_couplets_in.txt", "r", encoding="utf-8") as f:
        train_data = f.readlines()
    with open("fixed_couplets_out.txt", "r", encoding="utf-8") as f:
        dev_data = f.readlines()
    train_data = [i.strip().split() for i in tqdm(train_data)]
    dev_data = [i.strip().split() for i in tqdm(dev_data)]
    train_data_list = []
    data_id_index = 0
    # Each sample: first line + "。" + second line + an id marker such as |_0_|.
    for i, j in tqdm(zip(train_data, dev_data)):
        one = i + ["。"] + j + list("|_{}_|".format(data_id_index))
        data_id_index += 1
        train_data_list += one
    seq_len_count = 1
    # Slide a window of seq_len tokens over the flat token stream and collect the vocabulary.
    with open("train_data_list.txt", "a", encoding="utf-8") as f:
        voc = dict()
        for i in tqdm(range(0, len(train_data_list), seq_len)):
            if i > 0:
                j = i + seq_len
                one = train_data_list[i - seq_len_count:j - seq_len_count]
                seq_len_count += 1
            else:
                j = i + seq_len
                one = train_data_list[i:j]
            if len(one) == seq_len:
                f.write(str(one) + "\n")
                for k in one:
                    voc[k] = ""
    del train_data_list
    del train_data
    del dev_data
    voc = ["<|pad|>"] + list(voc.keys())
    voc_dict = {k: v for v, k in enumerate(voc)}
    pd.to_pickle(voc, "voc_data.pandas_pickle")
    # Map every token to its id and store the final training set.
    with open("train_data_list.txt", "r", encoding="utf-8") as f:
        train_data = f.readlines()
    train_data_list = [[voc_dict[j] for j in eval(i)] for i in tqdm(train_data)]
    pd.to_pickle(train_data_list, "train_data.pandas_pickle")
def train_data():
    voc_id = pd.read_pickle("voc_data.pandas_pickle")
    net = GPT(len(voc_id) + 1, 128, 2)
    loss_func = paddle.nn.CrossEntropyLoss(ignore_index=-1)
    opt = paddle.optimizer.Adam(learning_rate=0.0001, parameters=net.parameters())
    bar = tqdm(range(1700))
    batch_size = 1200
    data_set = pd.read_pickle("train_data.pandas_pickle")
    acc_list = []
    for epoch in bar:
        np.random.shuffle(data_set)
        for i in range(0, len(data_set), batch_size):
            j = i + batch_size
            data = paddle.to_tensor(data_set[i:j]).astype("int64")
            # Next-token prediction: labels are the inputs shifted left by one position.
            label = data[:, 1:]
            input_data = data[:, :-1]
            out = net(input_data)
            loss = loss_func(out.reshape([-1, out.shape[-1]]), label.reshape([-1]))
            acc = paddle.metric.accuracy(out.reshape([-1, len(voc_id) + 1]), label.reshape([-1, 1]))
            acc_list.append(acc.item())
            bar.set_description(
                "epoch___{}___step___{}_loss___{:.5f}_acc__{:.5f}__{:.5f}".format(
                    epoch, j, loss.item(),
                    np.mean(acc_list), (paddle.argmax(out, -1) == label).numpy().mean()))
            opt.clear_grad()
            loss.backward()
            opt.step()
        paddle.save(net.state_dict(), "model_{}.paddle".format(epoch))
def train_data_lora(lora_one_name):
    voc_id = pd.read_pickle("voc_data.pandas_pickle")
    # lora_flag=True: base layers run under no_grad, only the lora branch gets gradients.
    net = GPT(len(voc_id) + 1, 128, 2, True)
    net.load_dict(paddle.load("basic.paddle"))
    loss_func = paddle.nn.CrossEntropyLoss(ignore_index=-1)
    opt = paddle.optimizer.Adam(learning_rate=0.00001, parameters=net.parameters())
    bar = tqdm(range(1700))
    batch_size = 1200
    data_set = pd.read_pickle("train_data.pandas_pickle")
    # plt.ion()
    acc_list = []
    for epoch in bar:
        np.random.shuffle(data_set)
        for i in range(0, len(data_set), batch_size):
            j = i + batch_size
            data = paddle.to_tensor(data_set[i:j]).astype("int64")
            # Shifted next-token labels, same as in train_data.
            label = data[:, 1:]
            input_data = data[:, :-1]
            out = net(input_data)
            loss = loss_func(out.reshape([-1, out.shape[-1]]), label.reshape([-1]))
            acc = paddle.metric.accuracy(out.reshape([-1, len(voc_id) + 1]), label.reshape([-1, 1]))
            acc_list.append(acc.item())
            bar.set_description(
                "epoch___{}___step___{}_loss___{:.5f}_acc__{:.5f}__{:.5f}".format(
                    epoch, j, loss.item(),
                    np.mean(acc_list),
                    (paddle.argmax(out, -1) == label).numpy().mean()))
            opt.clear_grad()
            loss.backward()
            opt.step()
        paddle.save(net.lora.state_dict(), "model_{}.paddle".format(lora_one_name))
if __name__ == '__main__':
    # gen_basic_data()
    train_data()
    # net = CvFoBlock(256, 2,8)
    # net(paddle.randn([3, 5, 256]))
    # eval_data()
```
Code walkthrough
The code defines a GPT-style model built from three classes: FeedForward, Attention, and GPT.
FeedForward is a small feed-forward network (a down-projection, a GELU, and an up-projection) that applies a non-linear transformation to its input.
Attention implements a multi-head attention mechanism that computes attention weights over the input. Instead of softmax, the scores come from ReLU-activated q and k, are masked causally, and are divided by their sum, so each position's weights are non-negative and sum to one; the heads are then merged with a max-pool rather than an output projection.
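To make the softmax replacement concrete, here is a minimal standalone sketch (NumPy only, not part of the model above; the model applies ReLU to q and k and works in a transposed layout, but the effect is the same): non-negative scores are causally masked and divided by their row sum, giving weights that behave like softmax output without the exponential.

```python
import numpy as np

# Toy attention scores for one head over a sequence of 4 positions.
scores = np.random.randn(4, 4)

# Keep scores non-negative, mirroring the ReLU applied to q and k.
scores = np.maximum(scores, 0.0)

# Causal mask: position i may only attend to positions j <= i.
mask = np.tril(np.ones((4, 4)))
masked = scores * mask

# Divide by the masked row sum instead of applying softmax.
weights = masked / (masked.sum(-1, keepdims=True) + 1e-14)

print(weights)          # non-negative, zero above the diagonal
print(weights.sum(-1))  # each row sums to ~1 (or 0 if all its scores were zeroed)
```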
GPT is the model itself: an embedding layer, the attention layer, the feed-forward layer, a second feed-forward branch used as the "lora" adapter, and a linear output layer, with LayerNorm applied between the blocks. Token ids are embedded, passed through the attention and feed-forward blocks with residual connections, and the output layer maps the final hidden states to vocabulary logits.
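The script only covers training (eval_data is referenced in the main block but never defined). A minimal greedy-decoding sketch for trying the model out might look like the following; the helper name, the checkpoint name basic.paddle, and per-character tokenization of the prompt are assumptions, not part of the original code.

```python
def greedy_generate(prompt, steps=32):
    # Hypothetical helper: greedy decoding with the GPT class defined above.
    voc = pd.read_pickle("voc_data.pandas_pickle")
    voc_dict = {k: v for v, k in enumerate(voc)}
    net = GPT(len(voc) + 1, 128, 2)
    net.load_dict(paddle.load("basic.paddle"))  # assumed checkpoint name
    net.eval()
    # Assumes every prompt character already appears in the vocabulary.
    ids = [voc_dict[ch] for ch in prompt]
    for _ in range(steps):
        x = paddle.to_tensor([ids]).astype("int64")
        out = net(x)  # [1, seq_len, voc_size]
        next_id = int(paddle.argmax(out[0, -1]).item())
        ids.append(next_id)
    return "".join(voc[i] for i in ids if i < len(voc))
```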
During the forward pass the lora_flag switch decides how gradients flow rather than what gets loaded: when it is set, the embedding, attention, feed-forward, and output layers run inside paddle.no_grad(), so only the extra lora feed-forward branch (added as a residual on top of the frozen features) receives gradients; when it is not set, the whole network trains end to end. In both cases the final hidden states pass through the linear output layer to produce the prediction logits.
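In train_data_lora the optimizer is still handed every parameter, and the freezing comes only from those no_grad blocks. An alternative, shown below as an assumption rather than what the original does, is to give the optimizer only the lora branch's parameters, which makes the intent explicit and avoids keeping optimizer state for the frozen weights:

```python
voc = pd.read_pickle("voc_data.pandas_pickle")
net = GPT(len(voc) + 1, 128, 2, lora=True)
net.load_dict(paddle.load("basic.paddle"))  # base checkpoint, name assumed

# Only the lora branch is updated; the base GPT weights stay fixed.
opt = paddle.optimizer.Adam(learning_rate=1e-5, parameters=net.lora.parameters())
```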
The model also provides load_lora and save_lora helpers, which save and restore just the lora branch's weights during and after training.
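Usage of the two helpers is straightforward; the file name below is only an example:

```python
net.save_lora("couplet_style.lora")  # writes only the lora branch's weights
net.load_lora("couplet_style.lora")  # restores them into an existing GPT instance
```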