karpathy Let's build GPT

1 introduction

Following karpathy's tutorial, we build a transformer step by step and deepen our understanding of its design along the way.

karpathy recommends pairing a Jupyter notebook for quick experiments with a Python script that holds the main network code.

2 Network implementation

2.1 Building the dataset

  • Read the text
```python
text = open("input.txt", "r", encoding='utf-8').read()
words = sorted(set(''.join(text)))
vocab_size = len(words)
print(f'vocab_size is: {vocab_size}')
print(''.join(words))
print(text[:1000])
```

vocab_size is: 65

!$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

  • Map characters to integers
```python
stoi = {ch : i for i, ch in enumerate(words)}
itos = {i : ch for i, ch in enumerate(words)}

encode = lambda s: [stoi[ch] for ch in s]
decode = lambda l: ''.join([itos[i] for i in l])
print(encode("hii")) 
print(decode(encode("hii")))
```

[46, 47, 47]

hii

  • Build the dataset
```python
import torch
# build the dataset tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(len(data))
n = int(len(data) * 0.9)
train_data = data[:n]
val_data = data[n:]
print(train_data[:1000])
```

1115394

tensor([18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44,
        53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63,
        1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1,
        57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49,

  • Build the batch loader
```python
import torch
batch_size = 4
block_size = 8  # maximum context length
torch.manual_seed(1337)
def get_batch(split):
    datasets = {
        'train': train_data,
        'val': val_data,
    }[split]
    ix = torch.randint(0, len(datasets) - block_size, (batch_size,))
    x = torch.stack([datasets[i:i+block_size] for i in ix])
    y = torch.stack([datasets[1+i:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print(f'x shape is: {xb.shape}, y shape is: {yb.shape}')
print(f'x is {xb}')
print(f'y is {yb}')
```

x shape is: torch.Size([4, 8]), y shape is: torch.Size([4, 8])

x is tensor([[24, 43, 58, 5, 57, 1, 46, 43],
        [44, 53, 56, 1, 58, 46, 39, 58],
        [52, 58, 1, 58, 46, 39, 58, 1],
        [25, 17, 27, 10, 0, 21, 1, 54]])
y is tensor([[43, 58, 5, 57, 1, 46, 43, 39],
        [53, 56, 1, 58, 46, 39, 58, 1],
        [58, 1, 58, 46, 39, 58, 1, 46],
        [17, 27, 10, 0, 21, 1, 54, 39]])

2.2 Building the pipeline

  • Define the simplest possible model
```python
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        self.out = self.token_embedding_table(idx)
        return self.out
    
xb, yb = get_batch('train')
model = BigramLanguageModel(vocab_size)
out = model(xb)
print(f'x shape is: {xb.shape}')
print(f'out shape is: {out.shape}')
```

x shape is: torch.Size([4, 8])

out shape is: torch.Size([4, 8, 65])

  • The complete pipeline, including the loss and generation:
```python
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)  # B, T, C
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # flatten (B, T, C) to (B*T, C) for cross_entropy
            targets = targets.view(B*T) # targets go from (B, T) to (B*T,)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]  # (B, C): keep only the last time step
            prob = F.softmax(logits, dim=-1)  # softmax over the vocabulary dimension
            ix = torch.multinomial(prob, num_samples=1) # (B, 1)
            print(idx)
            idx = torch.cat((idx, ix), dim=1)   # B,T+1
            print(idx)
        return idx

xb, yb = get_batch('train')
model = BigramLanguageModel(vocab_size)
out, loss = model(xb)
print(f'x shape is: {xb.shape}')
print(f'out shape is: {out.shape}')

idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(idx, max_new_tokens=10)[0].tolist()))
```

x shape is: torch.Size([4, 8])

out shape is: torch.Size([4, 8, 65])

tensor([[0]])
tensor([[ 0, 50]])
tensor([[ 0, 50]])
tensor([[ 0, 50, 7]])
tensor([[ 0, 50, 7]])
tensor([[ 0, 50, 7, 29]])
tensor([[ 0, 50, 7, 29]])
tensor([[ 0, 50, 7, 29, 37]])
tensor([[ 0, 50, 7, 29, 37]])
tensor([[ 0, 50, 7, 29, 37, 48]])
tensor([[ 0, 50, 7, 29, 37, 48]])
tensor([[ 0, 50, 7, 29, 37, 48, 58]])
tensor([[ 0, 50, 7, 29, 37, 48, 58]])
tensor([[ 0, 50, 7, 29, 37, 48, 58, 5]])
tensor([[ 0, 50, 7, 29, 37, 48, 58, 5]])
tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15]])
tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15]])
tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15, 24]])
tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15, 24]])
tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15, 24, 12]])

l-QYjt'CL?

There are a few things to note here. First, the input and target batches are:

x is tensor([[24, 43, 58, 5, 57, 1, 46, 43],
        [44, 53, 56, 1, 58, 46, 39, 58],
        [52, 58, 1, 58, 46, 39, 58, 1],
        [25, 17, 27, 10, 0, 21, 1, 54]])
y is tensor([[43, 58, 5, 57, 1, 46, 43, 39],
        [53, 56, 1, 58, 46, 39, 58, 1],
        [58, 1, 58, 46, 39, 58, 1, 46],
        [17, 27, 10, 0, 21, 1, 54, 39]])

Also note that with this pipeline the model imposes no limit on the input length.
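The target y is simply x shifted one position to the left, so a single (x, y) pair of length block_size packs block_size training examples. A small sketch in the spirit of the tutorial (not code from the original post) makes this explicit:

```python
# For each position t, the context x[b, :t+1] is used to predict the target y[b, t].
xb, yb = get_batch('train')
for b in range(1):                      # look at the first row of the batch only
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'when input is {context.tolist()} the target is {target.item()}')
```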

  • Start training
    At this point we need a complete training script. If we keep using the Jupyter notebook, every change to one component of the network forces us to re-run many cells, which is tedious, so we move the code into a .py file.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# hyperparameters
batch_size = 32
block_size = 8
max_iter = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
# ---------------------

torch.manual_seed(1337)

text = open("input.txt", "r", encoding='utf-8').read()
chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = {ch : i for i, ch in enumerate(chars)}
itos = {i : ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[ch] for ch in s]
decode = lambda l: ''.join([itos[i] for i in l])

# build the dataset tensor
data = torch.tensor(encode(text), dtype=torch.long)
n = int(len(data) * 0.9)
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    datasets = {
        'train': train_data,
        'val': val_data,
    }[split]
    ix = torch.randint(0, len(datasets) - block_size, (batch_size,))
    x = torch.stack([datasets[i:i+block_size] for i in ix])
    y = torch.stack([datasets[1+i:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # import pdb; pdb.set_trace()
        logits = self.token_embedding_table(idx)  # B, T, C
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # flatten (B, T, C) to (B*T, C) for cross_entropy
            targets = targets.view(B*T) # targets go from (B, T) to (B*T,)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)    
            logits = logits[:, -1, :]  # B, C    
            prob = F.softmax(logits, dim=-1)  # softmax over the vocabulary dimension
            ix = torch.multinomial(prob, num_samples=1) # B, 1
            # print(idx)
            idx = torch.cat((idx, ix), dim=1)   # B,T+1
            # print(idx)
        return idx

model = BigramLanguageModel(vocab_size)
m = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

lossi = []
for iter in range(max_iter):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f'step {iter}: train loss {losses["train"]:.4f}, val loss {losses["val"]:.4f}')
    xb, yb = get_batch('train')
    out, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
```

The output:

step 0: train loss 4.7305, val loss 4.7241
step 300: train loss 2.8110, val loss 2.8249
step 600: train loss 2.5434, val loss 2.5682
step 900: train loss 2.4932, val loss 2.5088
step 1200: train loss 2.4863, val loss 2.5035
step 1500: train loss 2.4665, val loss 2.4921
step 1800: train loss 2.4683, val loss 2.4936
step 2100: train loss 2.4696, val loss 2.4846
step 2400: train loss 2.4638, val loss 2.4879
step 2700: train loss 2.4738, val loss 2.4911

CEThik brid owindakis b, bth

HAPet bobe d e.

S:

O:3 my d?

LUCous:

Wanthar u qur, t.

War dXENDoate awice my.

Hastarom oroup

Yowhthetof isth ble mil ndill, ath iree sengmin lat Heriliovets, and Win nghir.

Swanousel lind me l.

HAshe ce hiry:

Supr aisspllw y.

Hentofu n Boopetelaves

MPOLI s, d mothakleo Windo whth eisbyo the m dourive we higend t so mower; te

AN ad nterupt f s ar igr t m:

Thin maleronth,

Mad

RD:

WISo myrangoube!

KENob&y, wardsal thes ghesthinin couk ay aney IOUSts I&fr y ce.

J

2.3 self-attention

When processing the current character, we need it to communicate with the characters before it. The history can be treated as features, and the simplest way to aggregate them is to take the mean of the previous characters' features.

```python
# The simplest form of communication: average the current character with all the previous ones.
# The result can be seen as a feature summarizing the history.
a = torch.tril(torch.ones(3, 3))
print(a)
a = a / torch.sum(a, 1, keepdim=True)  # normalize each row to sum to 1
print(a)
```

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

The same masking can also be done with softmax:

```python
import torch.nn.functional as F
B, T, C = 4, 8, 2   # batch, time, channels for this toy example (matches the shapes printed below)
tril = torch.tril(torch.ones(T, T))   # lower-triangular causal mask
wei = torch.zeros(T, T)               # raw affinities (all zeros here, i.e. uniform attention)
wei = wei.masked_fill(tril == 0, float('-inf'))
print(wei)
wei = F.softmax(wei, dim=-1)
print(wei)
```

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

The result of the feature aggregation:

```python
x = torch.randn(B, T, C)   # toy token features; x plays the role of v
xbow2 = wei @ x  # (T, T) @ (B, T, C) --> (B, T, C)
print(xbow2.shape)
```

torch.Size([4, 8, 2])
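As a sanity check (an addition here, following the tutorial's derivation), the matrix form can be compared with an explicit loop that averages each token with its history:

```python
# Explicit-loop version of "average the current token with all previous tokens".
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]                 # (t+1, C): this token and everything before it
        xbow[b, t] = torch.mean(xprev, 0)  # average over the time dimension

print(torch.allclose(xbow, xbow2, atol=1e-6))  # should print True
```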

Adding a positional embedding, the forward pass now looks like this:

```python
    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, n_embd)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, n_embd)
        # positional encoding
        x = tok_emb + pos_emb   # (B, T, C) broadcasting
        logits = self.lm_head(x)  # B, T, C(vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # flatten (B, T, C) to (B*T, C) for cross_entropy
            targets = targets.view(B*T) # targets go from (B, T) to (B*T,)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
```

Some takeaways from karpathy:

  • Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
  • There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
  • Each example across batch dimension is of course processed completely independently and never "talk" to each other
  • In an "encoder" attention block just delete the single line that does masking with tril, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
  • "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
  • "Scaled" attention additional divides wei by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

The attention formula is Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V; the scale factor is there to keep the variance unchanged when the two distributions are multiplied together.

```python
B, T, head_size = 4, 8, 16   # head_size = 16 here, so the unscaled wei.var() below comes out near 16
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1)
wei_scale = wei / head_size**0.5
print(k.var())
print(q.var())
print(wei.var())
print(wei_scale.var())
```

Output:

tensor(1.0278)
tensor(0.9802)
tensor(15.9041)
tensor(0.9940)

Initialization has a big effect on the result: we want the input to softmax to be a distribution with small variance. Without the scale factor, softmax saturates towards a one-hot vector:

```python
# logits sharpened by a factor of 8, roughly what happens without the 1/sqrt(head_size) scale
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1)
```

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])
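For comparison (an addition, not in the original post), the same logits without the ×8 sharpening stay much more diffuse:

```python
# Unscaled logits: softmax stays close to uniform instead of collapsing onto one element.
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1))
# ≈ tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
```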

Now we make some changes to the original .py file:

```python
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        # import pdb; pdb.set_trace()
        B, T, C = x.shape    
        q = self.query(x)      #(B, T, C)
        k = self.key(x)        #(B, T, C)
        v = self.value(x)      #(B, T, C)
        wei = q @ k.transpose(-2, -1) * C**-0.5 # (B,T,C)@(B,C,T) --> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 
        wei = F.softmax(wei, dim=-1)   # (B, T, T)
        out = wei @ v   #(B, T, T) @ (B, T, C) --> (B, T, C)
        return out
```
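A quick shape check of the head (a sketch; it assumes n_embd = 32 and block_size = 8, the values used at this stage of the tutorial, which the post does not show explicitly):

```python
# Hypothetical quick test of Head in a notebook.
n_embd, block_size = 32, 8          # assumed values at this point in the tutorial
head = Head(16)                     # head_size = 16
x = torch.randn(4, block_size, n_embd)
print(head(x).shape)                # torch.Size([4, 8, 16])
```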

Then modify the model:

```python
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)  # a single head whose size equals n_embd, so the overall width is unchanged
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # import pdb; pdb.set_trace()
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # B, T, C(n_emb)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # T,C 
        # positional encoding
        x = tok_emb + pos_emb   # (B, T, C) broadcasting
        x = self.sa_head(x)
        logits = self.lm_head(x)  # B, T, C(vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # flatten (B, T, C) to (B*T, C) for cross_entropy
            targets = targets.view(B*T) # targets go from (B, T) to (B*T,)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]   # (B, T): crop the context to the last block_size tokens
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]  # B, C    
            prob = F.softmax(logits, dim=-1)  # softmax over the vocabulary dimension
            ix = torch.multinomial(prob, num_samples=1) # B, 1
            # print(idx)
            idx = torch.cat((idx, ix), dim=1)   # B,T+1
            # print(idx)
        return idx
```

The result with self-attention added:

step 4500: train loss 2.3976, val loss 2.4041

2.4 multi-head attention

This borrows the idea of group convolutions: run several smaller heads in parallel and concatenate their outputs.

```python
class MultiHeadAttention(nn.Module):
    """ multiple head of self attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self, x):
        return torch.cat([h(x) for h in self.heads], dim=-1)
```

To use it:

```python
self.sa_head = MultiHeadAttention(4, n_embd//4)  # 4 heads of size n_embd//4, so the total width is unchanged
```

Training result:

step 4500: train loss 2.2679, val loss 2.2789

2.5 feedforward network
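After self-attention has gathered information across positions, each token also needs some per-token computation, which is what the feed-forward network provides. The post does not show the FeedForward module itself; a minimal sketch following the tutorial (the 4× inner expansion and dropout appear in later sections) would be:

```python
class FeedForward(nn.Module):
    """ a simple per-token linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.net(x)
```

In the model it is applied right after the self-attention head: `self.ffwd = FeedForward(n_embd)` in `__init__` and `x = self.ffwd(x)` after `x = self.sa_head(x)` in `forward`.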


The result after adding the feed-forward layer:

step 4500: train loss 2.2337, val loss 2.2476

We then wrap this unit into a Block: a transformer block can be understood as a communication part (self-attention) followed by a computation part (feed-forward).

```python
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
    
    def forward(self, x):
        x = self.sa(x)
        x = self.ffwd(x)
        return x
```

And update the model definition accordingly:

```python
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # import pdb; pdb.set_trace()
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # B, T, C(n_emb)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # T,C 
        # positional encoding
        x = tok_emb + pos_emb   # (B, T, C) broadcasting
        x = self.blocks(x)
        logits = self.lm_head(x)  # B, T, C(vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # flatten (B, T, C) to (B*T, C) for cross_entropy
            targets = targets.view(B*T) # targets go from (B, T) to (B*T,)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
```

2.6 Residual network

The model is now quite deep, and training it directly may no longer converge well. We need another important tool: residual connections.

```python
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
    
    def forward(self, x):
        x = x + self.sa(x)
        x = x + self.ffwd(x)
        return x
```
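In the tutorial, adding residual connections goes together with a linear projection back into the residual pathway; this is where the `self.proj` used later in the dropout section comes from. A sketch of the corresponding change to MultiHeadAttention (assumed from the tutorial, not shown in the post):

```python
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel, plus an output projection """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)   # project the concatenated heads back into the residual pathway

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.proj(out)
```

The FeedForward block similarly grows an inner expansion to 4 * n_embd with a projection back to n_embd, matching the code shown later in the dropout section.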

With the extra depth, the model now overfits more easily:

step 4500: train loss 2.0031, val loss 2.1067

2.7 Layer normalization

Let's first look at the basic statistics behind batch norm. Suppose X and Y are two independent random variables, each with mean 0 and variance 1.

Then Var(XY) = E[X]^2·Var(Y) + E[Y]^2·Var(X) + Var(X)·Var(Y) = 1.

Now consider a matrix multiplication: each output entry is the sum of T2 independent, identically distributed products of this kind, where T2 is the inner dimension. By the central limit theorem their sum is approximately normal, with mean equal to the sum of the means and variance equal to the sum of the variances.

So after the matrix multiplication each entry of the first column has variance T2 * 1 and mean 0.

Scaling by 1/sqrt(T2) therefore normalizes the variance back to 1 (the square root appears because variance is a squared quantity).
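A quick numeric check of this argument (an addition, not from the original post):

```python
T2 = 16
X = torch.randn(10000, T2)          # unit-variance entries
Y = torch.randn(10000, T2)

print((X * Y).var())                        # Var(XY) ≈ 1
print((X * Y).sum(dim=1).var())             # sum of T2 such products: variance ≈ T2
print(((X * Y).sum(dim=1) / T2**0.5).var()) # ≈ 1 again after scaling by 1/sqrt(T2)
```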

```python
x = torch.ones(5,5)
x = torch.tril(x)
print(x)
print(x.mean(dim=0))
print(x.mean(dim=1))
```

Observe how the mean behaves along the two dimensions of the matrix:

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

tensor([1.0000, 0.8000, 0.6000, 0.4000, 0.2000])

tensor([0.2000, 0.4000, 0.6000, 0.8000, 1.0000])

```python
class BatchNorm1d:
  
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (trained with a running 'momentum update')
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)
  
  def __call__(self, x):
    # calculate the forward pass
    if self.training:
      if x.ndim == 2:
        dim = 0
      elif x.ndim == 3:
        dim = (0,1)
      xmean = x.mean(dim, keepdim=True) # batch mean
      xvar = x.var(dim, keepdim=True) # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # update the buffers
    if self.training:
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out
  
  def parameters(self):
    return [self.gamma, self.beta]
```

Batch norm normalizes over a column (one feature across the batch), while layer norm normalizes over a row (one example across its features). Changing the reduction dimension from 0 to 1 turns the class above into a layer norm; the running statistics are then no longer needed, since each example is normalized with its own mean and variance:

```python
class LayerNorm1d:  # the BatchNorm1d above, but with the reduction taken over dim=1 (per example)

  def __init__(self, dim, eps=1e-5):
    self.eps = eps
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # mean over the features of each example
    xvar = x.var(1, keepdim=True) # variance over the features of each example
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]
```
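A quick check (in the spirit of the tutorial) of what does and does not get normalized:

```python
torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100)      # a batch of 32 examples with 100 features each
x = module(x)
print(x[:, 0].mean(), x[:, 0].std())  # one feature across the batch: only roughly mean 0, std 1
print(x[0, :].mean(), x[0, :].std())  # one example across its features: mean 0, std 1
```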

The pattern more commonly used today is the pre-norm formulation: LayerNorm is applied before the self-attention and feed-forward sub-layers (the original Transformer paper applied it after).

The corresponding code:

```python
class Block(nn.Module):

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
```

A LayerNorm is also usually added after the stack of decoder blocks:
```python
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            nn.LayerNorm(n_embd),
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)
```

With layer normalization added, the loss improves a little more:

step 4500: train loss 1.9931, val loss 2.0892

The gap between the training loss and the validation loss is now fairly large, so we need to do something about the overfitting.

2.8 Using dropout

  • Use dropout on the attention weights inside each Head, so the model is not overly influenced by any particular feature; this improves robustness. (The __init__ additions these snippets rely on are sketched after this list.)
```python
    def forward(self, x):
        # import pdb; pdb.set_trace()
        B, T, C = x.shape    
        q = self.query(x)      #(B, T, C)
        k = self.key(x)        #(B, T, C)
        v = self.value(x)      #(B, T, C)
        wei = q @ k.transpose(-2, -1) * C**-0.5 # (B,T,C)@(B,C,T) --> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 
        wei = F.softmax(wei, dim=-1)   # (B, T, T)
        wei = self.dropout(wei)
        out = wei @ v   #(B, T, T) @ (B, T, C) --> (B, T, C)
        return out
```
  • Use dropout in the multi-head module as well, for the same reason: keep any single feature from dominating the model.
```python
    def forward(self, x):
        out =  torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
```
  • Use dropout just before the output of the computation unit (the feed-forward block).
```python
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
```
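The forward passes above assume that the corresponding `__init__` methods were extended with dropout members (together with the `proj` layer introduced with the residual connections). A sketch of those additions, following the tutorial:

```python
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)     # dropout applied to the attention weights
    # forward as shown above

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)  # projection back into the residual pathway
        self.dropout = nn.Dropout(dropout)     # dropout applied to the projected output
    # forward as shown above
```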

Finally, scale up the hyperparameters:

```python
# hyperparameters
batch_size = 64
block_size = 256
max_iter = 5000
eval_interval = 500
learning_rate = 3e-4    # self-attention can't tolerate a very high learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_layer = 6
n_head = 6
dropout = 0.2
```
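With n_layer and n_head now set as hyperparameters, the hard-coded stack of three blocks is presumably replaced by a loop; a sketch of that change (an assumption based on the tutorial, not shown in the post):

```python
# Inside BigramLanguageModel.__init__:
self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
self.ln_f = nn.LayerNorm(n_embd)        # final layer norm after the blocks
self.lm_head = nn.Linear(n_embd, vocab_size)
```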

step 4500: train loss 1.1112, val loss 1.4791

References

[1] https://www.youtube.com/watch?v=kCc8FmEb1nY
