import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
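
# Hand-rolled Transformer (encoder-decoder) in PyTorch, following the
# architecture of "Attention Is All You Need".

# Token embedding: maps token ids to d_model-dimensional vectors and scales
# them by sqrt(d_model), as in the original paper.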
class Embeddings(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embeddings = nn.Embedding(self.vocab_size, self.d_model)

    def forward(self, x):
        return self.embeddings(x) * math.sqrt(self.d_model)
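
# Sinusoidal positional encoding: even dimensions use sin(pos / 10000^(2i/d_model)),
# odd dimensions use cos. The table is precomputed for max_len positions and
# registered as a buffer (it moves with the module but is not a trainable parameter).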
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, dropout):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)
        self.max_len = max_len
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
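
# Scaled dot-product attention: softmax(Q @ K^T / sqrt(d_k)) @ V.
# Positions where mask == 0 are filled with -1e9 before the softmax so they
# receive (almost) zero attention weight.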
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # masked_fill is not in-place, so the result must be assigned back
        scores = scores.masked_fill(mask == 0, -1e9)
    f_atten = F.softmax(scores, dim=-1)
    if dropout is not None:
        f_atten = dropout(f_atten)
    return f_atten @ value, f_atten


def clones(model, N):
    # N independent deep copies of a module, wrapped in a ModuleList
    return nn.ModuleList([copy.deepcopy(model) for _ in range(N)])
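
# Multi-head attention: Q, K and V are projected by three of the four linear
# layers, split into `heads` sub-spaces of size d_k = d_model // heads, attended
# to in parallel, then concatenated and passed through the last linear layer.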
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, heads, dropout):
        super(MultiHeadAttention, self).__init__()
        assert d_model % heads == 0
        self.d_model = d_model
        self.heads = heads
        self.d_k = d_model // heads
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        query, key, value = [
            model(x).view(batch_size, -1, self.heads, self.d_k).transpose(1, 2)
            for model, x in zip(self.linears, [query, key, value])
        ]
        # pass this module's dropout to the attention function
        x, atten = attention(query, key, value, mask, dropout=self.dropout)
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.linears[-1](x)
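
# Position-wise feed-forward network: two linear layers with a ReLU in between,
# applied independently at every position.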
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(FeedForward, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))
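
# Layer normalization over the feature (last) dimension, with a learnable
# scale (gamma) and shift (beta).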
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.d_model = d_model
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        x_mean = x.mean(-1, keepdim=True)
        x_std = x.std(-1, keepdim=True)
        x = (x - x_mean) / (x_std + self.eps)
        return x * self.gamma + self.beta
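
# Pre-norm residual connection: x + dropout(sublayer(norm(x))). Because the
# norm is applied before each sublayer, the Encoder and Decoder below apply a
# final LayerNorm to their stacked output.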
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.size = size
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNorm(size)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))
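
# One encoder layer: a self-attention sublayer followed by a feed-forward
# sublayer, each wrapped in a residual connection.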
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayers = clones(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        x = self.sublayers[0](x, lambda x: self.self_attn(x, x, x, mask=mask))
        x = self.sublayers[1](x, self.feed_forward)
        return x
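
# Encoder: a stack of N identical encoder layers, followed by a final LayerNorm.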
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
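
# One decoder layer: masked self-attention over the target, cross-attention
# over the encoder output (memory), then the feed-forward sublayer.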
class DecoderLayer(nn.Module):
    def __init__(self, size, self_atten, src_atten, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_atten = self_atten
        self.src_atten = src_atten
        self.feed_forward = feed_forward
        self.sublayers = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, self_mask, src_mask):
        x = self.sublayers[0](x, lambda x: self.self_atten(x, x, x, self_mask))
        x = self.sublayers[1](x, lambda x: self.src_atten(x, memory, memory, src_mask))
        x = self.sublayers[2](x, self.feed_forward)
        return x
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, self_mask, src_mask):
        for layer in self.layers:
            x = layer(x, memory, self_mask, src_mask)
        return self.norm(x)
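
# Generator: final linear projection to the vocabulary size followed by
# log-softmax, producing log-probabilities over output tokens.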
class Generator(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Generator, self).__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        return F.log_softmax(self.linear(x), dim=-1)
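
# Full encoder-decoder model: embeds the source and target sequences, encodes
# the source, decodes the target against the encoder memory, and projects to
# the output vocabulary with the generator.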
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embed = source_embed
        self.target_embed = target_embed
        self.generator = generator

    def forward(self, source, target, source_mask, target_mask):
        # encode the source with the source mask; the decoder self-attention
        # uses the target mask and the cross-attention uses the source mask
        memory = self.encode(source, source_mask)
        return self.generator(self.decode(target, memory, target_mask, source_mask))

    def encode(self, source, source_mask):
        return self.encoder(self.source_embed(source), source_mask)

    def decode(self, target, memory, target_mask, source_mask):
        return self.decoder(self.target_embed(target), memory, target_mask, source_mask)
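
# A minimal helper sketch (not in the original post) for building the causal
# mask used by decoder self-attention: each position may only attend to itself
# and earlier positions. test_model() below builds the same kind of mask
# inline with torch.tril.
def subsequent_mask(size):
    # lower-triangular (1, size, size) matrix of 0/1; 1 = attend, 0 = masked out
    return torch.tril(torch.ones(1, size, size)).type(torch.uint8)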
def test():
    vocab_size = 10000
    d_model = 512
    dropout = 0.2
    x = torch.randint(0, 5, (3, 4))
    embeddings = Embeddings(vocab_size, d_model)
    xembeddings = embeddings(x)
    print(xembeddings.size())
    position = PositionalEncoding(d_model, max_len=10, dropout=dropout)
    xposition = position(xembeddings)
    print(xposition.size())
    src_atten = self_atten = MultiHeadAttention(d_model, heads=8, dropout=dropout)
    x_atten = self_atten(xposition, xposition, xposition)
    print(x_atten.size())
    feed = FeedForward(d_model, 1024, dropout=dropout)
    encoderlayer = EncoderLayer(d_model, self_atten, feed, dropout)
    encoder = Encoder(encoderlayer, 6)
    result = encoder(xposition, mask=None)
    print(result.size())
    print('-' * 100)
    decoderlayer = DecoderLayer(d_model, self_atten, src_atten, feed, dropout)
    decoder = Decoder(decoderlayer, 6)
    result = decoder(result, result, self_mask=None, src_mask=None)
    generator = Generator(d_model, vocab_size)
    result = generator(result)
    print(result.size())
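
# make_model wires the whole architecture together: deep copies of the shared
# attention / feed-forward / positional-encoding modules go into the encoder,
# the decoder and the two embedding pipelines, and all parameters with more
# than one dimension are initialized with Xavier uniform.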
def make_model(source_vocab, target_vocab, N, d_model, d_ff, head, dropout):
    c = copy.deepcopy
    atten = MultiHeadAttention(d_model, head, dropout)
    ff = FeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, max_len=10, dropout=dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(size=d_model, self_attn=c(atten), feed_forward=c(ff), dropout=dropout), N),
        Decoder(DecoderLayer(size=d_model, self_atten=c(atten), src_atten=c(atten), feed_forward=c(ff), dropout=dropout), N),
        nn.Sequential(Embeddings(source_vocab, d_model), c(position)),
        nn.Sequential(Embeddings(target_vocab, d_model), c(position)),
        generator=Generator(d_model, target_vocab))
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
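
# End-to-end smoke test: build a small model, run a (source, target) batch
# through it with lower-triangular masks, and print the output shape.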
def test_model():
    source_vocab = 1000
    target_vocab = 1000
    N = 6
    d_model = 512
    d_ff = 1024
    head = 8
    dropout = 0.2
    model = make_model(source_vocab, target_vocab, N, d_model, d_ff, head, dropout)
    source = target = torch.LongTensor([[1, 2, 3, 8], [3, 4, 1, 8]])
    # Assume src_mask and tgt_mask are identical here; in practice they differ.
    source_mask = target_mask = torch.tril(torch.ones(size=(1, 8, 4, 4))).type(torch.uint8)
    mydata = model(source, target, source_mask, target_mask)
    print('mydata.shape--->', mydata.shape)
    print('mydata--->', mydata)
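
# A minimal greedy-decoding sketch (not in the original post), assuming token
# id 1 is the start symbol; it exercises the encode / decode / generator
# methods defined above to extend the target one token at a time.
def greedy_decode(model, source, max_len=10, start_symbol=1):
    # all-ones source mask (no padding assumed), shaped to broadcast over the
    # (batch, heads, query_len, key_len) attention scores
    source_mask = torch.ones(1, 1, 1, source.size(1)).type(torch.uint8)
    memory = model.encode(source, source_mask)
    # start every sequence in the batch with the assumed start symbol
    ys = torch.full((source.size(0), 1), start_symbol).type_as(source)
    for _ in range(max_len - 1):
        # causal mask over the tokens generated so far
        target_mask = torch.tril(torch.ones(1, ys.size(1), ys.size(1))).type(torch.uint8)
        out = model.decode(ys, memory, target_mask, source_mask)
        prob = model.generator(out[:, -1])             # log-probs for the newest position
        next_word = prob.argmax(dim=-1, keepdim=True)  # greedy pick
        ys = torch.cat([ys, next_word], dim=1)
    return ys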

if __name__ == '__main__':
    test_model()