MindSpore 公开课 - GPT-2

GPT-2 Masked Self-Attention

GPT-2 Self-attention: 1- Creating queries, keys, and values
python 复制代码
# Toy dimensions for the walkthrough: these three values are the shape of
# the random input tensor built below.
batch_size = 1
seq_len = 10
embed_dim = 768

# Random float32 input of shape (batch_size, seq_len, embed_dim).
# NOTE(review): Tensor, np and mindspore are not imported in the visible
# snippet — presumably they come from an earlier cell; confirm.
x = Tensor(np.random.randn(batch_size, seq_len, embed_dim), mindspore.float32)

from mindnlp._legacy.functional import split
from mindnlp.models.utils.utils import Conv1D

# One fused projection constructed with sizes (3 * embed_dim, embed_dim).
# NOTE(review): presumably the arguments are (output features, input
# features), as in the Hugging Face GPT-2 Conv1D — confirm.
c_attn = Conv1D(3 * embed_dim, embed_dim)
# The projected tensor is split along axis 2 and unpacked into exactly three
# tensors: query, key and value.
# NOTE(review): assumes the second argument of split is the chunk size — confirm.
query, key, value = split(c_attn(x), embed_dim, axis=2)
# Bare expression: shows the three shapes when run in a notebook/REPL.
query.shape, key.shape, value.shape

def split_heads(tensor, num_heads, attn_head_size):
    """
    Split the last axis of ``tensor`` into separate attention heads.

    The last axis is reshaped into ``(num_heads, attn_head_size)`` and then
    axes 1 and 2 are swapped, so the head axis comes before the sequence axis.

    Args:
        tensor: Input tensor; the 4-axis permutation below implies it has
            three axes, e.g. (batch, seq_length, hidden_size).
        num_heads: Number of attention heads.
        attn_head_size: Number of features per head.

    Returns:
        Tensor laid out as (batch, head, seq_length, head_features).
    """
    leading_dims = tensor.shape[:-1]
    per_head = tensor.view(leading_dims + (num_heads, attn_head_size))
    # (batch, seq_length, head, head_features) -> (batch, head, seq_length, head_features)
    return ops.transpose(per_head, (0, 2, 1, 3))

# Number of attention heads, and the per-head feature size obtained by
# integer-dividing embed_dim by it (768 // 12 == 64 with the values above).
num_heads = 12
head_dim = embed_dim // num_heads

# Apply the same head split to each of query, key and value.
query = split_heads(query, num_heads, head_dim)
key = split_heads(key, num_heads, head_dim)
value = split_heads(value, num_heads, head_dim)

# Bare expression: shows the three shapes when run in a notebook/REPL.
query.shape, key.shape, value.shape
GPT-2 Self-attention: 2- Scoring
python 复制代码
# Raw attention scores: query multiplied by key with its last two axes swapped.
attn_weights = ops.matmul(query, key.swapaxes(-1, -2))

# Bare expression: shows the score tensor's shape in a notebook/REPL.
attn_weights.shape

# The mask only needs to cover the sequence length used in this walkthrough.
max_positions = seq_len

# Lower-triangular matrix of ones (np.tril), reshaped to
# (1, 1, max_positions, max_positions) and stored as a boolean tensor:
# entry [i, j] is True only when j <= i.
bias = Tensor(np.tril(np.ones((max_positions, max_positions))).reshape(
              (1, 1, max_positions, max_positions)), mindspore.bool_)
# Bare expression: shows the mask in a notebook/REPL.
bias
python 复制代码
from mindnlp._legacy.functional import where, softmax

# Scale the scores by the square root of value's last dimension.
attn_weights = attn_weights / ops.sqrt(ops.scalar_to_tensor(value.shape[-1]))
# Sequence lengths are read from the second-to-last axis of query and key.
query_length, key_length = query.shape[-2], key.shape[-2]
# Slice the mask to the last query_length rows of the first key_length rows
# and the first key_length columns.
# NOTE(review): bias is already built with mindspore.bool_, so the trailing
# .bool() looks redundant — confirm before removing.
causal_mask = bias[:, :, key_length - query_length: key_length, :key_length].bool()
# Fill value: the most negative finite float32, created with the scores' dtype.
mask_value = Tensor(np.finfo(np.float32).min, dtype=attn_weights.dtype)
# NOTE(review): presumably keeps attn_weights where causal_mask is True and
# uses mask_value elsewhere (standard `where` semantics) — confirm.
attn_weights = where(causal_mask, attn_weights, mask_value)

# Bare expression: shows the fill value used above.
np.finfo(np.float32).min

# Bare expression: inspects the masked scores at index [0, 0].
attn_weights[0, 0]


# Normalise the scores over the last axis.
attn_weights = softmax(attn_weights, axis=-1)
attn_weights.shape

# Bare expression: inspects the normalised weights at index [0, 0].
attn_weights[0, 0]

# Multiply the attention weights by value to get the attention output.
attn_output = ops.matmul(attn_weights, value)

attn_output.shape
GPT-2 Self-attention: 3.5- Merge attention heads
python 复制代码
def merge_heads(tensor, num_heads, attn_head_size):
    """
    Fold the per-head axes of ``tensor`` back into a single hidden axis.

    Axes 1 and 2 are swapped first, then the last two axes are collapsed
    into one axis of size ``num_heads * attn_head_size``.

    Args:
        tensor: Four-axis input tensor (the permutation below requires
            exactly four axes).
        num_heads: Number of attention heads.
        attn_head_size: Number of features per head.

    Returns:
        Tensor whose last axis has size ``num_heads * attn_head_size``.
    """
    # Swap axes 1 and 2 so the two axes being merged are adjacent and last.
    swapped = ops.transpose(tensor, (0, 2, 1, 3))
    hidden_size = num_heads * attn_head_size
    leading_dims = swapped.shape[:-2]
    return swapped.view(leading_dims + (hidden_size,))

# Collapse the per-head outputs back into one hidden axis of size
# num_heads * head_dim.
attn_output = merge_heads(attn_output, num_heads, head_dim)

# Bare expression: shows the merged shape in a notebook/REPL.
attn_output.shape
GPT-2 Self-attention: 4- Projecting
python 复制代码
# Final projection layer constructed with sizes (embed_dim, embed_dim),
# applied to the merged attention output.
c_proj = Conv1D(embed_dim, embed_dim)
attn_output = c_proj(attn_output)
# Bare expression: shows the projected output's shape in a notebook/REPL.
attn_output.shape
相关推荐
秋邱几秒前
深度解析CANN与AIGC的核心联系:算力底座赋能生成式AI规模化落地
人工智能·aigc
一枕眠秋雨>o<2 分钟前
数学的底座:ops-math如何为AI计算注入确定性
人工智能
Henry-SAP5 分钟前
SAP(ERP)主要生产计划(MPS)业务视角解析
人工智能
猫头虎9 分钟前
2026年AI产业13大趋势预测:Vibe Coding创作者经济元年到来,占冰强专家解读AIGC未来图景
人工智能·开源·prompt·aigc·ai编程·远程工作·agi
程序员清洒9 分钟前
CANN模型部署:从云端到端侧的全场景推理优化实战
大数据·人工智能
deephub9 分钟前
LLM推理时计算技术详解:四种提升大模型推理能力的方法
人工智能·深度学习·大语言模型·推理时计算
lili-felicity13 分钟前
CANN性能调优与实战问题排查:从基础优化到排障工具落地
开发语言·人工智能
User_芊芊君子16 分钟前
HCCL高性能通信库编程指南:构建多卡并行训练系统
人工智能·游戏·ai·agent·测评
冻感糕人~17 分钟前
【珍藏必备】ReAct框架实战指南:从零开始构建AI智能体,让大模型学会思考与行动
java·前端·人工智能·react.js·大模型·就业·大模型学习
hopsky19 分钟前
openclaw AI 学会操作浏览器抓取数据
人工智能