For my curated collection of 1000+ interview questions, see
LLM Interview Questions Summary - CSDN Blog
or
https://gitee.com/lilitom/ai_interview_questions/blob/master/README.md
It is best to copy the URL into your browser and open it there; otherwise the link may not open directly.
Now let's tackle today's question:
Which large language models use pre-norm, and which use post-norm?
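Before digging into the library code, here is a minimal sketch of the two layouts (my own illustration, not taken from any model's source); `sublayer` stands for either self-attention or the MLP:

```python
import torch
from torch import nn


def pre_norm_block(x: torch.Tensor, sublayer: nn.Module, norm: nn.Module) -> torch.Tensor:
    # Pre-norm: normalize the sublayer *input*, then add the residual.
    return x + sublayer(norm(x))


def post_norm_block(x: torch.Tensor, sublayer: nn.Module, norm: nn.Module) -> torch.Tensor:
    # Post-norm: run the sublayer on the raw input, add the residual,
    # then normalize the *sum*.
    return norm(x + sublayer(x))
```

LLaMA and Qwen follow the first pattern; BERT follows the second, as the excerpts below show.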
All of the code below is excerpted from the Hugging Face transformers library.
- LLaMA

The code is as follows; note the key comments:
```python
class LlamaDecoderLayer(nn.Module):
    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = ...  # attention module construction elided in this excerpt
        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        # ... remaining arguments elided ...
    ):
        residual = hidden_states
        # normalize the input first (pre-norm)
        hidden_states = self.input_layernorm(hidden_states)
        # compute self-attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(...)  # arguments elided
        hidden_states = residual + hidden_states

        # norm first, then the MLP, then add the residual
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        if use_cache:
            outputs += (present_key_value,)
        return outputs
```
This is pre-norm: the normalization (RMSNorm here) is applied to each sublayer's input, inside the residual branch, before the residual addition.
- Qwen

The code is as follows; note the key comments:
```python
class Qwen2DecoderLayer(nn.Module):
    def __init__(self, config: Qwen2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        # ... remaining arguments elided ...
    ):
        residual = hidden_states
        # normalize the input first (pre-norm)
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(...)  # arguments elided
        hidden_states = residual + hidden_states

        # Fully Connected: norm first, then the MLP, then add the residual
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        if use_cache:
            outputs += (present_key_value,)
        return outputs
```
Same as LLaMA: Qwen is pre-norm.
- BERT

The code is as follows; note the key comments:
```python
class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # add the residual first, THEN normalize (post-norm)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # add the residual first, THEN normalize (post-norm)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
```
BertSelfOutput is applied right after the self-attention computation (and BertOutput right after the feed-forward layer): the sublayer output is first added to the input, and only then passed through the LayerNorm. Therefore BERT is post-norm.
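If you want to double-check this yourself, you can print the layer structure with transformers. A minimal sketch, assuming the public checkpoints `bert-base-uncased` and `Qwen/Qwen2-0.5B` are available (the exact checkpoint names are my assumption; any BERT or Qwen2 checkpoint works):

```python
from transformers import AutoModel

# BERT: each encoder layer holds nn.LayerNorm inside BertSelfOutput / BertOutput,
# which (as the excerpt above shows) runs AFTER the residual addition -> post-norm.
bert = AutoModel.from_pretrained("bert-base-uncased")
print(bert.encoder.layer[0])

# Qwen2: each decoder layer holds input_layernorm / post_attention_layernorm (RMSNorm),
# which the forward pass applies to the sublayer inputs BEFORE the residual addition -> pre-norm.
qwen = AutoModel.from_pretrained("Qwen/Qwen2-0.5B")
print(qwen.layers[0])
```

Printing only shows which normalization modules each layer contains; the order in which they are applied is what the forward methods excerpted above make explicit.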