# Transformer架构图详解
## 一、整体架构概览
Transformer模型架构是一种革命性的神经网络结构,完全基于注意力机制构建,摒弃了传统的循环神经网络和卷积神经网络,在自然语言处理领域取得了突破性进展[ref_1]。该架构由编码器和解码器两部分组成,采用对称的堆叠结构设计。
### 核心组件对比表
| 组件类型 | 编码器部分 | 解码器部分 | 主要功能 |
|---------|-----------|-----------|---------|
| 注意力机制 | Multi-Head Attention | Masked Multi-Head Attention + Multi-Head Attention(编码器-解码器注意力) | 处理序列关系 |
| 连接方式 | Add & Norm × 2 | Add & Norm × 3 | 稳定训练过程 |
| 前馈网络 | Feed Forward | Feed Forward | 非线性变换 |
| 堆叠层数 | N个相同层 | N个相同层 | 深度特征提取 |
## 二、编码器结构深度解析
### 2.1 编码器层级组成
编码器由N个完全相同的层堆叠而成,每一层都包含以下核心组件:
```python
import torch.nn as nn


class EncoderLayer(nn.Module):
    """Single Transformer encoder layer: self-attention, then feed-forward.

    Each sub-layer's output is merged back into its input by an
    ``AddAndNorm`` module (residual connection followed by layer
    normalization).

    Args:
        d_model: Width of the token representations.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout_rate: Dropout probability forwarded to the feed-forward
            sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        # Subclassing nn.Module (and calling its initializer) is what makes
        # the layer callable as ``layer(x, mask)`` and lets nn.ModuleList
        # track its parameters.
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff, dropout_rate)
        self.add_norm_1 = AddAndNorm(d_model)
        self.add_norm_2 = AddAndNorm(d_model)

    def forward(self, x, mask=None):
        """Run one encoder layer.

        Args:
            x: Input sequence representations; query, key and value of the
                self-attention are all taken from it.
            mask: Optional attention mask handed unchanged to the attention
                module.

        Returns:
            The layer output, produced by the second ``AddAndNorm``.
        """
        # Self-attention: query, key and value all come from ``x``.
        attention_output = self.multi_head_attention(x, x, x, mask)
        # First residual connection + layer normalization.
        x = self.add_norm_1(x, attention_output)
        # Position-wise feed-forward network.
        ff_output = self.feed_forward(x)
        # Second residual connection + layer normalization.
        return self.add_norm_2(x, ff_output)
```
### 2.2 多头注意力机制实现
多头注意力是Transformer的核心创新,它允许模型同时关注来自不同位置的不同表示子空间的信息[ref_2]。
```python
import torch
import torch.nn as nn
import math
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into
    ``num_heads`` heads of width ``d_model // num_heads``, attended
    independently, concatenated and projected once more.

    Mask convention: nonzero / ``True`` entries mark positions that must
    NOT be attended to. This matches the masks built by
    ``create_padding_mask`` and ``create_look_ahead_mask`` in this file.

    Args:
        d_model: Width of the input and output representations.
        num_heads: Number of attention heads.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            # Otherwise the head split in ``forward`` would fail on ``view``
            # with a much less helpful message.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        # Learned projections for queries, keys, values and the output.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            q: Query tensor whose last dimension is ``d_k``.
            k: Key tensor whose last dimension is ``d_k``.
            v: Value tensor.
            mask: Optional tensor broadcastable to the score matrix;
                nonzero / ``True`` entries are excluded from attention.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            blocked = mask.to(device=scores.device, dtype=torch.bool)
            # Use the dtype's most negative finite value so the fill is also
            # representable in half precision (a literal -1e9 is not).
            scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)
        # Normalize over the key dimension.
        attention_weights = torch.softmax(scores, dim=-1)
        output = torch.matmul(attention_weights, v)
        return output, attention_weights

    def forward(self, q, k, v, mask=None):
        """Apply multi-head attention.

        Args:
            q: Query input; its first dimension is the batch size and its
                last dimension is ``d_model``.
            k: Key input with last dimension ``d_model``.
            v: Value input with last dimension ``d_model``.
            mask: Optional mask, see ``scaled_dot_product_attention``.

        Returns:
            Tensor reshaped to ``(batch, -1, d_model)`` after the output
            projection.
        """
        batch_size = q.size(0)
        # Project, then reshape to (batch, heads, seq, d_k).
        q = self.w_q(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        # The attention weights are computed but not returned by forward.
        attention_output, _ = self.scaled_dot_product_attention(q, k, v, mask)
        # Merge heads back; ``contiguous`` is needed before ``view`` because
        # ``transpose`` returns a non-contiguous tensor.
        attention_output = attention_output.transpose(1, 2).contiguous().view(
            batch_size, -1, self.d_model
        )
        return self.w_o(attention_output)
```
## 三、解码器架构详细分析
### 3.1 解码器特殊设计
解码器同样由N个相同层堆叠,但在设计上有关键差异:每层先执行带前瞻掩码的自注意力,再增加一个以编码器输出作为键和值的编码器-解码器注意力子层,因此每层共有三个子层:
```python
class DecoderLayer(nn.Module):
    """Single Transformer decoder layer with three sub-layers.

    The sub-layers are masked self-attention, encoder-decoder attention and
    a feed-forward network. Each is followed by an ``AddAndNorm`` module
    (residual connection followed by layer normalization).

    Args:
        d_model: Width of the token representations.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout_rate: Dropout probability forwarded to the feed-forward
            sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        # Subclassing nn.Module (and calling its initializer) is what makes
        # the layer callable and lets nn.ModuleList track its parameters.
        super().__init__()
        self.masked_multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff, dropout_rate)
        self.add_norm_1 = AddAndNorm(d_model)
        self.add_norm_2 = AddAndNorm(d_model)
        self.add_norm_3 = AddAndNorm(d_model)

    def forward(self, x, encoder_output, look_ahead_mask=None, padding_mask=None):
        """Run one decoder layer.

        Args:
            x: Decoder-side input representations.
            encoder_output: Output of the encoder stack, used as key and
                value of the encoder-decoder attention.
            look_ahead_mask: Optional mask passed to the self-attention so a
                position cannot see later positions.
            padding_mask: Optional mask passed to the encoder-decoder
                attention.

        Returns:
            The layer output, produced by the third ``AddAndNorm``.
        """
        # Masked self-attention (prevents leaking future tokens).
        masked_attention_output = self.masked_multi_head_attention(
            x, x, x, look_ahead_mask
        )
        x = self.add_norm_1(x, masked_attention_output)
        # Encoder-decoder attention: queries from the decoder, keys and
        # values from the encoder output.
        attention_output = self.multi_head_attention(
            x, encoder_output, encoder_output, padding_mask
        )
        x = self.add_norm_2(x, attention_output)
        # Position-wise feed-forward network.
        ff_output = self.feed_forward(x)
        return self.add_norm_3(x, ff_output)
```
### 3.2 掩码机制详解
解码器中的掩码机制是确保自回归生成的关键:
```python
def create_masks(input_seq, target_seq, pad_token=0):
    """Build the encoder padding mask and the combined decoder mask.

    Both returned masks use nonzero / ``True`` to mark positions that
    should NOT be attended to.

    Args:
        input_seq: Source token ids.
        target_seq: Target token ids; ``target_seq.size(1)`` is taken as the
            target sequence length.
        pad_token: Id of the padding token.

    Returns:
        Tuple ``(encoder_padding_mask, combined_mask)``. ``combined_mask``
        is the element-wise maximum of the decoder padding mask and the
        look-ahead mask, so a position is blocked if either mask blocks it.
    """
    # Padding mask for the encoder input.
    encoder_padding_mask = create_padding_mask(input_seq, pad_token)
    # Padding mask for the decoder input.
    decoder_padding_mask = create_padding_mask(target_seq, pad_token)
    # Look-ahead mask, moved to the target's device so the combination below
    # does not mix devices.
    look_ahead_mask = create_look_ahead_mask(target_seq.size(1)).to(target_seq.device)
    # Cast the boolean padding mask explicitly instead of relying on
    # implicit type promotion between bool and float.
    combined_mask = torch.max(
        decoder_padding_mask.to(look_ahead_mask.dtype), look_ahead_mask
    )
    return encoder_padding_mask, combined_mask
def create_look_ahead_mask(size, device=None):
    """Create a strictly upper-triangular look-ahead mask.

    Entry ``(i, j)`` is 1 when ``j > i`` (a future position) and 0
    otherwise.

    Args:
        size: Sequence length; the mask has shape ``(size, size)``.
        device: Optional device on which to allocate the mask. ``None``
            uses PyTorch's default device.

    Returns:
        Float tensor of shape ``(size, size)``.
    """
    return torch.triu(torch.ones(size, size, device=device), diagonal=1)
def create_padding_mask(seq, pad_token):
    """Create a boolean mask that is ``True`` at padding positions.

    Args:
        seq: Tensor of token ids.
        pad_token: Id of the padding token.

    Returns:
        Boolean tensor equal to ``seq == pad_token`` with two singleton
        dimensions inserted at positions 1 and 2, so that a
        ``(batch, seq_len)`` input yields ``(batch, 1, 1, seq_len)``.
    """
    is_pad = seq == pad_token
    # The inserted axes let the mask broadcast over heads and query
    # positions of an attention score tensor.
    return is_pad.unsqueeze(1).unsqueeze(2)
```
## 四、位置编码与前馈网络
### 4.1 位置编码实现
由于Transformer没有循环和卷积结构,需要位置编码来注入序列位置信息[ref_3]:
```python
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even embedding dimensions hold ``sin(pos * w_i)`` and odd dimensions
    hold ``cos(pos * w_i)``, with ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: Width of the embeddings (odd values are supported).
        max_len: Largest sequence length for which encodings are
            precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # even embedding dimensions
        # For odd d_model there is one fewer odd dimension than even ones,
        # so drop the surplus frequency column before assigning.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd dimensions
        # A buffer follows the module across devices and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Embedded input whose dimension 1 is the sequence length and
                whose last dimension is ``d_model``.
        """
        return x + self.pe[:, :x.size(1)]
```
### 4.2 前馈神经网络
位置前馈网络在每个位置独立应用相同的全连接层:
```python
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and output.
        d_ff: Width of the hidden layer.
        dropout_rate: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, dropout_rate=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout_rate)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply the two-layer network to the last dimension of ``x``."""
        # Expand to d_ff and apply the non-linearity.
        hidden = self.activation(self.linear1(x))
        # Regularize the hidden activations.
        hidden = self.dropout(hidden)
        # Project back to d_model.
        return self.linear2(hidden)
```
## 五、残差连接与层归一化
### 5.1 稳定训练的关键技术
```python
class AddAndNorm(nn.Module):
    """Residual connection with dropout, followed by layer normalization.

    Computes ``LayerNorm(x + Dropout(sublayer_output))``.

    Args:
        d_model: Size of the normalized (last) dimension.
        eps: Numerical-stability constant passed to ``nn.LayerNorm``.
        dropout_rate: Dropout probability applied to the sub-layer output.
    """

    def __init__(self, d_model, eps=1e-6, dropout_rate=0.1):
        super().__init__()
        # Pass ``eps`` through; otherwise nn.LayerNorm silently falls back to
        # its own default and this argument has no effect.
        self.layer_norm = nn.LayerNorm(d_model, eps=eps)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, sublayer_output):
        """Combine a sub-layer's input ``x`` with its output and normalize."""
        return self.layer_norm(x + self.dropout(sublayer_output))
```
## 六、完整Transformer架构实现
```python
class Transformer(nn.Module):
    """Encoder-decoder Transformer assembled from the layers in this file.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary; also the width of the
            output projection.
        d_model: Width of the embeddings and of every layer.
        num_heads: Number of attention heads per attention module.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_len: Maximum sequence length covered by the positional encoding.
        dropout_rate: Dropout probability applied to the embedded inputs and
            forwarded to every layer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8,
                 num_layers=6, d_ff=2048, max_len=5000, dropout_rate=0.1):
        super().__init__()
        # Token embeddings for the source and target vocabularies.
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        # One positional encoding shared by the source and target sides.
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        # Encoder stack.
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout_rate)
            for _ in range(num_layers)
        ])
        # Decoder stack.
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout_rate)
            for _ in range(num_layers)
        ])
        # Projection from d_model to target-vocabulary logits.
        self.output_linear = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Encode ``src``, decode ``tgt`` against it, and project the result.

        Args:
            src: Source token ids.
            tgt: Target token ids.
            src_mask: Optional mask given to every encoder layer and, as the
                padding mask, to every decoder layer's encoder-decoder
                attention.
            tgt_mask: Optional mask given to every decoder layer's
                self-attention.

        Returns:
            Output of the final linear projection; no softmax is applied.
        """
        # Encoder: embed, add positions, apply dropout, run the stack.
        encoder_output = self.dropout(
            self.positional_encoding(self.src_embedding(src))
        )
        for layer in self.encoder_layers:
            encoder_output = layer(encoder_output, src_mask)
        # Decoder: same input pipeline, each layer attends to the encoder
        # output.
        decoder_output = self.dropout(
            self.positional_encoding(self.tgt_embedding(tgt))
        )
        for layer in self.decoder_layers:
            decoder_output = layer(decoder_output, encoder_output, tgt_mask, src_mask)
        # Project to vocabulary size.
        return self.output_linear(decoder_output)
```
## 七、架构特点与应用场景
Transformer架构的设计体现了多个重要理念:通过自注意力机制实现全局依赖建模,利用多头注意力捕捉不同类型的关联关系,借助位置编码弥补序列位置信息的缺失,采用残差连接和层归一化确保深层网络的稳定训练[ref_1][ref_2][ref_3]。这种架构不仅在机器翻译任务中表现出色,更为后续的BERT、GPT等预训练语言模型奠定了坚实基础,成为现代自然语言处理的基石架构。