Transformer——多头注意力机制(PyTorch)
创始人
2025-01-11 08:34:26
0

1. 原理图

2. 代码

import torch
import torch.nn as nn


class Multi_Head_Self_Attention(nn.Module):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each passed through a bias-free
    linear projection, split into ``heads`` slices of width ``head_dim``,
    attended independently per head, concatenated back to ``embed_size``
    and passed through a final bias-free output projection.

    Args:
        embed_size: Width of the input and output embeddings.
        heads: Number of attention heads; must divide ``embed_size`` evenly.

    Raises:
        ValueError: If ``heads`` is not positive or does not divide
            ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        # Fail at construction time: otherwise the mismatch only surfaces in
        # forward() as an opaque reshape error.
        if heads <= 0 or embed_size % heads != 0:
            raise ValueError(
                f"embed_size ({embed_size}) must be divisible by a positive "
                f"number of heads ({heads})"
            )

        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        self.queries = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.keys = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.values = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.fc_out = nn.Linear(self.embed_size, self.embed_size, bias=False)

    def forward(self, queries, keys, values, mask=None):
        """Apply multi-head attention.

        Args:
            queries: Tensor of shape ``(batch, query_len, embed_size)``.
            keys: Tensor of shape ``(batch, key_len, embed_size)``.
            values: Tensor of shape ``(batch, value_len, embed_size)``;
                ``value_len`` has to equal ``key_len`` for the weighted sum.
            mask: Optional tensor broadcastable to
                ``(batch, heads, query_len, key_len)``. Positions where
                ``mask == 0`` are excluded from attention. ``None`` disables
                masking.

        Returns:
            Tensor of shape ``(batch, query_len, embed_size)``.

        Note:
            A query row whose keys are all masked gets a softmax over only
            ``-inf`` scores, which produces NaN for that row.
        """
        N = queries.shape[0]  # batch size
        query_len = queries.shape[1]
        key_len = keys.shape[1]
        value_len = values.shape[1]

        queries = self.queries(queries)
        keys = self.keys(keys)
        values = self.values(values)

        # Split the embedding into heads and move the head axis forward:
        # (batch, seq, embed) -> (batch, seq, heads, head_dim)
        #                     -> (batch, heads, seq, head_dim)
        queries = queries.reshape(
            N, query_len, self.heads, self.head_dim
        ).transpose(1, 2)
        keys = keys.reshape(
            N, key_len, self.heads, self.head_dim
        ).transpose(1, 2)
        values = values.reshape(
            N, value_len, self.heads, self.head_dim
        ).transpose(1, 2)

        # Scaled dot-product scores: (batch, heads, query_len, key_len)
        score = torch.matmul(queries, keys.transpose(-2, -1)) / (
            self.head_dim ** 0.5
        )

        if mask is not None:
            # -inf scores become zero weights after the softmax.
            score = score.masked_fill(mask == 0, float("-inf"))

        attention = torch.softmax(score, dim=-1)

        # (batch, heads, query_len, head_dim)
        out = torch.matmul(attention, values)

        # Merge the heads back so the result can be fed to later layers:
        # (batch, heads, seq, head_dim) -> (batch, seq, heads, head_dim)
        #                               -> (batch, seq, embed)
        out = out.transpose(1, 2).contiguous().reshape(
            N, query_len, self.embed_size
        )
        return self.fc_out(out)


# Demo: run one forward pass on random inputs and print the output shape.
batch_size = 64
sequence_length = 10
embed_size = 512
heads = 8
mask = None

Q = torch.randn(batch_size, sequence_length, embed_size)
K = torch.randn(batch_size, sequence_length, embed_size)
V = torch.randn(batch_size, sequence_length, embed_size)

model = Multi_Head_Self_Attention(embed_size, heads)
output = model(Q, K, V, mask)
print(output.shape)

 

相关内容

热门资讯

苹果版黑科技!德扑ai机器人软... 苹果版黑科技!德扑ai机器人软件开发(透视)太坑了有挂(攻略教程黑科技教程);1、起透看视 德扑ai...
黑科技软件"wepo... 黑科技软件"wepoke透明好友局!外挂透明挂辅助脚本(黑科技)系统教程"一直存在有挂1、构建自己的...
黑科技数据(WEPOke)软件... 黑科技数据(WEPOke)软件透明挂是真的吗(黑科技)软件(竟然真的有挂)1、金币登录送、破产送、升...
黑科技工具!德州poker有外... 黑科技工具!德州poker有外挂吗(wEpOke)透明黑科技辅助脚本(玩家必看科普黑科技教程)1、点...
工具黑科技!红龙扑克外挂(透视... 工具黑科技!红龙扑克外挂(透视)太坑了是真的有挂(2025新版总结黑科技技巧)1、玩家可以在红龙扑克...
黑科技总结"德州之星... 黑科技总结"德州之星软件!外挂透明挂辅助软件(黑科技)透视教程"一直是有挂;小薇(透视辅助)致您一封...
黑科技科技!微扑克有辅助插件吗... 黑科技科技!微扑克有辅助插件吗(WepOke)透明黑科技辅助挂(揭秘一下黑科技方法);人气非常高,a...
科普辅助(微扑克)ai胜率(辅... 科普辅助(微扑克)ai胜率(辅助挂)ai胜率(果然有挂);是一款可以让一直输的玩家,快速成为一个“必...
代打黑科技!wpk外挂(透视)... 代打黑科技!wpk外挂(透视)太坑了真的是有挂(必备教程黑科技介绍);小薇(透视辅助)致您一封信;亲...
黑科技讲解"微扑克系... 黑科技讲解"微扑克系统发牌逻辑!外挂透明挂辅助助手(黑科技)技巧教程"真是存在有挂1、超多福利:超高...