# Transformer self-attention demo.
# NOTE: the original first line was the bare word `transformer`, which is a
# NameError at runtime — it is kept here as a comment instead.

import numpy as np

# 3 tokens, each embedded as a 4-dimensional vector (rows = tokens).
X = np.array([
    [1, 0, 1, 0],
    [0, 2, 0, 2],
    [1, 1, 1, 1]
])

# Random projection matrices mapping 4-dim embeddings into 2-dim
# query/key/value spaces. A seeded Generator (modern NumPy RNG API)
# makes the demo reproducible across runs, unlike np.random.rand.
rng = np.random.default_rng(0)
W_q = rng.random((4, 2))
W_k = rng.random((4, 2))
W_v = rng.random((4, 2))

# Project the input tokens into queries, keys, and values.
Q = X @ W_q
K = X @ W_k
V = X @ W_v

# Raw attention scores: dot-product similarity of every query with
# every key -> (3, 3) matrix of token-to-token scores.
attention_scores = Q @ K.T

# Scale by sqrt(d_k) so score magnitude stays stable as the key
# dimension grows (the "scaled" in scaled dot-product attention).
dk = Q.shape[-1]
attention_scores = attention_scores / np.sqrt(dk)

def softmax(x):
    """Row-wise softmax along the last axis.

    Subtracts each row's maximum before exponentiating, a standard
    trick that prevents overflow without changing the result.
    """
    shifted = x - x.max(axis=-1, keepdims=True)
    exp_vals = np.exp(shifted)
    return exp_vals / np.sum(exp_vals, axis=-1, keepdims=True)

# Turn the scaled scores into per-token probability distributions,
# then blend the value vectors according to those weights.
attention_weights = softmax(attention_scores)
output = attention_weights @ V

for label, value in (
    ("Input X:\n", X),
    ("\nAttention Weights:\n", attention_weights),
    ("\nOutput:\n", output),
):
    print(label, value)