001: Tokenization, Causal Masking, and Attention Pass by Hand¶
This notebook is a first-principles introduction to three foundational ideas behind decoder-only LLMs such as Mistral 7B.
- Tokenization
- Causal masking
- One self-attention pass by hand
The goal is to understand the pipeline:
text → tokens → embeddings → attention scores → causal mask → attention weights → mixed output
The math has been kept tiny and intentionally simplified so the mechanism is easy to see.
§ Learning goals¶
- why an LLM reads token IDs, not raw text
- how tokens become embedding vectors
- how Q, K, V are used to compute attention
- what the causal mask does and why decoder-only models need it
- how attention creates a weighted blend of value vectors
§ Tokenization¶
A language model does not read raw text directly.
It reads a sequence of tokens.
A token is usually not exactly the same thing as a word. It can be:
- a whole word
- part of a word
- punctuation
- whitespace-related pieces
- bytes for unusual text
In this notebook we will use a trivial tokenizer where each word is one token.
Why tokenization matters¶
Because the model's actual input is not English. It is token IDs.
This affects:
- context length
- cost
- handling of rare words
- spelling mistakes
- code and numbers
- multilingual text
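To see why rare words and spelling variants matter, here is a quick sketch (reusing the same toy vocabulary defined below; `tokenize_words` is a hypothetical helper, not part of any real tokenizer API). A word-level tokenizer simply has no ID for any word it has never seen, which is one motivation for the subword tokenizers real models use:

```python
toy_vocab = {"I": 1, "like": 2, "tea": 3}

def tokenize_words(sentence, vocab):
    # vocab.get returns None for any out-of-vocabulary word
    return [vocab.get(word) for word in sentence.split()]

print(tokenize_words("I like tea", toy_vocab))   # [1, 2, 3]
print(tokenize_words("I like teas", toy_vocab))  # [1, 2, None] -- "teas" is unknown
```

A subword tokenizer would instead split "teas" into known pieces (for example "tea" + "s"), so it never produces an unknown token.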
A useful sentence to remember:
The model never sees
I like tea as letters.
It sees a sequence of token IDs.
sentence = "I like tea"
toy_vocab = {"I": 1, "like": 2, "tea": 3}
tokens = [toy_vocab[word] for word in sentence.split()]
print("Sentence:", sentence)
print("Token IDs:", tokens)
Sentence: I like tea
Token IDs: [1, 2, 3]
§ Turn tokens into vectors¶
Models work with vectors, not IDs.
So each token gets mapped to an embedding vector.
We choose 2-dimensional embeddings to keep the math small.
Let's assume, for the sake of simplicity:
- I → [1, 0]
- like → [0, 1]
- tea → [1, 1]
So our input matrix $X$ is
$$ X = \begin{bmatrix} 1 & 0 \\ 0 & 1 \\ 1 & 1 \end{bmatrix} $$
Each row is one token:
- row 1 = I
- row 2 = like
- row 3 = tea
import numpy as np
X = np.array([
    [1, 0],  # I
    [0, 1],  # like
    [1, 1],  # tea
], dtype=float)
print("Input embedding matrix X:")
print(X)
Input embedding matrix X:
[[1. 0.]
 [0. 1.]
 [1. 1.]]
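The matrix above was written out by hand, but in a real model $X$ is produced by an embedding lookup: the token IDs index rows of a learned embedding table. A minimal sketch of that lookup, with a hypothetical table whose rows are the toy vectors chosen above:

```python
import numpy as np

# Hypothetical embedding table: row i holds the vector for token ID i.
# Row 0 is left unused so the toy IDs 1..3 index directly into the table.
embedding_table = np.array([
    [0, 0],  # id 0: unused
    [1, 0],  # id 1: "I"
    [0, 1],  # id 2: "like"
    [1, 1],  # id 3: "tea"
], dtype=float)

token_ids = [1, 2, 3]
X_lookup = embedding_table[token_ids]  # fancy indexing: one row per token
print(X_lookup)
```

In a trained model the table entries are learned parameters, not hand-picked values.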
§ Create Q, K, V¶
Attention uses three versions of the input $X$:
- $Q$ (query) representing what a token is looking for
- $K$ (key) representing what a token offers for matching
- $V$ (value) representing the information a token contributes if it is attended to
These are internal attention roles inside the model.
They do not mean:
- the human user’s query to the LLM
- a database query
- a search query
Instead, at a given layer, each token in the sequence produces its own query, key, and value vectors.
A helpful way to think about it is:
- the query says: “what kind of information would help me right now?”
- the key says: “what kind of information do I contain?”
- the value says: “if another token attends to me, this is the information I pass along”
So if token $i$ is being processed, its query is compared against the keys of other tokens. Stronger matches lead to higher attention weights, and then those weights are used to mix the value vectors.
Normally,
$$ Q = XW_Q,\quad K = XW_K,\quad V = XW_V $$
where $W_Q, W_K, W_V$ are projection matrices.
§ Projection matrices¶
A projection matrix is a learned weight matrix that transforms the input token representations into the form needed for attention.
They start as randomly initialized weights.
During training on a large corpus, the model repeatedly
- reads sequences of tokens
- makes next-token predictions
- measures the prediction error using a loss function
- uses backpropagation and gradient descent to update its weights
That update process changes all the model’s learnable parameters, including the projection matrices.
Over many training steps, the model learns projection matrices that make attention useful for predicting the next token.
So the training timeline is
- initialize $W_Q, W_K, W_V$ randomly
- train on many sequences of text
- compute prediction loss
- update the matrices through backpropagation
- end up with learned projections that work well
After training, these learned matrices are fixed for inference. When a user sends a prompt, the model uses the learned $W_Q, W_K, W_V$ to process the prompt tokens, but it is no longer updating them.
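Before simplifying, here is a hedged sketch of what the general projection step looks like. The matrices below are random stand-ins for the learned $W_Q, W_K, W_V$ (in a real model their values come out of training, not a random number generator):

```python
import numpy as np

rng = np.random.default_rng(0)

X = np.array([[1, 0], [0, 1], [1, 1]], dtype=float)
d_model = X.shape[1]

# Hypothetical stand-ins for the learned projection matrices.
W_Q = rng.normal(size=(d_model, d_model))
W_K = rng.normal(size=(d_model, d_model))
W_V = rng.normal(size=(d_model, d_model))

# Q = X W_Q, K = X W_K, V = X W_V
Q, K, V = X @ W_Q, X @ W_K, X @ W_V
print(Q.shape, K.shape, V.shape)  # each is (3, 2): one row per token
```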
To keep it simple, let's choose
$$ W_Q = W_K = W_V = I $$
So,
$$ Q = K = V = X $$
This is not realistic, but it makes the first attention pass easy to inspect.
During training:
- the model computes $Q$, $K$, and $V$ for tokens in training sequences
- the projection matrices are updated through learning
During inference:
- the model still computes $Q$, $K$, and $V$ in the same way
- but the projection matrices are no longer being learned, only used
Q = X.copy()
K = X.copy()
V = X.copy()
print("Q =")
print(Q)
print("\nK =")
print(K)
print("\nV =")
print(V)
Q =
[[1. 0.]
 [0. 1.]
 [1. 1.]]

K =
[[1. 0.]
 [0. 1.]
 [1. 1.]]

V =
[[1. 0.]
 [0. 1.]
 [1. 1.]]
§ Compute raw attention scores¶
The word self in self-attention means that queries, keys, and values all come from the same input sequence, so each token can look at other tokens in that same sequence.
The attention score matrix is
$$ \text{scores} = \frac{QK^T}{\sqrt{d_k}} $$
Here $d_k = 2$, so we divide by
$$ \sqrt{2} \approx 1.414 $$
Because $Q = K = X$, we first compute
$$ QK^T = \begin{bmatrix} 1 & 0 \\ 0 & 1 \\ 1 & 1 \end{bmatrix} \begin{bmatrix} 1 & 0 & 1 \\ 0 & 1 & 1 \end{bmatrix} = \begin{bmatrix} 1 & 0 & 1 \\ 0 & 1 & 1 \\ 1 & 1 & 2 \end{bmatrix} $$
Then divide by $\sqrt{2}$
$$ S = \frac{QK^T}{\sqrt{2}} \approx \begin{bmatrix} 0.707 & 0 & 0.707 \\ 0 & 0.707 & 0.707 \\ 0.707 & 0.707 & 1.414 \end{bmatrix} $$
Interpretation:
- row 1 = what token 1 wants to look at
- row 2 = what token 2 wants to look at
- row 3 = what token 3 wants to look at
d_k = Q.shape[1]
raw_scores = (Q @ K.T) / np.sqrt(d_k)
print("Raw attention scores S = QK^T / sqrt(d_k):")
print(np.round(raw_scores, 3))
Raw attention scores S = QK^T / sqrt(d_k):
[[0.707 0.    0.707]
 [0.    0.707 0.707]
 [0.707 0.707 1.414]]
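Why divide by $\sqrt{d_k}$ at all? The dot product of two random unit-variance vectors has variance $d_k$, so its typical magnitude grows like $\sqrt{d_k}$; without scaling, large dimensions would push the softmax into saturation. A small sketch (assuming Gaussian query/key entries, which is an illustration rather than a property of trained models):

```python
import numpy as np

rng = np.random.default_rng(0)

# For each dimension, sample many random query/key pairs and compare the
# spread of raw dot products with the spread after dividing by sqrt(d_k).
for d_k in (2, 64, 1024):
    q = rng.normal(size=(10000, d_k))
    k = rng.normal(size=(10000, d_k))
    dots = np.sum(q * k, axis=1)  # one dot product per pair
    print(d_k, round(dots.std(), 2), round((dots / np.sqrt(d_k)).std(), 2))
```

The raw spread grows with $d_k$ while the scaled spread stays near 1, which keeps the scores in a range where softmax still produces usefully graded weights.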
§ Apply the causal mask¶
This is the key idea in decoder-only LLMs.
When predicting the token at position $t$, the model is allowed to see:
- earlier tokens
- itself
It is not allowed to see future tokens.
So:
- token 1 cannot look at tokens 2 or 3
- token 2 cannot look at token 3
The causal mask for length 3 is
$$ M = \begin{bmatrix} 0 & -\infty & -\infty \\ 0 & 0 & -\infty \\ 0 & 0 & 0 \end{bmatrix} $$
Add this mask to the score matrix
$$ S_{\text{masked}} = \begin{bmatrix} 0.707 & -\infty & -\infty \\ 0 & 0.707 & -\infty \\ 0.707 & 0.707 & 1.414 \end{bmatrix} $$
Intuition¶
Without this mask, token 1 could "cheat" by seeing token 3.
That would break next-token prediction training.
seq_len = raw_scores.shape[0]
mask = np.triu(np.full((seq_len, seq_len), -np.inf), k=1)
masked_scores = raw_scores + mask
print("Causal mask:")
print(mask)
print("\nMasked scores:")
print(masked_scores)
Causal mask:
[[  0. -inf -inf]
 [  0.   0. -inf]
 [  0.   0.   0.]]

Masked scores:
[[0.70710678       -inf       -inf]
 [0.         0.70710678       -inf]
 [0.70710678 0.70710678 1.41421356]]
§ Softmax each row¶
Now convert masked scores into probabilities:
$$ A = \text{softmax}(S_{\text{masked}}) $$
Row 1¶
$$ [0.707, -\infty, -\infty] \rightarrow [1, 0, 0] $$
Token 1 can only attend to itself.
Row 2¶
$$ [0, 0.707, -\infty] $$
Exponentials
- $e^0 = 1$
- $e^{0.707} \approx 2.03$
Sum
$$ 1 + 2.03 = 3.03 $$
Normalize
$$ \left[\frac{1}{3.03}, \frac{2.03}{3.03}, 0\right] \approx [0.33, 0.67, 0] $$
Row 3¶
$$ [0.707, 0.707, 1.414] $$
Exponentials
- $e^{0.707} \approx 2.03$
- $e^{0.707} \approx 2.03$
- $e^{1.414} \approx 4.11$
Sum
$$ 2.03 + 2.03 + 4.11 = 8.17 $$
Normalize
$$ \left[ \frac{2.03}{8.17}, \frac{2.03}{8.17}, \frac{4.11}{8.17} \right] \approx [0.25, 0.25, 0.50] $$
So the attention matrix is approximately
$$ A \approx \begin{bmatrix} 1 & 0 & 0 \\ 0.33 & 0.67 & 0 \\ 0.25 & 0.25 & 0.50 \end{bmatrix} $$
def row_softmax(x):
    # stable softmax, safe with -inf in masked positions
    x = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
A = row_softmax(masked_scores)
print("Attention weights A:")
print(np.round(A, 3))
Attention weights A:
[[1.    0.    0.   ]
 [0.33  0.67  0.   ]
 [0.248 0.248 0.503]]
§ Multiply attention weights by $V$¶
Computing the output
$$ O = AV $$
Recall
$$ V = \begin{bmatrix} 1 & 0 \\ 0 & 1 \\ 1 & 1 \end{bmatrix} $$
Output for row 1
$$ 1[1,0] + 0[0,1] + 0[1,1] = [1,0] $$
Output for row 2
$$ 0.33[1,0] + 0.67[0,1] + 0[1,1] = [0.33, 0.67] $$
Output for row 3
$$ 0.25[1,0] + 0.25[0,1] + 0.50[1,1] $$
$$ = [0.25,0] + [0,0.25] + [0.50,0.50] = [0.75, 0.75] $$
So,
$$ O \approx \begin{bmatrix} 1 & 0 \\ 0.33 & 0.67 \\ 0.75 & 0.75 \end{bmatrix} $$
This is the attention output.
O = A @ V
print("Attention output O = A @ V:")
print(np.round(O, 3))
Attention output O = A @ V:
[[1.    0.   ]
 [0.33  0.67 ]
 [0.752 0.752]]
§ Summary¶
Each token started as its own vector.
After attention,
- token 1 stayed the same
- token 2 became a mix of token 1 and token 2
- token 3 became a mix of all three tokens
That is the heart of self-attention: each token builds a new representation by mixing information from allowed tokens.
And the causal mask ensures that only the past and present can contribute, never the future.
§ Intuition¶
The process so far in plain English:
- Tokenization turns text into discrete units.
- Embeddings turn those units into vectors.
- Q and K decide who should pay attention to whom.
- The causal mask blocks future information.
- Softmax turns raw scores into attention weights.
- Those weights mix the value vectors into a contextual representation.
That contextual representation then goes through more layers, and eventually a final linear layer produces logits for the next token.
§ Learnings so far¶
- tokenization
- decoder-only causal masking
- self-attention
- contextual mixing
Later, when we look into Mistral specifically, we can read about:
- multi-head attention
- grouped-query attention
- rotary position embeddings
- feed-forward layers
- normalization
- sliding-window attention
§ Main concept¶
For a decoder-only LLM:
$$ \text{Attention}(Q,K,V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}} + \text{causal mask}\right)V $$
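The whole pipeline above collapses into one function. This is a minimal single-head sketch of the formula (no batching, no multiple heads, none of the numerical refinements real implementations add), and it reproduces the output $O$ computed step by step earlier:

```python
import numpy as np

def causal_attention(Q, K, V):
    """Scaled dot-product attention with a causal mask (single head)."""
    d_k = Q.shape[-1]
    scores = (Q @ K.T) / np.sqrt(d_k)
    # upper-triangular -inf mask blocks attention to future positions
    mask = np.triu(np.full(scores.shape, -np.inf), k=1)
    scores = scores + mask
    # row-wise stable softmax; exp(-inf) = 0 zeroes out masked positions
    scores = scores - scores.max(axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)
    return weights @ V

X = np.array([[1, 0], [0, 1], [1, 1]], dtype=float)
print(np.round(causal_attention(X, X, X), 3))
```

With $Q = K = V = X$ this returns the same matrix as the hand calculation: token 1 unchanged, token 2 a blend of tokens 1 and 2, token 3 a blend of all three.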
§ Upshot¶
A decoder-only language model predicts the next token by building contextual representations of the tokens it has already seen.
The key mechanism is self-attention, and the key restriction is the causal mask.
This notebook explains the forward-pass attention mechanics used in both training and inference.
During training, these computations are used while learning next-token prediction.
During inference, the same computations are used to process the prompt and generate the next token.
The causal mask enforces left-to-right behavior in both cases.
Published: Mar 17, 2026