Transformer fundamentals
 
working_gpt Namespace Reference

Classes

class  Block
 Transformer block: communication followed by computation. More...
 
class  FeedFoward
 A simple linear layer followed by a non-linearity. More...
 
class  GPTLanguageModel
 
class  Head
 One head of self-attention (see the sketch after this list). More...
 
class  MultiHeadAttention
 Multiple heads of self-attention in parallel. More...
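
Since the class bodies are not expanded on this page, here is a minimal sketch of what Head plausibly contains: standard scaled dot-product self-attention with a causal mask, sized by the namespace constants n_embd, block_size, and dropout documented below. This illustrates the technique named in the brief description, not necessarily the exact code in working_gpt.py.

import torch
import torch.nn as nn
from torch.nn import functional as F

n_embd, block_size, dropout = 384, 256, 0.2  # namespace constants from this page

class Head(nn.Module):
    # one head of self-attention (sketch, standard formulation)
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask: position t may attend only to positions <= t
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)                                      # (B, T, head_size)
        q = self.query(x)                                    # (B, T, head_size)
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T) scaled scores
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)                                    # (B, T, head_size)
        return wei @ v                                       # (B, T, head_size)

MultiHeadAttention presumably concatenates several such heads and projects back to n_embd, and Block wraps attention ("communication") and FeedFoward ("computation") in residual connections, per the brief descriptions above.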
 

Functions

 get_batch (split)
 
 estimate_loss ()
 

Variables

int batch_size = 64
 
int block_size = 256
 
int max_iters = 5000
 
int eval_interval = 500
 
float learning_rate = 3e-4
 
str device = "mps"
 
int eval_iters = 200
 
int n_embd = 384
 
int n_head = 6
 
int n_layer = 6
 
float dropout = 0.2
 
 encoding
 
 text = f.read()
 
 chars = sorted(list(set(text)))
 
 vocab_size = len(chars)
 
dict stoi = {ch: i for i, ch in enumerate(chars)}
 
dict itos = {i: ch for i, ch in enumerate(chars)}
 
 encode
 
 decode
 
 data = torch.tensor(encode(text), dtype=torch.long)
 
 n = int(0.9 * len(data))
 
 train_data = data[:n]
 
 val_data = data[n:]
 
 model = GPTLanguageModel()
 
 m = model.to(device)
 
 optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 
 losses = estimate_loss()
 
 xb
 
 yb
 
 logits
 
 loss
 
 set_to_none
 
 context = torch.zeros((1, 1), dtype=torch.long, device=device)
 

Function Documentation

◆ estimate_loss()

working_gpt.estimate_loss ( )

Definition at line 66 of file working_gpt.py.

65 @torch.no_grad()
66 def estimate_loss():
67     out = {}
68     model.eval()
69     for split in ["train", "val"]:
70         losses = torch.zeros(eval_iters)
71         for k in range(eval_iters):
72             X, Y = get_batch(split)
73             logits, loss = model(X, Y)
74             losses[k] = loss.item()
75         out[split] = losses.mean()
76     model.train()
77     return out

References get_batch(), and model.
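
A usage sketch: the @torch.no_grad() decorator at line 65 keeps evaluation out of the autograd graph, and the function returns a dict of mean losses keyed by split name. The printed numbers here are illustrative, not measured:

losses = estimate_loss()
print(f"train {losses['train']:.4f}, val {losses['val']:.4f}")  # e.g. train 1.7342, val 1.8891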

◆ get_batch()

working_gpt.get_batch ( split)

Definition at line 55 of file working_gpt.py.

55 def get_batch(split):
56     # generate a small batch of data of inputs x and targets y
57     data = train_data if split == "train" else val_data
58     ix = torch.randint(len(data) - block_size, (batch_size,))
59     x = torch.stack([data[i : i + block_size] for i in ix])
60     y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
61     x, y = x.to(device), y.to(device)
62     return x, y

Referenced by estimate_loss().
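
A shape sketch, using the namespace constants batch_size = 64 and block_size = 256:

xb, yb = get_batch("train")
# xb.shape == yb.shape == (64, 256); yb is xb shifted left by one position,
# so yb[b, t] is the next-character target for the prefix xb[b, : t + 1]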

Variable Documentation

◆ batch_size

int working_gpt.batch_size = 64

Definition at line 6 of file working_gpt.py.

◆ block_size

int working_gpt.block_size = 256

Definition at line 7 of file working_gpt.py.

◆ chars

working_gpt.chars = sorted(list(set(text)))

Definition at line 35 of file working_gpt.py.

◆ context

working_gpt.context = torch.zeros((1, 1), dtype=torch.long, device=device)

Definition at line 250 of file working_gpt.py.
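
context is a (1, 1) tensor holding token id 0, the prompt seed for sampling. A hedged usage sketch, assuming GPTLanguageModel exposes a generate(idx, max_new_tokens) method as scripts of this shape usually do (the method is not documented on this page):

out = m.generate(context, max_new_tokens=500)  # generate() is assumed, not documented here
print(decode(out[0].tolist()))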

◆ data

working_gpt.data = torch.tensor(encode(text), dtype=torch.long)

Definition at line 48 of file working_gpt.py.
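
The whole corpus is encoded once into a single 1-D tensor of token ids; get_batch() then slices random block_size windows out of this tensor instead of re-encoding text during training.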

◆ decode

working_gpt.decode
Initial value:
= lambda l: "".join([itos[i] for i in l])

Definition at line 43 of file working_gpt.py.

◆ device

str working_gpt.device = "mps"

Definition at line 13 of file working_gpt.py.
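
"mps" targets the GPU on Apple-silicon Macs via Metal and is unavailable on other hardware. If portability matters, a common fallback chain using PyTorch's standard availability checks is:

import torch

if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"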

◆ dropout

float working_gpt.dropout = 0.2

Definition at line 25 of file working_gpt.py.

◆ encode

working_gpt.encode
Initial value:
= lambda s: [stoi[c] for c in s]

Definition at line 40 of file working_gpt.py.
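
encode and decode are inverse character-level codecs built from stoi and itos, so they round-trip any string whose characters occur in the training text:

ids = encode("hello")          # the ids depend on the corpus vocabulary; illustrative only
assert decode(ids) == "hello"  # characters absent from the corpus raise KeyError in encode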

◆ encoding

working_gpt.encoding

Definition at line 31 of file working_gpt.py.

◆ eval_interval

int working_gpt.eval_interval = 500

Definition at line 9 of file working_gpt.py.

◆ eval_iters

int working_gpt.eval_iters = 200

Definition at line 21 of file working_gpt.py.

◆ itos

dict working_gpt.itos = {i: ch for i, ch in enumerate(chars)}

Definition at line 39 of file working_gpt.py.

◆ learning_rate

float working_gpt.learning_rate = 3e-4

Definition at line 10 of file working_gpt.py.

◆ logits

working_gpt.logits

Definition at line 244 of file working_gpt.py.

◆ loss

working_gpt.loss

Definition at line 244 of file working_gpt.py.

◆ losses

working_gpt.losses = estimate_loss()

Definition at line 235 of file working_gpt.py.

◆ m

working_gpt.m = model.to(device)

Definition at line 224 of file working_gpt.py.

◆ max_iters

int working_gpt.max_iters = 5000

Definition at line 8 of file working_gpt.py.

◆ model

working_gpt.model = GPTLanguageModel()

Definition at line 223 of file working_gpt.py.

Referenced by estimate_loss().

◆ n

working_gpt.n = int(0.9 * len(data))

Definition at line 49 of file working_gpt.py.

◆ n_embd

int working_gpt.n_embd = 384

Definition at line 22 of file working_gpt.py.

◆ n_head

int working_gpt.n_head = 6

Definition at line 23 of file working_gpt.py.
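
With n_embd = 384 shared across n_head = 6 heads, each head works in 384 // 6 = 64 dimensions; in the usual multi-head arrangement, the six 64-dimensional head outputs are concatenated back to the 384-dimensional residual stream.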

◆ n_layer

int working_gpt.n_layer = 6

Definition at line 24 of file working_gpt.py.

◆ optimizer

working_gpt.optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

Definition at line 229 of file working_gpt.py.

◆ set_to_none

working_gpt.set_to_none

Definition at line 245 of file working_gpt.py.

◆ stoi

dict working_gpt.stoi = {ch: i for i, ch in enumerate(chars)}

Definition at line 38 of file working_gpt.py.

◆ text

working_gpt.text = f.read()

Definition at line 32 of file working_gpt.py.

◆ train_data

working_gpt.train_data = data[:n]

Definition at line 50 of file working_gpt.py.

◆ val_data

working_gpt.val_data = data[n:]

Definition at line 51 of file working_gpt.py.

◆ vocab_size

working_gpt.vocab_size = len(chars)

Definition at line 36 of file working_gpt.py.

◆ xb

working_gpt.xb

Definition at line 241 of file working_gpt.py.

◆ yb

working_gpt.yb

Definition at line 241 of file working_gpt.py.
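
The otherwise-undocumented members xb, yb, logits, loss, and set_to_none only occur together inside the training loop. A minimal reconstruction, assuming the standard shape of such a script (the actual loop in working_gpt.py may differ in detail):

for step in range(max_iters):
    # periodically report the smoothed train/val losses
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch("train")              # sample one batch of inputs/targets
    logits, loss = model(xb, yb)             # forward pass
    optimizer.zero_grad(set_to_none=True)    # set grads to None instead of zero-filling
    loss.backward()
    optimizer.step()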