Pre-trained GPT-2 model end to end
Objective
Explore the pre-trained GPT-2 model
Run the input text "cat sat on the" through pre-trained GPT-2 and extract next-token predictions
Load pre-trained GPT-2 model
from collections import OrderedDict
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import torch.nn.functional as F
%%capture
# Name of the pre-trained model checkpoint on the Hugging Face Hub
model_name = "gpt2"
# Load the matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)
# Force model to use CPU
model = model.to("cpu")
# Set the model to evaluation mode
model.eval()
## Setting the model to evaluation mode disables the Dropout layers, ensuring deterministic outputs
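Evaluation mode can be verified directly; a quick illustrative check:

# In eval mode every submodule reports training=False,
# so the Dropout layers act as no-ops during inference
print(model.training)                   # False after model.eval()
print(model.transformer.drop.training)  # False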
print("Model and tokenizer loaded successfully.")
print(f"Model: {model}")
Model and tokenizer loaded successfully.
Model: GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
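Note that lm_head, a Linear(768, 50257), mirrors the wte Embedding(50257, 768): GPT-2 ties these two matrices to a single weight tensor. A quick illustrative check:

# The LM head and the token embedding share one weight tensor (weight tying)
print(model.lm_head.weight is model.transformer.wte.weight)  # True
# Total parameter count (~124M for the "gpt2" checkpoint)
num_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {num_params:,}")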
Set custom LM-head
Extract logits
Apply temperature scaling and top-k sampling
Select final token predictions
def lm_head(model, last_token_context_vector, temperature=0.7, k=50):
    # 1. Get logits from the language-modeling head (the output embedding layer)
    output_head = model.get_output_embeddings()  # equivalent to model.lm_head
    logits = output_head(last_token_context_vector)
    # 2. Apply temperature scaling
    # Lower temp -> more confident, less random. Higher temp -> more random, creative.
    scaled_logits = logits / temperature
    # 3. Apply top-k filtering
    # Limit the sampling pool to the 'k' most likely tokens
    top_k_logits, top_k_indices = torch.topk(scaled_logits, k)
    # Create a new tensor filled with a very low value (-inf)
    filtered_logits = torch.full_like(logits, -float("Inf"))
    # Scatter the top-k logits back into their original positions
    # (the logits here are 1-D, so we scatter along dimension 0)
    filtered_logits.scatter_(0, top_k_indices, top_k_logits)
    # 4. Convert the filtered logits into a probability distribution
    probabilities = F.softmax(filtered_logits, dim=-1)
    return probabilities
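To see why a lower temperature makes sampling more confident, here is a toy illustration with made-up logits (the values are hypothetical):

# Hypothetical 3-token vocabulary: temperature reshapes the distribution
toy_logits = torch.tensor([2.0, 1.0, 0.1])
for t in (0.5, 1.0, 2.0):
    print(f"T={t}: {F.softmax(toy_logits / t, dim=-1)}")
# T=0.5 sharpens the distribution (closer to greedy); T=2.0 flattens it (more random)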
## Select top-n tokens from the probabilities calculated in the LM-head
def get_top_token(probabilities, tokenizer, top_n=1):
    # Sample top_n tokens from the final probability distribution.
    # torch.multinomial samples from a discrete probability distribution
    # (without replacement by default, so the sampled tokens are distinct).
    final_token_id = torch.multinomial(probabilities, num_samples=top_n)
    # Decode the selected token ID(s) to get the final word(s)
    final_token = tokenizer.decode(final_token_id.tolist())
    print(f"\n✅ Final Selected {top_n} Tokens: '{final_token}'")
    return final_token
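Because torch.multinomial draws at random, repeated runs return different tokens. A deterministic variant (a sketch, not part of the original notebook) would take the top_n highest-probability tokens directly:

def get_top_token_greedy(probabilities, tokenizer, top_n=1):
    # Pick the top_n highest-probability tokens instead of sampling
    top_probs, top_ids = torch.topk(probabilities, top_n)
    return tokenizer.decode(top_ids.tolist())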
Run experiment
Run custom LM-head and extract top 5 predictions
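The helper get_all_hidden_states was defined in an earlier section. A minimal sketch consistent with the output below (a 2-element ModelOutput whose first entry is the final hidden-state tensor) might look like this:

def get_all_hidden_states(model, input_ids):
    # Run only the transformer stack (no LM head). The returned ModelOutput
    # holds (last_hidden_state, past_key_values), hence the length of 2 below.
    with torch.no_grad():
        hidden_states = model.transformer(input_ids)
    print(f"Number of hidden state layers: {len(hidden_states)}")
    return hidden_states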
input_text = "cat sat on the"
# Tokenize input text
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
print(f"Input prompt: '{input_text}'")
all_token_hidden_states = get_all_hidden_states(model, input_ids)
last_token_rep = all_token_hidden_states[0][-1, -1, :]
probabilities = lm_head(model, last_token_rep)
top_tokens = get_top_token(probabilities, tokenizer, top_n=5)
Input prompt: 'cat sat on the'
Number of hidden state layers: 2
✅ Final Selected 5 Tokens: ' bench bed floor edge steps'
Next-token prediction (via the LM-head defined in the model)
with torch.no_grad():
    outputs = model(input_ids)
print(f"\nModel Output length: {len(outputs)}")
print(f"\nModel Output Keys: {outputs.keys()}")
print(f"\nModel Output Logits Shape: {outputs.logits.shape}")
print(f"\nModel Output last token Logits Shape: {outputs.logits[-1, -1, :].shape}")
Model Output length: 2
Model Output Keys: odict_keys(['logits', 'past_key_values'])
Model Output Logits Shape: torch.Size([1, 4, 50257])
Model Output last token Logits Shape: torch.Size([50257])
# Get the logits for the last token position
next_token_logits = outputs.logits[-1, -1, :]
# Convert logits to probabilities
probs = torch.softmax(next_token_logits, dim=-1)
# Get the most probable next token
predicted_token_id = torch.argmax(probs).item()
predicted_token = tokenizer.decode([predicted_token_id])
print(f"Next token prediction (from default lm-head): '{predicted_token}'")
Next token prediction (from default lm-head): ' floor'
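For a direct comparison with the custom LM-head above, the five most probable next tokens can be read off the default logits with torch.topk (greedy ranking, no temperature scaling or sampling):

# Top-5 next tokens from the default head, ranked by probability
top5 = torch.topk(probs, 5)
for token_id, p in zip(top5.indices.tolist(), top5.values.tolist()):
    print(f"'{tokenizer.decode([token_id])}': {p:.4f}")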