Pre-trained GPT-2 model end to end

Objective

  1. Explore Pre-trained GPT-2

  2. Run input text - “cat sat on the” through Pre-trained GPT-2 and extract next token predictions

Load pre-trained GPT-2 model

from collections import OrderedDict
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import torch.nn.functional as F
/opt/hostedtoolcache/Python/3.11.15/x64/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
%%capture

# Load pre-trained model and tokenizer from the Hugging Face hub.
# %%capture suppresses the download progress bars in the notebook output.
model_name = "gpt2"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model (causal LM head on top of the GPT-2 transformer stack)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Force model to use CPU (no GPU assumed in this environment)
model = model.to("cpu")
# Set the model to evaluation mode
model.eval()
## Disable Dropout layers to ensure deterministic outputs
Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
# Confirm the load succeeded and dump the architecture summary.
for message in ("Model and tokenizer loaded successfully.", f"Model: {model}"):
    print(message)
Model and tokenizer loaded successfully.
Model: GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

Extract hidden state of each layer in the model

def get_all_hidden_states(model, input_ids, hidden=False):
    """Run ``input_ids`` through the model's transformer stack (no LM head).

    Parameters:
        model: a causal LM whose ``.transformer`` submodule accepts an
            ``output_hidden_states`` keyword (e.g. GPT2LMHeadModel).
        input_ids: LongTensor of token ids, shape (batch, seq_len).
        hidden: when True, return the per-layer hidden states
            (embedding output plus one entry per transformer block);
            when False, return the raw transformer output object,
            whose element 0 is the last hidden state.

    Returns:
        Tuple of hidden-state tensors if ``hidden`` is True, otherwise
        the transformer's output object.
    """
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        transformer_outputs = model.transformer(input_ids, output_hidden_states=hidden)
        # BUG FIX: the original message claimed this was the number of hidden
        # state layers, but len() of the output object counts its fields
        # (e.g. last_hidden_state, past_key_values), not layers.
        print(f"Number of transformer output fields: {len(transformer_outputs)}")

        if hidden:
            all_hidden_states = transformer_outputs.hidden_states
            print(f"Number of hidden state layers: {len(all_hidden_states)}")

    return all_hidden_states if hidden else transformer_outputs

Set custom LM-head

  • Extract logits

  • Apply temperature scaling and top-k sampling

  • Select final token predictions

def lm_head(model, last_token_context_vector, temperature=0.7, k=50):
    """Project a context vector to a top-k, temperature-scaled
    next-token probability distribution.

    Parameters:
        model: a causal LM exposing ``get_output_embeddings()``
            (the tied LM head, e.g. ``model.lm_head``).
        last_token_context_vector: hidden-state tensor for the position
            to predict from; last dimension is the model's hidden size.
        temperature: softmax temperature. Lower -> more confident /
            less random; higher -> more random / creative.
        k: number of highest-probability tokens kept for sampling.

    Returns:
        Tensor of probabilities over the vocabulary (same leading shape
        as the input); exactly ``k`` entries per distribution are nonzero.
    """
    # 1. Get logits from the language-modeling head.
    # BUG FIX: the original local was named ``lm_head``, shadowing this
    # function's own name.
    output_head = model.get_output_embeddings()  # equivalent to model.lm_head
    logits = output_head(last_token_context_vector)

    # Temperature scaling.
    scaled_logits = logits / temperature

    # Top-k sampling: limit the pool to the k most likely tokens.
    # BUG FIX: operate on the last dimension explicitly (the original used
    # dim 0, which only works for 1-D input, while its comment said dim 1).
    top_k_logits, top_k_indices = torch.topk(scaled_logits, k, dim=-1)

    # Everything outside the top-k gets -inf so softmax assigns it zero mass.
    filtered_logits = torch.full_like(scaled_logits, -float("Inf"))
    # Scatter the top-k logits back to their original vocabulary positions.
    filtered_logits.scatter_(-1, top_k_indices, top_k_logits)

    # Convert the filtered logits into a probability distribution.
    probabilities = F.softmax(filtered_logits, dim=-1)
    return probabilities
## Select top-n tokens from the probabilities calculated in the LM-head

def get_top_token(probabilities, tokenizer, top_n=1):
    """Sample ``top_n`` token ids from a probability distribution and
    decode them to text with the given tokenizer."""
    # torch.multinomial draws from a discrete probability distribution;
    # by default it samples without replacement.
    sampled_ids = torch.multinomial(probabilities, num_samples=top_n)

    # Map the sampled id(s) back to their surface string.
    decoded_tokens = tokenizer.decode(sampled_ids.tolist())

    print(f"\n✅ Final Selected {top_n} Tokens: '{decoded_tokens}'")
    return decoded_tokens

Run experiment

Run custom LM-head and extract top 5 predictions

input_text = "cat sat on the"

# Tokenize input text into a (1, seq_len) tensor of token ids
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
print(f"Input prompt: '{input_text}'")

# hidden defaults to False, so this returns the raw transformer outputs;
# element 0 is the last-hidden-state tensor of shape (batch, seq_len, hidden)
all_token_hidden_states = get_all_hidden_states(model, input_ids)
# Context vector of the final token position (1-D, length = hidden size)
last_token_rep = all_token_hidden_states[0][-1, -1, :]

# Custom LM head: temperature-scaled, top-k-filtered probabilities
probabilities = lm_head(model, last_token_rep)
# Sample 5 tokens from that distribution and print them
top_tokens = get_top_token(probabilities, tokenizer, top_n=5)
Input prompt: 'cat sat on the'
Number of hidden state layers: 2

✅ Final Selected 5 Tokens: ' bench bed floor edge steps'

Next token prediction (via LM-head defined in the model)

# Full forward pass through the model, this time including its built-in
# LM head, so the output carries logits directly.
with torch.no_grad():
    outputs = model(input_ids)
    # len() of the output object counts its fields (logits, past_key_values)
    print(f"\nModel Output length: {len(outputs)}")
    print(f"\nModel Output Keys: {outputs.keys()}")
    # logits shape: (batch, seq_len, vocab_size)
    print(f"\nModel Output Logits Shape: {outputs.logits.shape}")
    print(f"\nModel Output last token Logits Shape: {outputs.logits[-1, -1, :].shape}")
Model Output length: 2

Model Output Keys: odict_keys(['logits', 'past_key_values'])

Model Output Logits Shape: torch.Size([1, 4, 50257])

Model Output last token Logits Shape: torch.Size([50257])
# Logits for the final position of the (single) input sequence.
next_token_logits = outputs.logits[-1, -1, :]

# Softmax turns the raw logits into a distribution over the vocabulary.
probs = torch.softmax(next_token_logits, dim=-1)

# Greedy decoding: pick the single highest-probability token id.
predicted_token_id = probs.argmax().item()
predicted_token = tokenizer.decode([predicted_token_id])

print(f"Next token prediction (from default lm-head): '{predicted_token}'")
Next token prediction (from default lm-head): ' floor'