Fine-tuning an LLM Using PyTorch
# get datasets and, incidentally, install the dependencies needed to fine-tune models, via MIT's mitdeeplearning package
!pip install mitdeeplearning # > /dev/null 2>&1
import mitdeeplearning as mdl
Successfully installed boto3-stubs-1.38.0 botocore-stubs-1.38.0 datasets-3.5.0 dill-0.3.8 fsspec-2024.12.0 levenshtein-0.27.1 lion-pytorch-0.2.3 litellm-1.67.1 mitdeeplearning-0.7.5 multiprocess-0.70.16 mypy-boto3-bedrock-runtime-1.38.0 nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nvjitlink-cu12-12.4.127 opik-1.7.9 pydantic-settings-2.9.1 python-dotenv-1.1.0 rapidfuzz-3.13.0 tiktoken-0.9.0 types-awscrt-0.26.1 types-s3transfer-0.11.5 uuid6-2024.7.10 xxhash-3.5.0
import os
import json
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from lion_pytorch import Lion
Fine-tuning Gemma 2B as a chatbot
Templating and tokenization
Templating
Language models that function as chatbots are able to generate responses to user queries – but how do they do this?
We need to give the model a way to understand the conversation and generate responses in a coherent manner – some structure that marks what is input and what is output.
| Feature | Hugging Face Chat Templates | LangChain Prompt Templates |
|---|---|---|
| Primary Goal | Model-specific input formatting | Flexible prompt construction |
| Integration | Tokenizer-level | LangChain chains and tools |
| Customization | Limited to model’s expected format | Highly customizable with dynamic variables |
| Transparency | Templates are accessible and modifiable | Templates are defined in code and fully visible |
| Use Case | Ensuring compatibility with specific model formats | Building complex, multi-step workflows |
# Basic question-answer template
template_without_answer = "<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n"
template_with_answer = template_without_answer + "{answer}<end_of_turn>\n"
# Let's try to put something into the template to see how it looks
print(template_with_answer.format(question="What is your name?", answer="My name is Gemma!"))
<start_of_turn>user
What is your name?<end_of_turn>
<start_of_turn>model
My name is Gemma!<end_of_turn>
Tokenization
To operate on language, we need to prepare the text for the model. Fundamentally we can think of language as a sequence of “chunks” of text. We can split the text into individual chunks, and then map these chunks to numerical tokens – collectively this is the process of tokenization. Numerical tokens can then be fed into a language model.
There are several common approaches to tokenizing natural language text:
- Word-based tokenization: splits text into individual words -> simple, but can lead to large vocabularies and does not handle unknown words well.
- Character-based tokenization: splits text into individual characters -> very small vocabulary, but produces long sequences and loses word-level meaning.
- Subword tokenization: breaks words into smaller units (subwords) based on their frequency. The most popular approach is byte-pair encoding (BPE), which iteratively merges the most frequent character pairs. Modern language models typically use subword tokenization because it balances vocabulary size and sequence length while handling unknown words effectively by breaking them into known subword units.
# we will use the tokenizer from the Gemma 2B model, which uses BPE. Let's load it and inspect it.
# Load the tokenizer for Gemma 2B
model_id = "unsloth/gemma-2-2b-it" #"google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# How big is the tokenizer?
print(f"Vocab size: {len(tokenizer.get_vocab())}")
Vocab size: 256000
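To see the subword behavior concretely, we can ask the tokenizer to split some text into its token strings before converting them to IDs. A quick check (the words chosen here are arbitrary):
# A long or uncommon word is broken into known subword pieces rather than being
# mapped to a single "unknown" token.
print(tokenizer.tokenize("unbelievability"))
print(tokenizer.tokenize("Tokenization handles unknown words gracefully."))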
We not only need to be able to tokenize the text into tokens (encode), but also de-tokenize the tokens back into text (decode). Our tokenizer will have:
- an encode function to tokenize the text into tokens, and
- a decode function to de-tokenize back to text so that we can read out the model’s outputs.
# Let's test out both steps:
text = "Here is some sample text!"
print(f"Original text: {text}")
# Tokenize the text
tokens = tokenizer.encode(text, return_tensors="pt")
print(f"Encoded tokens: {tokens}")
# Decode the tokens
decoded_text = tokenizer.decode(tokens[0], skip_special_tokens=True)
print(f"Decoded text: {decoded_text}")
Original text: Here is some sample text!
Encoded tokens: tensor([[ 2, 4858, 603, 1009, 6453, 2793, 235341]])
Decoded text: Here is some sample text!
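As an aside, many Hugging Face tokenizers (including Gemma's) ship with a built-in chat template, so the manual template defined above can typically be reproduced with tokenizer.apply_chat_template. A hedged sketch (the exact formatting, e.g. a leading <bos> token, may differ slightly between models):
# Build the same kind of prompt via the tokenizer's own chat template.
messages = [{"role": "user", "content": "What is your name?"}]
chat_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(chat_prompt)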
To “chat” with our LLM chatbot, we need to use the tokenizer and the chat template together, in order for the model to respond to the user’s question. We can use the templates defined earlier to construct a prompt for the model, without the answer.
template_without_answer
'<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n'
prompt = template_without_answer.format(question="What is the capital of France? Use one word.")
print(prompt)
<start_of_turn>user
What is the capital of France? Use one word.<end_of_turn>
<start_of_turn>model
If we were to feed this to the model, it would see that it is now the start of the model’s turn, and it would generate the answer to this question.
Getting started with the LLM
Now that we have a way to prepare our data, we’re ready to work with our LLM.
LLMs like Gemma 2B are trained on a large corpus of text on the task of predicting the next token in a sequence, given the previous tokens.
We call this training task “next token prediction”, also known as “causal language modeling” or “autoregressive language modeling”.
We can leverage models trained in this way to generate new text by sampling from the predicted probability distribution over the next token.
In the next few cells, we’ll:
- load Gemma 2B
- construct a prompt in chat template form and tokenize it
- feed it to the model to predict next-token probabilities
- get the next token (which is still numerical) and decode it to text
# 1. Load the model -- note that this may take a few minutes
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
# ==== Putting it together to prompt the model and generate a response ===
# 1. Construct the prompt in chat template form
question = "What is the capital of France? Use one word."
prompt = template_without_answer.format(question=question)
# 2. Tokenize the prompt
tokens = tokenizer.encode(prompt, return_tensors="pt").to(model.device)  # return_tensors: "pt" -> PyTorch tensor, "tf" -> TensorFlow tensor, "np" -> NumPy array
# 3. Feed through the model to predict the next token probabilities
with torch.no_grad():
    output = model(tokens)
probs = F.softmax(output.logits, dim=-1)
# 4. Get the next token, according to the maximum probability
next_token = torch.argmax(probs[0, -1, :]).item()
# 5. Decode the next token
next_token_text = tokenizer.decode(next_token)
print(f"Prompt: {prompt}")
print(f"Predicted next token: {next_token_text}")
Prompt: <start_of_turn>user
What is the capital of France? Use one word.<end_of_turn>
<start_of_turn>model
Predicted next token: Paris
Note that the model does not answer the question outright – it only predicts the next token in the sequence. In this case the answer happens to fit in a single token, so the model gets away with a single prediction. For more complex questions we cannot generate just one token; we need to generate a sequence of tokens.
This can be done by doing the process above iteratively, step by step – after each step we feed the generated token back into the model and predict the next token again.
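As a rough sketch of what that iterative loop looks like under the hood (simple greedy decoding with the prompt tokens from above; an illustration rather than production sampling code):
# Minimal greedy decoding loop: repeatedly predict the most likely next token
# and append it to the sequence, stopping at a budget or the end-of-sequence token.
generated = tokens.clone()
for _ in range(20):
    with torch.no_grad():
        logits = model(generated).logits
    next_token = torch.argmax(logits[0, -1, :]).item()
    generated = torch.cat([generated, torch.tensor([[next_token]], device=model.device)], dim=-1)
    if next_token == tokenizer.eos_token_id:
        break
print(tokenizer.decode(generated[0], skip_special_tokens=True))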
Instead of doing this manually ourselves, we can use the model’s built-in model.generate()
functionality (supported by HuggingFace’s Transformers library) to generate max_new_tokens
number of tokens, and decode the output back to text.
template_without_answer
'<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n'
prompt = template_without_answer.format(question="What does Deez stand for?")
tokens = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
output = model.generate(tokens, max_new_tokens=20)
print(tokenizer.decode(output[0]))
<bos><start_of_turn>user
What does Deez stand for?<end_of_turn>
<start_of_turn>model
"Deez" is a slang term that doesn't have a single, official meaning.
Now we have the basic pipeline for generating text with an LLM!
1.3: Fine-tuning
Fine-tuning is a technique that allows us to adapt a pre-trained neural network to better suit a downstream task, domain, style, or capability. Fine-tuning is used in a variety of applications, not just language modeling. In language modeling specifically, fine-tuning can be used to:
- Adapt the model’s writing style
- Improve performance on specific tasks or domains
- Teach the model new capabilities or knowledge
- Reduce unwanted behaviors or biases
In the next task, you will fine-tune the Gemma LLM to adapt the model’s writing style. Continuing with our Irish theme, we will first fine-tune the LLM to chat in the style of a leprechaun.
We have prepared a question-answer dataset where the questions are in standard English style (i.e. “base” style) and the answers are in “leprechaun” style (written by another LLM). Let’s load the dataset and inspect it.
train_loader, test_loader = mdl.lab3.create_dataloader(style="leprechaun")
sample = train_loader.dataset[44]
question = sample['instruction']
answer = sample['response']
answer_style = sample['response_style']
print(f"Question: {question}\n\n" +
f"Original Answer: {answer}\n\n" +
f"Answer Style: {answer_style}")
Question: Are lilies safe for cats?
Original Answer: No, lilies are toxic to cats if consumed and should not be kept in a household with cats
Answer Style: Och, no indeed, me hearty! Them lilies there be as dangerous as a pot o' gold guarded by a banshee to a wee kitty cat! If a whiskered lad or lass takes a bite of one, it's as bad as swallowing a curse from the old Hag herself. So, ye best keep them far from yer feline friends, or else ye'll be needin' more than just a four-leaf clover to bring luck back into yer home!
train_loader.dataset
Dataset({
features: ['instruction', 'context', 'response', 'category', 'response_style'],
num_rows: 2048
})
1.3.1: Chat function
Before we start fine-tuning, we will build a function to easily chat with the model, both to monitor its progress over the course of fine-tuning and to generate responses to questions.
Recall our core steps from before:
- Construct the question prompt using the template
- Tokenize the text
- Feed the tokens through the model to predict the next token probabilities
- Decode the predicted tokens back to text
Use these steps to build out the chat function below.
def chat(question, max_new_tokens=32, temperature=0.7, only_answer=False):
    # 1. Construct the prompt using the template
    prompt = template_without_answer.format(question=question)

    # 2. Tokenize the text
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

    # 3. Feed through the model to predict the next token probabilities
    with torch.no_grad():
        outputs = model.generate(**input_ids, do_sample=True, max_new_tokens=max_new_tokens, temperature=temperature)

    # 4. Only return the answer tokens if only_answer is True
    output_tokens = outputs[0]
    if only_answer:
        output_tokens = output_tokens[input_ids["input_ids"].shape[1]:]

    # 5. Decode the tokens
    result = tokenizer.decode(output_tokens, skip_special_tokens=True)
    return result
# Let's try chatting with the model now to test if it works!
answer = chat(
"Write me a story",
only_answer=True,
max_new_tokens=32,
temperature=0.000001
)
print(answer)
print("\n==============\n")
answer = chat(
"Write me a story",
only_answer=True,
max_new_tokens=32,
temperature=0.000001
)
print(answer)
print("\n==============\n")
answer = chat(
"Write me a story",
only_answer=True,
max_new_tokens=32,
temperature=0.999999
)
print(answer)
The wind whipped Elara's cloak around her like a phantom, carrying the scent of salt and brine. She stood at the edge of the cliff, the
==============
The wind whipped Elara's cloak around her like a phantom, carrying the scent of salt and brine. She stood at the edge of the cliff, the
==============
The wind whipped Aella's braid across her face as she scaled the crumbling city wall. Below, the sun bled into the horizon, casting the cobbl
1.3.2: Parameter-efficient fine-tuning
In fine-tuning, the weights of the model are updated to better fit the fine-tuning dataset and/or task. Updating all the weights in a language model like Gemma 2B – which has ~2 billion parameters – is computationally expensive. There are many techniques to make fine-tuning more efficient.
We will use a technique called LoRA – low-rank adaptation – to make the fine-tuning process more efficient. LoRA is a way to fine-tune LLMs very efficiently by only updating a small subset of the model’s parameters, and it works by adding trainable low-rank matrices to the model. While we will not go into the details of LoRA here, you can read more about it in the LoRA paper. We will use the peft library to apply LoRA to the Gemma model.
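For intuition, the core idea can be sketched in a few lines: each selected weight matrix W stays frozen, and a trainable low-rank product B @ A is added on top of it. This is just the concept, with illustrative dimensions, not how peft implements it internally:
# Conceptual LoRA sketch: adapt a frozen weight with a rank-r update.
d_in, d_out, r = 2304, 2048, 8                 # e.g. a q_proj-sized layer, rank 8
W = torch.randn(d_out, d_in)                   # frozen pretrained weight
A = torch.randn(r, d_in, requires_grad=True)   # trainable low-rank factor
B = torch.zeros(d_out, r, requires_grad=True)  # trainable factor (zero init, so B @ A starts at 0)
W_adapted = W + B @ A                          # what the adapted layer effectively uses
print(W.numel(), A.numel() + B.numel())        # full vs. trainable parameter counts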
# LoRA is a way to finetune LLMs very efficiently by only updating a small subset of the model's parameters
def apply_lora(model):
    # Define LoRA config
    lora_config = LoraConfig(
        r=8,  # rank of the LoRA matrices
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"
        ],
    )

    # Apply LoRA to the model
    lora_model = get_peft_model(model, lora_config)
    return lora_model
model = apply_lora(model)
# Print the number of trainable parameters after applying LoRA
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"number of trainable parameters: {trainable_params}")
print(f"total parameters: {total_params}")
print(f"percentage of trainable parameters: {trainable_params / total_params * 100:.2f}%")
number of trainable parameters: 10383360
total parameters: 2624725248
percentage of trainable parameters: 0.40%
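A quick sanity check on that number: with rank r = 8, each adapted weight matrix of shape (d_in, d_out) contributes r * (d_in + d_out) trainable parameters, and the layer shapes can be read off the model structure printed further below. Summing over the seven target modules in each of Gemma 2B's 26 decoder layers reproduces the count (a back-of-the-envelope check, not needed for training):
# Back-of-the-envelope check of the LoRA trainable-parameter count.
r = 8
layer_dims = {            # (d_in, d_out) for each adapted module in one decoder layer
    "q_proj": (2304, 2048),
    "k_proj": (2304, 1024),
    "v_proj": (2304, 1024),
    "o_proj": (2048, 2304),
    "gate_proj": (2304, 9216),
    "up_proj": (2304, 9216),
    "down_proj": (9216, 2304),
}
per_layer = sum(r * (d_in + d_out) for d_in, d_out in layer_dims.values())
print(per_layer * 26)  # 26 decoder layers -> 10,383,360, matching the count above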
1.3.3: Forward pass and loss computation
Now let’s define a function to perform a forward pass through the LLM and compute the loss. The forward pass gives us the logits, which reflect the predicted probability distribution over the next token. We can compute the loss by comparing the predicted logits to the true next token – our target label. Note that this is effectively a classification problem! So, our loss can be captured by the cross-entropy loss, and we can use PyTorch’s nn.functional.cross_entropy function to compute it.
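As a quick illustration of the shapes involved before writing the full function (toy numbers, not from the model):
# Toy example: per-token cross entropy over a tiny vocabulary.
vocab_size = 8
toy_logits = torch.randn(1, 5, vocab_size)          # (batch, seq_len, vocab)
toy_targets = torch.randint(0, vocab_size, (1, 5))  # (batch, seq_len)
per_token_loss = F.cross_entropy(
    toy_logits.view(-1, vocab_size),  # flatten to (batch * seq_len, vocab)
    toy_targets.view(-1),             # flatten to (batch * seq_len,)
    reduction="none",                 # keep one loss value per token position
)
print(per_token_loss.shape)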
def forward_and_compute_loss(model, tokens, mask, context_length=512):
    # Truncate to the context length
    tokens = tokens[:, :context_length]
    mask = mask[:, :context_length]

    # Construct the input, target, and mask:
    #   x    -- all tokens except the last one in each sequence (the model input)
    #   y    -- all tokens except the first one (the expected next-token targets)
    #   mask -- realigned with the targets by dropping the first position
    # e.g.
    #   tokens: [CLS] The cat sat on the mat [SEP]
    #   x:      [CLS] The cat sat on the
    #   y:      The cat sat on the mat [SEP]
    #   mask:   [1, 1, 1, 1, 1, 1, 0]
    x = tokens[:, :-1]
    y = tokens[:, 1:]
    mask = mask[:, 1:]

    # Forward pass to compute logits: a tensor of shape
    # (batch_size, sequence_length, vocab_size) containing the unnormalized
    # scores for each vocabulary token at each position in the sequence.
    logits = model(x).logits

    # Compute the per-token loss
    loss = F.cross_entropy(
        logits.view(-1, logits.size(-1)),  # flatten logits to (batch_size * sequence_length, vocab_size)
        y.view(-1),                        # flatten targets to (batch_size * sequence_length)
        reduction="none",                  # keep the loss for each individual token
    )

    # Mask out the loss for non-answer tokens: mask.view(-1) aligns the mask with
    # the flattened loss tensor, and loss[mask.view(-1)] keeps only the positions
    # where the mask is True, so the model is trained only on the answer tokens.
    loss = loss[mask.view(-1)].mean()

    return loss
1.3.4: Training loop for fine-tuning
With this function to compute the loss, we can now define a training loop to fine-tune the model using LoRA. This training loop has the same core components as we’ve seen before in other labs:
- Grab a batch of data from the dataset (using the DataLoader)
- Feed the data through the model to complete a forward pass and compute the loss
- Backward pass to update the model weights
The data in our DataLoader is initially text, and is not structured in our question-answer template. So in step (1) we will need to format the data into our question-answer template previously defined, and then tokenize the text.
We care about the model’s answer to the question; the “answer” tokens are the part of the text we want to predict and compute the loss for. So, after tokenizing the text we need to denote to the model which tokens are part of the “answer” and which are part of the “question”. We can do this by computing a mask for the answer tokens, and then using this mask to compute the loss.
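Concretely, the mask can be derived from the tokenizer's offset_mapping, which records each token's character span in the original string: tokens whose span starts at or after the answer's first character belong to the answer. A small sketch of this (it mirrors what the training loop below does):
# Sketch: build a boolean mask that is True only for the answer tokens.
example_question = "What is the capital of France?"
example_answer = "Paris, it is."
example_text = template_with_answer.format(question=example_question, answer=example_answer)
enc = tokenizer(example_text, return_tensors="pt", return_offsets_mapping=True)
answer_start = example_text.index(example_answer)            # character offset of the answer
answer_mask = enc["offset_mapping"][:, :, 0] >= answer_start  # (batch, seq_len) boolean mask
print(answer_mask.int())                                      # 1s mark the answer tokens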
Finally, we will complete the backward pass to update the model weights.
Let’s put this all together in the training loop below.
train_loader.dataset
Dataset({
features: ['instruction', 'context', 'response', 'category', 'response_style'],
num_rows: 2048
})
model
Model structure:
PeftModelForCausalLM(
(base_model): LoraModel(
(model): PeftModelForCausalLM(
(base_model): LoraModel(
(model): Gemma2ForCausalLM(
(model): Gemma2Model(
(embed_tokens): Embedding(256000, 2304, padding_idx=0)
(layers): ModuleList(
(0-25): 26 x Gemma2DecoderLayer(
(self_attn): Gemma2Attention(
(q_proj): lora.Linear(
(base_layer): Linear(in_features=2304, out_features=2048, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2304, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(k_proj): lora.Linear(
(base_layer): Linear(in_features=2304, out_features=1024, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2304, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear(
(base_layer): Linear(in_features=2304, out_features=1024, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2304, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(o_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=2304, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2304, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
(mlp): Gemma2MLP(
(gate_proj): lora.Linear(
(base_layer): Linear(in_features=2304, out_features=9216, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2304, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=9216, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(up_proj): lora.Linear(
(base_layer): Linear(in_features=2304, out_features=9216, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2304, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=9216, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(down_proj): lora.Linear(
(base_layer): Linear(in_features=9216, out_features=2304, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=9216, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2304, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(act_fn): PytorchGELUTanh()
)
(input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
(post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
(pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
(post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
)
)
(norm): Gemma2RMSNorm((2304,), eps=1e-06)
(rotary_emb): Gemma2RotaryEmbedding()
)
(lm_head): Linear(in_features=2304, out_features=256000, bias=False)
)
)
)
)
)
### Training loop ###
def train(
    model,
    dataloader,
    tokenizer,
    max_steps=200,
    context_length=512,
    learning_rate=1e-4,
):
    losses = []

    # apply LoRA to the model
    model = apply_lora(model)
    optimizer = Lion(model.parameters(), lr=learning_rate)

    # Training loop
    for step, batch in enumerate(dataloader):
        question = batch["instruction"][0]
        answer = batch["response_style"][0]

        # format the question and answer into the template
        text = template_with_answer.format(question=question, answer=answer)

        # tokenize the text and compute the mask for the answer
        ids = tokenizer(text, return_tensors="pt", return_offsets_mapping=True).to(model.device)
        mask = ids["offset_mapping"][:, :, 0] >= text.index(answer)

        # feed the tokens through the model and compute the loss
        loss = forward_and_compute_loss(
            model=model,
            tokens=ids["input_ids"],
            mask=mask,
            context_length=context_length,
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

        # monitor progress
        if step % 10 == 0:
            print(chat("What is the capital of France?", only_answer=True))
            print(f"step {step} loss: {torch.mean(torch.tensor(losses)).item()}")
            losses = []

        if step > 0 and step % max_steps == 0:
            break

    return model
# Call the train function to fine-tune the model! Hint: you'll start to see results after a few dozen steps.
model = train(model, train_loader, tokenizer, max_steps=50)
The capital of France is **Paris**. 🇫🇷
step 0 loss: 1.938644528388977
The capital of France is **Paris**. 🇫🇷
step 10 loss: 2.0876097679138184
The capital of France is **Paris**. 🇫🇷
step 20 loss: 1.8085353374481201
Top o' the mornin' now! Why, the capital o' grand ol' France is none other than Paris, ye hear? Top o' the
step 30 loss: 1.4446303844451904
Top o' the mornin' to ye! Now, the capital o' the land o' the French, which is as grand as it gets, is
step 40 loss: 1.469994068145752
Och, the capital o' France, Ye see, is Paris, the grandest city in the land!
step 50 loss: 1.5635912418365479
# Let's try chatting with the model again to see how it has changed!
print(chat("What is a good story about tennis", only_answer=True, max_new_tokens=200))
Top o' the mornin' to ye, me hearty! Now, listen here ye, for I'm tellin' ye a tale about a wee lad called Patrick, as he's spittin' out his top o' the mornin' in the wee hours o' the morning, ye see, and he's tellin' ye about this grand match he witnessed between two lads, Jimmy and Tommy.
Now, Jimmy's a lad with a right fiery temper, ye see, and he's been havin' a bit o' a struggle in the match, ye know, and he's tryin' to keep his head above the water. But then, Tommy, he's a big lad with a heart o' gold, and he's been out there showin' Jimmy how to win, ye see, and he's been teachin' him the art o' the comeback, ye know, and he's been help
Part 2: Evaluating a style-tuned LLM
- While benchmarks have been developed to evaluate the performance of language models on a variety of tasks, these benchmarks are not always representative of the real-world performance of the model.
- For example, a model may perform well on a benchmark but poorly on a more realistic task. Benchmarks are also limited in the scope of tasks they can cover and the capabilities they can reflect, and there can be concerns about whether the data in the benchmark was used to train the model.
- Synthetic data generation and synthetic tasks are a way to address these limitations, and this is an active area of research.
- We can also turn a qualitative evaluation of a generated response quantitative by deploying someone or something to “judge” the outputs. In this lab, we will use a technique called LLM-as-a-judge to do exactly this. This involves using a larger LLM to score the outputs of a smaller LLM. The larger LLM is used as a judge, and it is given a system prompt that describes the task we want the smaller LLM to perform and the judging criteria. A “system prompt” is a way to set the general context and guide an LLM’s behavior. Contextualized with this system prompt, the judge LLM can score the outputs of the smaller LLM, and we can use this score to evaluate how well the smaller LLM is doing.
### LLM as a judge ###
'''TODO: Experiment with different system prompts to see how they affect the judge LLM's evaluation!
Come back to this cell after you've generated some text from your model.'''
system_prompt = """
You are an impartial judge that evaluates if text was written by {style}.
An example piece of text from {style} is:
{example}
Now, analyze some new text carefully and respond on whether it follows the
same style of {style}. Be critical and identify any issues in the text.
Then convert your feedback into a number between 0 and 10: 10 if the text
is written exactly in the style of {style}, 5 if it shows mixed faithfulness
to the style, or 0 if the text is not at all written in the style of {style}.
The format of your response should be a JSON dictionary and nothing else:
{{"score": <score between 0 and 10>}}
"""
style = "Yoda"
# example = """The very Republic is threatened, if involved the Sith are. Hard to see, the dark side is. """
example = "The very Republic is threatened, if involved the Sith are. Hard to see, the dark side is. Discover who this assassin is, we must. With this Naboo queen you must stay, Qui-Gon. Protect her. May the Force be with you. A vergence, you say? But you do! Revealed your opinion is. Trained as a Jedi, you request for him? Good, good, young one."
system_prompt = system_prompt.format(style=style, example=example)
print("=== System prompt ===")
print(system_prompt)
# conceptually the same idea as LangSmith's evaluation tooling
=== System prompt ===
You are an impartial judge that evaluates if text was written by Yoda.
An example piece of text from Yoda is: The very Republic is threatened, if involved the Sith are. Hard to see, the dark side is. Discover who this assassin is, we must. With this Naboo queen you must stay, Qui-Gon. Protect her. May the Force be with you. A vergence, you say? But you do! Revealed your opinion is. Trained as a Jedi, you request for him? Good, good, young one.
Now, analyze some new text carefully and respond on whether it follows the same style of Yoda. Be critical and identify any issues in the text. Then convert your feedback into a number between 0 and 10: 10 if the text is written exactly in the style of Yoda, 5 if it shows mixed faithfulness to the style, or 0 if the text is not at all written in the style of Yoda.
The format of your response should be a JSON dictionary and nothing else: {"score": <score between 0 and 10>}
OPENROUTER_API_KEY = "" # add your OpenRouter API key here
assert OPENROUTER_API_KEY != "", "You must set your OpenRouter API key before running this cell!"
model_name = "liquid/lfm-40b"
# model_name = "google/gemma-2-9b-it"
llm = mdl.lab3.LLMClient(model=model_name, api_key=OPENROUTER_API_KEY)
Well, OpenRouter costs money too… I already have too many LLM services, so I'll just use DeepSeek this time.
[!NOTE] Alternative
import os
import getpass
# Prompt the user for their API key without echoing the input
api_key = getpass.getpass("Enter your DeepSeek API key: ")
# use deepseek-chat(DeepSeek-V3) for example
llm = mdl.lab3.LLMClient(model="deepseek-chat", api_key=api_key, api_base="https://api.deepseek.com")
Enter your DeepSeek API key: ········
- We have set up our judge LLM, but we still need to make this quantitative. We can do this by defining a metric that uses the judge LLM to score the outputs of the model. Doing this is streamlined with Comet ML’s Opik library, a platform for LLM evaluation and benchmarking.
from opik.evaluation.metrics import base_metric, score_result
class LLMJudgeEvaluator(base_metric.BaseMetric):
    def __init__(self, judge: mdl.lab3.LLMClient = None, system_prompt: str = None):
        self.judge = judge
        self.system_prompt = system_prompt
        self.prompt_template = "Evaluate this text: {text}"

    def score(self, text: str, n_tries=20, **kwargs):
        """Evaluate a piece of text by asking the judge LLM to score it."""
        for attempt in range(n_tries):
            try:
                # Convert the text to template form before passing it to the judge LLM
                prompt = self.prompt_template.format(text=text)

                # The system prompt asks the judge to output a JSON dictionary of the form:
                #   {"score": <score between 0 and 10>}
                # so we tell the judge to stop generating once it closes the dictionary
                # (i.e., when it outputs "}").
                stop = "}"

                # Call the judge LLM with the system prompt and the formatted prompt,
                # stopping the generation when the judge LLM outputs "}".
                res = self.judge.ask(
                    system=self.system_prompt,
                    user=prompt,
                    max_tokens=10,
                    stop=[stop],
                )

                # Extract the assistant's content from the API response.
                # Add the stop character back to the end of the response so it is a
                # valid JSON dictionary (it's not there because the judge LLM stopped
                # as soon as it saw it).
                res = res.choices[0].message.content + stop
                res_dict = json.loads(res)

                max_score = 10  # the maximum score the judge LLM should output
                score = res_dict["score"] / max_score  # normalize
                score = max(0.0, min(score, 1.0))      # clip between 0 and 1

                # Return the score object
                return score_result.ScoreResult(name="StyleScore", value=score)

            except Exception as e:
                if attempt == n_tries - 1:  # last attempt
                    raise e                 # re-raise the exception if all attempts failed
                continue                    # otherwise, try again
- Instantiate your Comet Opik judge using the LLMJudgeEvaluator class and system prompt.
judge = LLMJudgeEvaluator(llm, system_prompt=system_prompt)
Evaluating the model by scoring with your judge LLM
Now we can use the judge LLM to score the outputs of the model. We will use the scoring_function to score text using the judge LLM.
Feed in a few probe sentences to get a vibe check on the judge LLM.
def scoring_function(text):
    return judge.score(text).value

test_texts = [
    "Tennis is a fun sport. But you must concentrate.",
    "Fun sport, tennis is. But work hard, you must.",
    "Hard to see, the dark side is.",
]

for text in test_texts:
    score = scoring_function(text)
    print(f"{text} ==> Score: {score}")