Python 6 min read

Fine-Tuning LLMs: Complete Guide to Custom AI Models

Learn to fine-tune large language models for your use case. Master LoRA, QLoRA, dataset preparation, and deploy custom models with OpenAI and Hugging Face.

MR

Moshiour Rahman

Advertisement

What is Fine-Tuning?

Fine-tuning adapts a pre-trained LLM to your specific use case by training it on your custom data. It improves model performance for domain-specific tasks while preserving general capabilities.

When to Fine-Tune?

Use Fine-Tuning | Use RAG Instead
Specific output format | Need current information
Domain terminology | Large knowledge base
Consistent style/tone | Frequently updated data
Classification tasks | Citation required

OpenAI Fine-Tuning

Prepare Training Data

import json

# Chat-format training examples. Every record is one conversation stored under
# a "messages" key: a system prompt, the user question, and the assistant answer.
training_data = [
    {
        "messages": [
            {"role": "system", "content": "You are a helpful customer support agent."},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }
    for question, answer in (
        (
            "How do I reset my password?",
            "To reset your password, go to Settings > Security > Reset Password. You'll receive an email with a reset link.",
        ),
        (
            "What are your business hours?",
            "We're available Monday to Friday, 9 AM to 6 PM EST. For urgent issues, our emergency line is available 24/7.",
        ),
    )
]

# Write the records as JSONL: one JSON object per line.
with open("training_data.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in training_data)

Upload and Fine-Tune

import time

from openai import OpenAI

client = OpenAI()

# Upload training file
with open("training_data.jsonl", "rb") as f:
    training_file = client.files.create(
        file=f,
        purpose="fine-tune"
    )

print(f"Training file ID: {training_file.id}")

# Create fine-tuning job
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo",  # Base model
    hyperparameters={
        "n_epochs": 3,
        "batch_size": 4,
        "learning_rate_multiplier": 1.0
    }
)

print(f"Fine-tuning job ID: {job.id}")

# Statuses after which the job will not change any more. "cancelled" is
# included so that a job cancelled from elsewhere does not leave this loop
# polling forever.
terminal_statuses = {"succeeded", "failed", "cancelled"}

# Monitor progress: poll once a minute until the job reaches a terminal status
while True:
    job_status = client.fine_tuning.jobs.retrieve(job.id)
    print(f"Status: {job_status.status}")

    if job_status.status in terminal_statuses:
        break

    time.sleep(60)

# Get fine-tuned model name (only set when the job succeeded)
if job_status.status == "succeeded":
    model_name = job_status.fine_tuned_model
    print(f"Fine-tuned model: {model_name}")

Use Fine-Tuned Model

# A fine-tuned model is called through the same chat completions endpoint;
# only the model identifier changes.
fine_tuned_model_id = "ft:gpt-3.5-turbo:org-name::job-id"  # Your fine-tuned model

chat_messages = [
    {"role": "system", "content": "You are a helpful customer support agent."},
    {"role": "user", "content": "How do I cancel my subscription?"},
]

response = client.chat.completions.create(
    model=fine_tuned_model_id,
    messages=chat_messages,
)

# Print the text of the first returned choice.
first_choice = response.choices[0]
print(first_choice.message.content)

Hugging Face Fine-Tuning

Setup

# Install the packages for the Hugging Face examples in this guide
pip install transformers datasets peft accelerate bitsandbytes
pip install trl  # For supervised fine-tuning

Load Model and Tokenizer

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Checkpoint identifier shared by the tokenizer and the model.
model_name = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the weights as float16 and leave device placement to device_map="auto".
load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

Prepare Dataset

from datasets import Dataset, load_dataset

# Custom dataset, written one example at a time ...
records = [
    {
        "instruction": "Write a product description for a smartphone",
        "input": "Features: 6.5 inch display, 128GB storage, 48MP camera",
        "output": "Introducing the perfect smartphone...",
    },
    {
        "instruction": "Summarize this article about AI",
        "input": "Long article text here...",
        "output": "This article discusses...",
    },
]

# ... then pivoted into the column-oriented dict that Dataset.from_dict takes.
data = {
    column: [record[column] for record in records]
    for column in ("instruction", "input", "output")
}

dataset = Dataset.from_dict(data)

# Format for training
def format_instruction(example):
    """Render one example as a single instruction-style prompt string.

    Args:
        example: Mapping with "instruction", "input" and "output" entries.

    Returns:
        A dict with one key, "text", holding three sections
        ("### Instruction:", "### Input:", "### Response:"). Each header is
        followed by its value on the next line, and sections are separated
        by a blank line.
    """
    sections = (
        ("### Instruction:", example['instruction']),
        ("### Input:", example['input']),
        ("### Response:", example['output']),
    )
    prompt = "\n\n".join(f"{header}\n{body}" for header, body in sections)
    return {"text": prompt}

# Apply format_instruction to the dataset; the "text" value it returns is the
# field the SFTTrainer example reads via dataset_text_field="text".
dataset = dataset.map(format_instruction)

LoRA Fine-Tuning

What is LoRA?

LoRA (Low-Rank Adaptation) fine-tunes only a small number of parameters by adding trainable low-rank matrices, making fine-tuning much faster and more memory-efficient.

from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings: rank-16 adapters on the four attention projection
# modules named in target_modules.
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Approximate output for Llama-2-7B with this config (r=16, four 4096x4096
# projections in each of 32 layers; exact formatting depends on the peft version):
# trainable params: 16,777,216 || all params: 6,755,192,832 || trainable%: 0.2484

QLoRA (Quantized LoRA)

from peft import prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

# 4-bit quantization config: NF4 quantization with double quantization,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load quantized model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare for training. prepare_model_for_kbit_training comes from peft and
# is imported at the top of this snippet; the original used it without
# importing it.
model = prepare_model_for_kbit_training(model)
# Wrap the prepared model with the LoRA adapters described by lora_config.
model = get_peft_model(model, lora_config)

Training with SFTTrainer

from trl import SFTTrainer
from transformers import TrainingArguments

# Directory that receives the trained adapter weights and the tokenizer files.
lora_output_dir = "./lora-weights"

training_args = TrainingArguments(
    output_dir="./results",
    # Schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=100,
    # Batching
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Precision and optimizer
    fp16=True,
    optim="paged_adamw_8bit",
    # Logging and checkpoint cadence (in steps)
    logging_steps=10,
    save_steps=100,
)

# NOTE(review): passing tokenizer / dataset_text_field / max_seq_length
# directly to SFTTrainer depends on the installed TRL version; confirm
# against the version in use.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=512,
    dataset_text_field="text",
)

# Train
trainer.train()

# Save LoRA weights first, then the tokenizer, into the same directory.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(lora_output_dir)

Load Fine-Tuned Model

from peft import PeftModel

# Reload the base checkpoint in float16.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

# Attach the LoRA weights saved in ./lora-weights to the base model.
adapter_model = PeftModel.from_pretrained(base_model, "./lora-weights")

# Merge weights (optional, for deployment)
# NOTE(review): the merged model is not written to disk here, while the vLLM
# example loads "./merged-model" -- presumably a save step belongs in
# between; confirm.
model = adapter_model.merge_and_unload()

Data Preparation Best Practices

Dataset Quality

def prepare_training_data(examples, *, min_instruction_len=10, min_response_len=50):
    """Clean raw instruction/response pairs and convert them to chat records.

    Args:
        examples: Iterable of mappings with string "instruction" and
            "response" entries.
        min_instruction_len: Examples whose stripped instruction is shorter
            than this many characters are dropped. Defaults to 10.
        min_response_len: Examples whose stripped response is shorter than
            this many characters are dropped. Defaults to 50.

    Returns:
        A list of ``{"messages": [...]}`` dicts, each holding one user
        message (the instruction) and one assistant message (the response),
        in the same order as the surviving inputs.

    Raises:
        KeyError: If an example lacks "instruction" or "response".
    """
    processed = []

    for ex in examples:
        # Surrounding whitespace is removed before the lengths are measured.
        instruction = ex["instruction"].strip()
        response = ex["response"].strip()

        # Skip low-quality examples: too little text on either side.
        if len(response) < min_response_len or len(instruction) < min_instruction_len:
            continue

        # Format consistently
        processed.append({
            "messages": [
                {"role": "user", "content": instruction},
                {"role": "assistant", "content": response},
            ]
        })

    return processed

# Validate data
def validate_jsonl(file_path):
    """Check that every line of a JSONL file is an object with a "messages" key.

    Args:
        file_path: Path of the JSONL file to check; it is read as UTF-8.

    Returns:
        A list of error strings, empty when no problem was found. Line
        numbers in the messages are zero-based (the first line is "Line 0").
    """
    errors = []
    with open(file_path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                errors.append(f"Line {i}: Invalid JSON")
                continue

            # A record must be a JSON object. Without the isinstance check a
            # scalar such as 5 raises TypeError on the "in" test, and the
            # JSON string "messages" would wrongly pass it.
            if not isinstance(data, dict) or "messages" not in data:
                errors.append(f"Line {i}: Missing 'messages' key")

    return errors

Data Augmentation

import random

def augment_data(examples):
    """Return the examples with paraphrased copies mixed in at random.

    Every input example is kept. After each one, with probability 0.3, an
    extra example is appended whose "instruction" is
    ``paraphrase(original instruction)`` and whose "response" is unchanged.

    NOTE(review): ``paraphrase`` is not defined in this snippet; it is
    presumably supplied elsewhere -- confirm before use.

    Args:
        examples: Iterable of mappings with "instruction" and "response".

    Returns:
        A new list containing the originals and any generated variants.
    """
    result = []

    for original in examples:
        result.append(original)

        # One random draw per example decides whether a variant is added.
        if random.random() >= 0.3:
            continue

        variant = {
            "instruction": paraphrase(original["instruction"]),
            "response": original["response"],
        }
        result.append(variant)

    return result

Evaluation

from datasets import load_metric
import numpy as np

def evaluate_model(model, tokenizer, test_data):
    """Generate a response per test example and score it with ROUGE-L.

    Args:
        model: Model exposing ``generate``; when it has a ``device``
            attribute, the tokenized inputs are moved there first.
        tokenizer: Tokenizer matching ``model``.
        test_data: Iterable of mappings with "instruction" and "response".

    Returns:
        A dict with "rouge_l" (mean ROUGE-L F-measure as a float) and
        "samples" (number of scored examples). With no examples the result
        is ``{"rouge_l": 0.0, "samples": 0}`` rather than NaN.
    """
    predictions = []
    references = []

    # Decoder-only models return the prompt tokens followed by the
    # continuation; encoder-decoder models return only the generated tokens.
    model_config = getattr(model, "config", None)
    echoes_prompt = not getattr(model_config, "is_encoder_decoder", False)
    device = getattr(model, "device", None)

    for example in test_data:
        inputs = tokenizer(example["instruction"], return_tensors="pt")
        if device is not None:
            # Inputs must be on the same device as the model weights.
            inputs = inputs.to(device)

        outputs = model.generate(**inputs, max_length=512)

        generated = outputs[0]
        if echoes_prompt:
            # Drop the echoed prompt so only newly generated text is scored.
            generated = generated[inputs["input_ids"].shape[1]:]
        pred = tokenizer.decode(generated, skip_special_tokens=True)

        predictions.append(pred)
        references.append(example["response"])

    if not predictions:
        # Nothing to score; avoid the mean of an empty list (NaN + warning).
        return {"rouge_l": 0.0, "samples": 0}

    # Calculate metrics (imported here so rouge_score is only needed when
    # there is something to score).
    from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    scores = [
        scorer.score(ref, pred)['rougeL'].fmeasure
        for pred, ref in zip(predictions, references)
    ]

    return {
        "rouge_l": float(np.mean(scores)),
        "samples": len(predictions)
    }

Deployment

Using vLLM

from vllm import LLM, SamplingParams

# Engine over the merged checkpoint directory.
llm = LLM(model="./merged-model", tensor_parallel_size=1)

# Sampling settings applied to every prompt in the batch.
sampling_params = SamplingParams(temperature=0.7, max_tokens=512)

prompts = ["Your prompt here"]
outputs = llm.generate(prompts, sampling_params)

# Print the text of the first completion returned for the first prompt.
first_completion = outputs[0].outputs[0]
print(first_completion.text)

API Server

from fastapi import FastAPI
from transformers import pipeline

app = FastAPI()

# The text-generation pipeline is built once at import time and reused by
# every request.
pipe = pipeline("text-generation", model="./fine-tuned-model")


@app.post("/generate")
async def generate(prompt: str, max_length: int = 512):
    """Generate text for ``prompt`` and return it under the "response" key.

    Args:
        prompt: Text passed to the pipeline.
        max_length: Forwarded to the pipeline as ``max_length``; defaults
            to 512.

    Returns:
        ``{"response": <generated_text of the first pipeline result>}``.
    """
    # NOTE(review): pipe(...) looks like a blocking call inside an async
    # handler -- confirm this is acceptable for the expected request load.
    generations = pipe(prompt, max_length=max_length)
    first_generation = generations[0]
    return {"response": first_generation["generated_text"]}

Summary

Method | Memory | Speed | Quality
Full Fine-tuning | High | Slow | Best
LoRA | Medium | Fast | Good
QLoRA | Low | Fast | Good
Prompt Tuning | Very Low | Fastest | Moderate

Fine-tuning enables creating specialized AI models that excel at your specific tasks.

Advertisement

MR

Moshiour Rahman

Software Architect & AI Engineer

Share:
MR

Moshiour Rahman

Software Architect & AI Engineer

Enterprise software architect with deep expertise in financial systems, distributed architecture, and AI-powered applications. Building large-scale systems at Fortune 500 companies. Specializing in LLM orchestration, multi-agent systems, and cloud-native solutions. I share battle-tested patterns from real enterprise projects.

Related Articles

Comments

Comments are powered by GitHub Discussions.

Configure Giscus at giscus.app to enable comments.