
OpenAI Whisper API: Audio Transcription and Translation Guide

Master OpenAI's Whisper API for audio transcription. Learn speech-to-text, translation, timestamps, and build production transcription services.

Moshiour Rahman

What is Whisper API?

OpenAI's Whisper API provides state-of-the-art speech recognition. It transcribes audio in 97 languages, translates speech from any of them into English, and can return word- and segment-level timestamps for precise audio processing.

Key Features

Feature          Description
---------------  ------------------------------
Multi-language   97 languages supported
Translation      Any language to English
Timestamps       Word and segment level
Formats          mp3, mp4, wav, webm, and more
File Size        Up to 25 MB
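
Given the 25 MB cap and a fixed set of accepted formats, it can pay to validate files locally before uploading. A minimal sketch (validate_audio_file is a hypothetical helper; the extension list mirrors the formats above, so check OpenAI's docs for the full set):

import os

# Commonly accepted extensions, plus the 25 MB size cap from the table above
ALLOWED_EXTENSIONS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
MAX_FILE_SIZE_MB = 25

def validate_audio_file(path: str) -> None:
    """Raise ValueError if the file would be rejected by the API."""
    ext = os.path.splitext(path)[1].lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise ValueError(f"Unsupported format: {ext}")
    size_mb = os.path.getsize(path) / (1024 * 1024)
    if size_mb > MAX_FILE_SIZE_MB:
        raise ValueError(f"File is {size_mb:.1f} MB; the limit is {MAX_FILE_SIZE_MB} MB")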

Getting Started

Installation

pip install openai
pip install pydub  # For audio processing; pydub needs ffmpeg installed for MP3 and most other formats

Basic Transcription

from openai import OpenAI

client = OpenAI()

# Transcribe audio file
with open("audio.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file
    )

print(transcript.text)
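
Transcription calls can fail transiently (rate limits, dropped connections), so production code should wrap them with retries. A minimal sketch using the v1 SDK's exception types (transcribe_with_retry is a hypothetical helper):

import time

from openai import OpenAI, APIConnectionError, RateLimitError

client = OpenAI()

def transcribe_with_retry(path: str, max_retries: int = 3) -> str:
    """Retry transient failures with exponential backoff (1s, 2s, 4s...)."""
    for attempt in range(max_retries):
        try:
            with open(path, "rb") as f:
                return client.audio.transcriptions.create(
                    model="whisper-1",
                    file=f
                ).text
        except (RateLimitError, APIConnectionError):
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)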

Response Formats

from openai import OpenAI

client = OpenAI()

# Text format (default)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="text"
    )
    print(result)  # Plain text string

# JSON format
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="json"
    )
    print(result.text)

# Verbose JSON (with timestamps)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="verbose_json"
    )
    print(f"Language: {result.language}")
    print(f"Duration: {result.duration}s")
    for segment in result.segments:
        print(f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")

# SRT format (subtitles)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="srt"
    )
    print(result)  # SRT formatted string

# VTT format (web subtitles)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="vtt"
    )
    print(result)

Translation

Translate to English

from openai import OpenAI

client = OpenAI()

# Translate Spanish audio to English text
with open("spanish_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="whisper-1",
        file=audio_file
    )

print(translation.text)

# With verbose output
with open("french_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json"
    )

print(f"Original language: {translation.language}")
print(f"English translation: {translation.text}")

Advanced Features

Word-Level Timestamps

from openai import OpenAI

client = OpenAI()

with open("audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json",
        timestamp_granularities=["word", "segment"]
    )

# Segment-level timestamps
print("Segments:")
for segment in result.segments:
    print(f"  [{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")

# Word-level timestamps
print("\nWords:")
for word in result.words:
    print(f"  [{word.start:.2f}s - {word.end:.2f}s]: {word.word}")

Prompting for Better Results

from openai import OpenAI

client = OpenAI()

# Use prompt to guide transcription
with open("technical_audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        prompt="Technical discussion about Python, TensorFlow, and machine learning."
    )

# Prompt for specific terminology
with open("medical_audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        prompt="Medical terminology: cardiovascular, pulmonary, neurological, oncology."
    )

# Prompt for proper nouns
with open("meeting.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        prompt="Speakers: John Smith, Sarah Johnson. Company: Acme Corp."
    )

Language Specification

from openai import OpenAI

client = OpenAI()

# Specify input language
with open("audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        language="es"  # Spanish
    )

# Supported language codes
SUPPORTED_LANGUAGES = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "ja": "Japanese",
    "ko": "Korean",
    "zh": "Chinese",
    # ... 97 languages total
}
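
A small guard built on this mapping keeps unknown codes from ever reaching the API (transcribe_in_language is a hypothetical helper):

def transcribe_in_language(path: str, language: str) -> str:
    """Transcribe with a validated ISO-639-1 language code."""
    if language not in SUPPORTED_LANGUAGES:
        raise ValueError(f"Unknown language code: {language}")
    with open(path, "rb") as f:
        return client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            language=language
        ).text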

Audio Processing

Handling Large Files

from openai import OpenAI
from pydub import AudioSegment
import os

client = OpenAI()

def split_audio(file_path: str, max_size_mb: int = 24) -> list:
    """Split audio file into chunks under max_size_mb."""
    audio = AudioSegment.from_file(file_path)

    # Calculate chunk duration
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    duration_ms = len(audio)

    if file_size_mb <= max_size_mb:
        return [file_path]

    # Split into chunks sized to stay under the limit (0.9 safety margin);
    # note this naive split can cut words at chunk boundaries
    chunk_duration_ms = int(duration_ms * (max_size_mb / file_size_mb) * 0.9)
    chunks = []

    for i in range(0, duration_ms, chunk_duration_ms):
        chunk = audio[i:i + chunk_duration_ms]
        chunk_path = f"chunk_{i}.mp3"
        chunk.export(chunk_path, format="mp3")
        chunks.append(chunk_path)

    return chunks

def transcribe_large_file(file_path: str) -> str:
    """Transcribe audio file of any size."""
    chunks = split_audio(file_path)
    full_transcript = []

    for chunk_path in chunks:
        with open(chunk_path, "rb") as f:
            result = client.audio.transcriptions.create(
                model="whisper-1",
                file=f
            )
            full_transcript.append(result.text)

        # Cleanup temporary files
        if chunk_path != file_path:
            os.remove(chunk_path)

    return " ".join(full_transcript)

# Usage
transcript = transcribe_large_file("long_audio.mp3")
print(transcript)
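
Since chunks are independent API calls, they can also be transcribed concurrently. A sketch with a thread pool, reusing split_audio from above (pool.map preserves chunk order; note this trades away the rolling-prompt trick from the prompting section):

from concurrent.futures import ThreadPoolExecutor

def transcribe_chunk(chunk_path: str) -> str:
    with open(chunk_path, "rb") as f:
        return client.audio.transcriptions.create(
            model="whisper-1",
            file=f
        ).text

def transcribe_large_file_parallel(file_path: str, workers: int = 4) -> str:
    """Transcribe chunks in parallel while preserving their order."""
    chunks = split_audio(file_path)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        texts = list(pool.map(transcribe_chunk, chunks))
    for chunk_path in chunks:
        if chunk_path != file_path:
            os.remove(chunk_path)  # clean up temporary chunk files
    return " ".join(texts)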

Audio Format Conversion

from pydub import AudioSegment
from pydub.effects import normalize
import io

def convert_to_mp3(file_path: str) -> io.BytesIO:
    """Convert any audio format to MP3."""
    audio = AudioSegment.from_file(file_path)

    # Optimize for Whisper
    audio = audio.set_frame_rate(16000)
    audio = audio.set_channels(1)

    buffer = io.BytesIO()
    audio.export(buffer, format="mp3", bitrate="64k")
    buffer.seek(0)
    buffer.name = "audio.mp3"

    return buffer

def prepare_audio_for_whisper(file_path: str) -> io.BytesIO:
    """Prepare audio file for optimal Whisper processing."""
    audio = AudioSegment.from_file(file_path)

    # Normalize loudness
    audio = normalize(audio)

    # Convert to mono
    audio = audio.set_channels(1)

    # Set sample rate
    audio = audio.set_frame_rate(16000)

    # Export to buffer
    buffer = io.BytesIO()
    audio.export(buffer, format="mp3")
    buffer.seek(0)
    buffer.name = "processed.mp3"

    return buffer
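
The returned buffer can be passed to the API in place of a file handle; setting .name above is what lets the SDK pick up a filename and infer the format ("interview.m4a" is a placeholder input):

from openai import OpenAI

client = OpenAI()

buffer = prepare_audio_for_whisper("interview.m4a")
result = client.audio.transcriptions.create(
    model="whisper-1",
    file=buffer
)
print(result.text)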

Production Service

FastAPI Transcription API

from fastapi import FastAPI, UploadFile, HTTPException
from pydantic import BaseModel
from typing import Optional, List
from openai import OpenAI
import tempfile
import os
import uuid

app = FastAPI()
client = OpenAI()

class TranscriptionRequest(BaseModel):
    language: Optional[str] = None
    prompt: Optional[str] = None
    response_format: str = "json"
    timestamps: bool = False

class TranscriptionResult(BaseModel):
    id: str
    text: str
    language: Optional[str] = None
    duration: Optional[float] = None
    segments: Optional[List[dict]] = None

# In-memory storage (use Redis in production)
transcriptions = {}

@app.post("/transcribe", response_model=TranscriptionResult)
async def transcribe_audio(
    file: UploadFile,
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    timestamps: bool = False
):
    # Validate file type
    allowed_types = [
        "audio/mpeg", "audio/mp3", "audio/wav",
        "audio/x-wav", "audio/mp4", "audio/webm"
    ]

    if file.content_type not in allowed_types:
        raise HTTPException(400, f"Unsupported file type: {file.content_type}")

    # Save upload, preserving its extension so the API can detect the format
    suffix = os.path.splitext(file.filename or "")[1] or ".mp3"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name

    try:
        # Build request options
        kwargs = {
            "model": "whisper-1",
            "response_format": "verbose_json" if timestamps else "json"
        }

        if language:
            kwargs["language"] = language
        if prompt:
            kwargs["prompt"] = prompt
        if timestamps:
            kwargs["timestamp_granularities"] = ["segment"]

        # Transcribe; the context manager closes the file handle
        with open(tmp_path, "rb") as audio:
            result = client.audio.transcriptions.create(file=audio, **kwargs)

        # Build response
        response = TranscriptionResult(
            id=str(uuid.uuid4()),
            text=result.text,
            language=getattr(result, "language", None),
            duration=getattr(result, "duration", None)
        )

        if timestamps and hasattr(result, "segments"):
            response.segments = [
                {
                    "start": s.start,
                    "end": s.end,
                    "text": s.text
                }
                for s in result.segments
            ]

        return response

    finally:
        os.unlink(tmp_path)

@app.post("/translate")
async def translate_audio(
    file: UploadFile,
    prompt: Optional[str] = None
):
    # Preserve the original extension so the API can detect the format
    suffix = os.path.splitext(file.filename or "")[1] or ".mp3"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name

    try:
        kwargs = {
            "model": "whisper-1",
            "response_format": "verbose_json"
        }

        if prompt:
            kwargs["prompt"] = prompt

        # Translate; the context manager closes the file handle
        with open(tmp_path, "rb") as audio:
            result = client.audio.translations.create(file=audio, **kwargs)

        return {
            "original_language": result.language,
            "translated_text": result.text,
            "duration": result.duration
        }

    finally:
        os.unlink(tmp_path)
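
To exercise the endpoint from Python, assuming the service runs locally on port 8000, something like this works (uses the requests library; meeting.mp3 is a placeholder):

import requests

with open("meeting.mp3", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/transcribe",
        files={"file": ("meeting.mp3", f, "audio/mpeg")},
        params={"timestamps": True}
    )

resp.raise_for_status()
print(resp.json()["text"])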

Async Processing

from fastapi import FastAPI, UploadFile, BackgroundTasks, HTTPException
from openai import OpenAI
import aiofiles
import os
import uuid

app = FastAPI()
client = OpenAI()

# Job storage
jobs = {}

def process_transcription(job_id: str, file_path: str, options: dict):
    """Background task; a plain def so FastAPI runs the blocking
    API call in a worker thread instead of on the event loop."""
    try:
        jobs[job_id]["status"] = "processing"

        with open(file_path, "rb") as f:
            result = client.audio.transcriptions.create(
                model="whisper-1",
                file=f,
                **options
            )

        jobs[job_id]["status"] = "completed"
        jobs[job_id]["result"] = result.text

    except Exception as e:
        jobs[job_id]["status"] = "failed"
        jobs[job_id]["error"] = str(e)

    finally:
        os.unlink(file_path)

@app.post("/transcribe/async")
async def transcribe_async(
    file: UploadFile,
    background_tasks: BackgroundTasks
):
    job_id = str(uuid.uuid4())

    # Save file
    file_path = f"/tmp/{job_id}.mp3"
    async with aiofiles.open(file_path, "wb") as f:
        await f.write(await file.read())

    # Initialize job
    jobs[job_id] = {"status": "queued", "result": None}

    # Add background task
    background_tasks.add_task(
        process_transcription,
        job_id,
        file_path,
        {"response_format": "json"}
    )

    return {"job_id": job_id}

@app.get("/transcribe/status/{job_id}")
async def get_transcription_status(job_id: str):
    if job_id not in jobs:
        raise HTTPException(404, "Job not found")

    return jobs[job_id]
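
A matching client submits the file, then polls the status endpoint until the job settles (again assuming a local server on port 8000):

import time
import requests

BASE = "http://localhost:8000"

with open("long_audio.mp3", "rb") as f:
    job = requests.post(f"{BASE}/transcribe/async", files={"file": f}).json()

while True:
    status = requests.get(f"{BASE}/transcribe/status/{job['job_id']}").json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(2)

print(status.get("result") or status.get("error"))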

Subtitle Generation

Generate SRT Subtitles

from openai import OpenAI
from datetime import timedelta

client = OpenAI()

def format_timestamp_srt(seconds: float) -> str:
    """Convert seconds to SRT timestamp format."""
    td = timedelta(seconds=seconds)
    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = td.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

def generate_srt(audio_path: str, max_chars_per_line: int = 42) -> str:
    """Generate SRT subtitles from audio."""
    with open(audio_path, "rb") as f:
        result = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",
            timestamp_granularities=["segment"]
        )

    srt_content = []

    for i, segment in enumerate(result.segments, 1):
        start = format_timestamp_srt(segment.start)
        end = format_timestamp_srt(segment.end)
        text = segment.text.strip()

        # Split long lines
        if len(text) > max_chars_per_line:
            words = text.split()
            lines = []
            current_line = []

            for word in words:
                if len(" ".join(current_line + [word])) <= max_chars_per_line:
                    current_line.append(word)
                else:
                    lines.append(" ".join(current_line))
                    current_line = [word]

            if current_line:
                lines.append(" ".join(current_line))

            text = "\n".join(lines)

        srt_content.append(f"{i}\n{start} --> {end}\n{text}\n")

    return "\n".join(srt_content)

# Usage
srt = generate_srt("video.mp3")
with open("subtitles.srt", "w") as f:
    f.write(srt)

Generate VTT for Web

def generate_vtt(audio_path: str) -> str:
    """Generate WebVTT subtitles."""
    with open(audio_path, "rb") as f:
        result = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",
            timestamp_granularities=["segment"]
        )

    vtt_content = ["WEBVTT\n"]

    for segment in result.segments:
        start = format_timestamp_vtt(segment.start)
        end = format_timestamp_vtt(segment.end)
        text = segment.text.strip()

        vtt_content.append(f"{start} --> {end}\n{text}\n")

    return "\n".join(vtt_content)

def format_timestamp_vtt(seconds: float) -> str:
    """Convert seconds to VTT timestamp format."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

Summary

Feature       Method
------------  -------------------------------------
Transcribe    client.audio.transcriptions.create()
Translate     client.audio.translations.create()
Timestamps    timestamp_granularities=["word"]
Subtitles     response_format="srt"
Prompt Guide  prompt="context..."

The Whisper API provides powerful, accurate speech recognition for building transcription and translation services.

Moshiour Rahman

Software Architect & AI Engineer

Enterprise software architect with deep expertise in financial systems, distributed architecture, and AI-powered applications. Building large-scale systems at Fortune 500 companies. Specializing in LLM orchestration, multi-agent systems, and cloud-native solutions. I share battle-tested patterns from real enterprise projects.