OpenAI Whisper API: Audio Transcription and Translation Guide
Master OpenAI's Whisper API for audio transcription. Learn speech-to-text, translation, timestamps, and build production transcription services.
Moshiour Rahman
What is Whisper API?
OpenAI’s Whisper API provides state-of-the-art speech recognition. It supports transcription in 97 languages, translation into English, and word- and segment-level timestamps for precise audio processing.
Key Features
| Feature | Description |
|---|---|
| Multi-language | 97 languages supported |
| Translation | Any language to English |
| Timestamps | Word and segment level |
| Formats | mp3, mp4, wav, webm, and more |
| File Size | Up to 25 MB |
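Because uploads are capped at 25 MB, a quick pre-flight check can save a failed request. Here is a minimal sketch; the helper name and extension set are illustrative, so adjust them to the formats you actually accept:

import os

# Illustrative limits based on the table above
SUPPORTED_EXTENSIONS = {".mp3", ".mp4", ".wav", ".webm", ".m4a", ".mpeg", ".mpga"}
MAX_FILE_SIZE_MB = 25

def check_audio_file(path: str) -> None:
    """Raise if the file is too large or has an unexpected extension."""
    ext = os.path.splitext(path)[1].lower()
    if ext not in SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported format: {ext}")
    size_mb = os.path.getsize(path) / (1024 * 1024)
    if size_mb > MAX_FILE_SIZE_MB:
        raise ValueError(f"File is {size_mb:.1f} MB; the API limit is {MAX_FILE_SIZE_MB} MB")

check_audio_file("audio.mp3")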
Getting Started
Installation
pip install openai
pip install pydub # For audio processing
Basic Transcription
from openai import OpenAI

client = OpenAI()

# Transcribe audio file
with open("audio.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file
    )

print(transcript.text)
Response Formats
from openai import OpenAI

client = OpenAI()

# Plain text format (returns a raw string)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="text"
    )
print(result)  # Plain text string

# JSON format (the default)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="json"
    )
print(result.text)

# Verbose JSON (with timestamps)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="verbose_json"
    )
print(f"Language: {result.language}")
print(f"Duration: {result.duration}s")
for segment in result.segments:
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")

# SRT format (subtitles)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="srt"
    )
print(result)  # SRT formatted string

# VTT format (web subtitles)
with open("audio.mp3", "rb") as f:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="vtt"
    )
print(result)
Translation
Translate to English
from openai import OpenAI

client = OpenAI()

# Translate Spanish audio to English text
with open("spanish_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="whisper-1",
        file=audio_file
    )

print(translation.text)

# With verbose output
with open("french_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json"
    )

print(f"Original language: {translation.language}")
print(f"English translation: {translation.text}")
Advanced Features
Word-Level Timestamps
from openai import OpenAI

client = OpenAI()

with open("audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json",
        timestamp_granularities=["word", "segment"]
    )

# Segment-level timestamps
print("Segments:")
for segment in result.segments:
    print(f"  [{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")

# Word-level timestamps
print("\nWords:")
for word in result.words:
    print(f"  [{word.start:.2f}s - {word.end:.2f}s]: {word.word}")
Prompting for Better Results
from openai import OpenAI

client = OpenAI()

# Use prompt to guide transcription
with open("technical_audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        prompt="Technical discussion about Python, TensorFlow, and machine learning."
    )

# Prompt for specific terminology
with open("medical_audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        prompt="Medical terminology: cardiovascular, pulmonary, neurological, oncology."
    )

# Prompt for proper nouns
with open("meeting.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        prompt="Speakers: John Smith, Sarah Johnson. Company: Acme Corp."
    )
Language Specification
from openai import OpenAI

client = OpenAI()

# Specify input language
with open("audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        language="es"  # Spanish
    )

# Supported language codes
SUPPORTED_LANGUAGES = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "ja": "Japanese",
    "ko": "Korean",
    "zh": "Chinese",
    # ... 97 languages total
}
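The mapping above can double as input validation, so an unknown code fails fast instead of producing an unexpected transcription; if the language parameter is omitted entirely, Whisper detects the language automatically. A small illustrative helper, assuming the client and SUPPORTED_LANGUAGES defined above:

def validate_language(code: str) -> str:
    """Reject language codes that are not in the mapping above."""
    if code not in SUPPORTED_LANGUAGES:
        raise ValueError(f"Unsupported language code: {code}")
    return code

with open("audio.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        language=validate_language("es")
    )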
Audio Processing
Handling Large Files
from openai import OpenAI
from pydub import AudioSegment
import os

client = OpenAI()

def split_audio(file_path: str, max_size_mb: int = 24) -> list:
    """Split audio file into chunks under max_size_mb."""
    audio = AudioSegment.from_file(file_path)

    # Calculate chunk duration
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    duration_ms = len(audio)

    if file_size_mb <= max_size_mb:
        return [file_path]

    # Split into chunks
    chunk_duration_ms = int(duration_ms * (max_size_mb / file_size_mb) * 0.9)
    chunks = []
    for i in range(0, duration_ms, chunk_duration_ms):
        chunk = audio[i:i + chunk_duration_ms]
        chunk_path = f"chunk_{i}.mp3"
        chunk.export(chunk_path, format="mp3")
        chunks.append(chunk_path)
    return chunks

def transcribe_large_file(file_path: str) -> str:
    """Transcribe an audio file of any size."""
    chunks = split_audio(file_path)
    full_transcript = []

    for chunk_path in chunks:
        with open(chunk_path, "rb") as f:
            result = client.audio.transcriptions.create(
                model="whisper-1",
                file=f
            )
        full_transcript.append(result.text)

        # Cleanup temporary files
        if chunk_path != file_path:
            os.remove(chunk_path)

    return " ".join(full_transcript)

# Usage
transcript = transcribe_large_file("long_audio.mp3")
print(transcript)
Audio Format Conversion
from pydub import AudioSegment
from pydub.effects import normalize
import io

def convert_to_mp3(file_path: str) -> io.BytesIO:
    """Convert any audio format to MP3."""
    audio = AudioSegment.from_file(file_path)

    # Optimize for Whisper
    audio = audio.set_frame_rate(16000)
    audio = audio.set_channels(1)

    buffer = io.BytesIO()
    audio.export(buffer, format="mp3", bitrate="64k")
    buffer.seek(0)
    buffer.name = "audio.mp3"
    return buffer

def prepare_audio_for_whisper(file_path: str) -> io.BytesIO:
    """Prepare audio file for optimal Whisper processing."""
    audio = AudioSegment.from_file(file_path)

    # Normalize audio
    audio = normalize(audio)

    # Convert to mono
    audio = audio.set_channels(1)

    # Set sample rate
    audio = audio.set_frame_rate(16000)

    # Export to buffer
    buffer = io.BytesIO()
    audio.export(buffer, format="mp3")
    buffer.seek(0)
    buffer.name = "processed.mp3"
    return buffer
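Either helper returns an in-memory buffer that can be passed directly as the file argument, since the .name attribute lets the API infer the format. A quick usage sketch; the input filename is illustrative:

from openai import OpenAI

client = OpenAI()

audio_buffer = prepare_audio_for_whisper("interview.wav")  # hypothetical input file
result = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_buffer
)
print(result.text)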
Production Service
FastAPI Transcription API
from fastapi import FastAPI, UploadFile, HTTPException, BackgroundTasks
from pydantic import BaseModel
from typing import Optional, List
from openai import OpenAI
import tempfile
import os
import uuid

app = FastAPI()
client = OpenAI()

class TranscriptionRequest(BaseModel):
    language: Optional[str] = None
    prompt: Optional[str] = None
    response_format: str = "json"
    timestamps: bool = False

class TranscriptionResult(BaseModel):
    id: str
    text: str
    language: Optional[str] = None
    duration: Optional[float] = None
    segments: Optional[List[dict]] = None

# In-memory storage (use Redis in production)
transcriptions = {}

@app.post("/transcribe", response_model=TranscriptionResult)
async def transcribe_audio(
    file: UploadFile,
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    timestamps: bool = False
):
    # Validate file type
    allowed_types = [
        "audio/mpeg", "audio/mp3", "audio/wav",
        "audio/x-wav", "audio/mp4", "audio/webm"
    ]
    if file.content_type not in allowed_types:
        raise HTTPException(400, f"Unsupported file type: {file.content_type}")

    # Save uploaded file, keeping the original extension so the format can be inferred
    suffix = os.path.splitext(file.filename or "")[1] or ".mp3"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name

    try:
        # Prepare request
        with open(tmp_path, "rb") as audio_file:
            kwargs = {
                "model": "whisper-1",
                "file": audio_file,
                "response_format": "verbose_json" if timestamps else "json"
            }
            if language:
                kwargs["language"] = language
            if prompt:
                kwargs["prompt"] = prompt
            if timestamps:
                kwargs["timestamp_granularities"] = ["segment"]

            # Transcribe
            result = client.audio.transcriptions.create(**kwargs)

        # Build response
        response = TranscriptionResult(
            id=str(uuid.uuid4()),
            text=result.text,
            language=getattr(result, "language", None),
            duration=getattr(result, "duration", None)
        )
        if timestamps and hasattr(result, "segments"):
            response.segments = [
                {
                    "start": s.start,
                    "end": s.end,
                    "text": s.text
                }
                for s in result.segments
            ]
        return response
    finally:
        os.unlink(tmp_path)

@app.post("/translate")
async def translate_audio(
    file: UploadFile,
    prompt: Optional[str] = None
):
    suffix = os.path.splitext(file.filename or "")[1] or ".mp3"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name

    try:
        with open(tmp_path, "rb") as audio_file:
            kwargs = {
                "model": "whisper-1",
                "file": audio_file,
                "response_format": "verbose_json"
            }
            if prompt:
                kwargs["prompt"] = prompt

            result = client.audio.translations.create(**kwargs)

        return {
            "original_language": result.language,
            "translated_text": result.text,
            "duration": result.duration
        }
    finally:
        os.unlink(tmp_path)
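To call the service, any HTTP client works; here is a minimal sketch using the requests library, assuming the app runs locally on port 8000:

import requests

# Hypothetical local deployment; adjust the URL for your environment
url = "http://localhost:8000/transcribe"

with open("meeting.mp3", "rb") as f:
    response = requests.post(
        url,
        params={"language": "en", "timestamps": True},
        files={"file": ("meeting.mp3", f, "audio/mpeg")},
    )

print(response.json()["text"])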
Async Processing
from fastapi import FastAPI, UploadFile, BackgroundTasks, HTTPException
from openai import OpenAI
import aiofiles
import os
import uuid

app = FastAPI()
client = OpenAI()

# Job storage
jobs = {}

async def process_transcription(job_id: str, file_path: str, options: dict):
    """Background transcription task."""
    try:
        jobs[job_id]["status"] = "processing"

        with open(file_path, "rb") as f:
            result = client.audio.transcriptions.create(
                model="whisper-1",
                file=f,
                **options
            )

        jobs[job_id]["status"] = "completed"
        jobs[job_id]["result"] = result.text
    except Exception as e:
        jobs[job_id]["status"] = "failed"
        jobs[job_id]["error"] = str(e)
    finally:
        os.unlink(file_path)

@app.post("/transcribe/async")
async def transcribe_async(
    file: UploadFile,
    background_tasks: BackgroundTasks
):
    job_id = str(uuid.uuid4())

    # Save file
    file_path = f"/tmp/{job_id}.mp3"
    async with aiofiles.open(file_path, "wb") as f:
        await f.write(await file.read())

    # Initialize job
    jobs[job_id] = {"status": "queued", "result": None}

    # Add background task
    background_tasks.add_task(
        process_transcription,
        job_id,
        file_path,
        {"response_format": "json"}
    )
    return {"job_id": job_id}

@app.get("/transcribe/status/{job_id}")
async def get_transcription_status(job_id: str):
    if job_id not in jobs:
        raise HTTPException(404, "Job not found")
    return jobs[job_id]
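The jobs dict above is lost on every restart; in production the same records can live in Redis, as hinted earlier. A minimal sketch using the redis-py client, where the key prefix, expiry, and connection settings are illustrative:

import json
from typing import Optional

import redis

# Hypothetical connection settings
r = redis.Redis(host="localhost", port=6379, db=0)

def save_job(job_id: str, data: dict) -> None:
    """Persist job state with a 24-hour expiry."""
    r.set(f"whisper:job:{job_id}", json.dumps(data), ex=86400)

def load_job(job_id: str) -> Optional[dict]:
    """Fetch job state, or None if it is missing or expired."""
    raw = r.get(f"whisper:job:{job_id}")
    return json.loads(raw) if raw else None

# Mirrors the in-memory jobs dict usage above
save_job("example-job-id", {"status": "queued", "result": None})
print(load_job("example-job-id"))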
Subtitle Generation
Generate SRT Subtitles
from openai import OpenAI
from datetime import timedelta

client = OpenAI()

def format_timestamp_srt(seconds: float) -> str:
    """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)."""
    td = timedelta(seconds=seconds)
    hours, remainder = divmod(td.seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    milliseconds = td.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"

def generate_srt(audio_path: str, max_chars_per_line: int = 42) -> str:
    """Generate SRT subtitles from audio."""
    with open(audio_path, "rb") as f:
        result = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",
            timestamp_granularities=["segment"]
        )

    srt_content = []
    for i, segment in enumerate(result.segments, 1):
        start = format_timestamp_srt(segment.start)
        end = format_timestamp_srt(segment.end)
        text = segment.text.strip()

        # Split long lines
        if len(text) > max_chars_per_line:
            words = text.split()
            lines = []
            current_line = []
            for word in words:
                if len(" ".join(current_line + [word])) <= max_chars_per_line:
                    current_line.append(word)
                else:
                    lines.append(" ".join(current_line))
                    current_line = [word]
            if current_line:
                lines.append(" ".join(current_line))
            text = "\n".join(lines)

        srt_content.append(f"{i}\n{start} --> {end}\n{text}\n")

    return "\n".join(srt_content)

# Usage
srt = generate_srt("video.mp3")
with open("subtitles.srt", "w") as f:
    f.write(srt)
Generate VTT for Web
def generate_vtt(audio_path: str) -> str:
    """Generate WebVTT subtitles."""
    with open(audio_path, "rb") as f:
        result = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",
            timestamp_granularities=["segment"]
        )

    vtt_content = ["WEBVTT\n"]
    for segment in result.segments:
        start = format_timestamp_vtt(segment.start)
        end = format_timestamp_vtt(segment.end)
        text = segment.text.strip()
        vtt_content.append(f"{start} --> {end}\n{text}\n")

    return "\n".join(vtt_content)

def format_timestamp_vtt(seconds: float) -> str:
    """Convert seconds to VTT timestamp format (HH:MM:SS.mmm)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
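Usage mirrors the SRT example; write the output to a .vtt file that a web video player can load as a subtitle track:

vtt = generate_vtt("video.mp3")
with open("subtitles.vtt", "w") as f:
    f.write(vtt)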
Summary
| Feature | API usage |
|---|---|
| Transcribe | client.audio.transcriptions.create() |
| Translate | client.audio.translations.create() |
| Timestamps | timestamp_granularities=["word"] |
| Subtitles | response_format="srt" |
| Prompt guidance | prompt="context..." |
Whisper API provides powerful, accurate speech recognition for building transcription and translation services.