Speech Recognition in Python: Complete Voice Processing Guide
Build speech recognition applications with Python. Learn speech-to-text, voice commands, and real-time transcription with Whisper, SpeechRecognition, and pydub.
Moshiour Rahman
What is Speech Recognition?
Speech recognition converts spoken language into text. Modern AI models like Whisper achieve near-human accuracy, enabling voice assistants, transcription services, and accessibility tools.
Applications
| Use Case | Examples |
|---|---|
| Voice Assistants | Siri, Alexa, Google Assistant |
| Transcription | Meeting notes, subtitles |
| Voice Commands | Smart home, car systems |
| Accessibility | Screen readers, dictation |
Getting Started
Installation
pip install SpeechRecognition pyaudio  # PyAudio needs the PortAudio system library
pip install openai-whisper             # Whisper; also requires ffmpeg on your PATH
pip install pydub                      # Audio processing (uses ffmpeg for non-WAV formats)
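To confirm the microphone setup works, SpeechRecognition can list the input devices PyAudio sees:
import speech_recognition as sr

# Print every input device PyAudio exposes; pass the index to
# sr.Microphone(device_index=...) if the default device is wrong.
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print(f"{index}: {name}")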
Basic Speech Recognition
import speech_recognition as sr
# Initialize recognizer
recognizer = sr.Recognizer()
# Recognize from microphone
with sr.Microphone() as source:
print("Adjusting for ambient noise...")
recognizer.adjust_for_ambient_noise(source, duration=1)
print("Speak now...")
audio = recognizer.listen(source, timeout=5)
try:
# Using Google's free API
text = recognizer.recognize_google(audio)
print(f"You said: {text}")
except sr.UnknownValueError:
print("Could not understand audio")
except sr.RequestError as e:
print(f"API error: {e}")
Recognize from Audio File
import speech_recognition as sr
recognizer = sr.Recognizer()
# Load audio file
with sr.AudioFile("audio.wav") as source:
audio = recognizer.record(source)
# Recognize with different engines
# Google (free, requires internet)
text = recognizer.recognize_google(audio)
# Google Cloud (paid, more accurate)
# text = recognizer.recognize_google_cloud(audio, credentials_json=CREDENTIALS)
# Sphinx (offline, less accurate; requires pip install pocketsphinx)
# text = recognizer.recognize_sphinx(audio)
print(f"Transcription: {text}")
OpenAI Whisper
Local Whisper
import whisper
# Load model (tiny, base, small, medium, large)
model = whisper.load_model("base")
# Transcribe audio
result = model.transcribe("audio.mp3")
print(result["text"])
# With options
result = model.transcribe(
"audio.mp3",
language="en",
task="transcribe", # or "translate"
fp16=False, # For CPU
verbose=True
)
# Access segments with timestamps
for segment in result["segments"]:
print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s]: {segment['text']}")
Whisper API
from openai import OpenAI
client = OpenAI()
# Transcribe audio
with open("audio.mp3", "rb") as audio_file:
transcript = client.audio.transcriptions.create(
model="whisper-1",
file=audio_file,
response_format="text"
)
print(transcript)
# With timestamps
with open("audio.mp3", "rb") as audio_file:
transcript = client.audio.transcriptions.create(
model="whisper-1",
file=audio_file,
response_format="verbose_json",
timestamp_granularities=["word", "segment"]
)
for segment in transcript.segments:
print(f"[{segment.start:.2f}s]: {segment.text}")
# Translate to English
with open("spanish_audio.mp3", "rb") as audio_file:
translation = client.audio.translations.create(
model="whisper-1",
file=audio_file
)
print(translation.text)
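The API caps uploads at 25 MB, so longer recordings have to be split first. A minimal sketch using pydub (the 10-minute chunk length is an arbitrary choice):
from pydub import AudioSegment
from openai import OpenAI

client = OpenAI()
audio = AudioSegment.from_file("long_audio.mp3")
chunk_ms = 10 * 60 * 1000  # 10-minute chunks; adjust to stay under 25 MB

parts = []
for start in range(0, len(audio), chunk_ms):
    chunk_path = f"chunk_{start // chunk_ms}.mp3"
    audio[start:start + chunk_ms].export(chunk_path, format="mp3")
    with open(chunk_path, "rb") as f:
        parts.append(client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="text"
        ))
print(" ".join(parts))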
Real-Time Transcription
Continuous Listening
import speech_recognition as sr
import threading
import queue
class RealtimeTranscriber:
def __init__(self):
self.recognizer = sr.Recognizer()
self.audio_queue = queue.Queue()
self.running = False
def listen(self):
with sr.Microphone() as source:
self.recognizer.adjust_for_ambient_noise(source)
print("Listening...")
while self.running:
try:
audio = self.recognizer.listen(source, timeout=1, phrase_time_limit=5)
self.audio_queue.put(audio)
except sr.WaitTimeoutError:
continue
def transcribe(self):
while self.running or not self.audio_queue.empty():
try:
audio = self.audio_queue.get(timeout=1)
text = self.recognizer.recognize_google(audio)
print(f">> {text}")
except queue.Empty:
continue
except sr.UnknownValueError:
pass
except sr.RequestError as e:
print(f"Error: {e}")
def start(self):
self.running = True
listen_thread = threading.Thread(target=self.listen)
transcribe_thread = threading.Thread(target=self.transcribe)
listen_thread.start()
transcribe_thread.start()
return listen_thread, transcribe_thread
def stop(self):
self.running = False
# Usage
transcriber = RealtimeTranscriber()
threads = transcriber.start()
input("Press Enter to stop...")
transcriber.stop()
for t in threads:
t.join()
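SpeechRecognition also has a built-in helper that handles the threading for you; listen_in_background starts a background listener and returns a function that stops it:
import time
import speech_recognition as sr

recognizer = sr.Recognizer()
microphone = sr.Microphone()

def on_speech(recognizer, audio):
    try:
        print(f">> {recognizer.recognize_google(audio)}")
    except sr.UnknownValueError:
        pass

with microphone as source:
    recognizer.adjust_for_ambient_noise(source)

# Returns a callable that stops the background thread
stop_listening = recognizer.listen_in_background(microphone, on_speech)
time.sleep(30)  # keep the main thread alive while listening
stop_listening(wait_for_stop=False)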
Streaming with Whisper
import pyaudio
import numpy as np
import whisper
import threading
import queue
class WhisperStreamer:
    def __init__(self, model_name: str = "base"):
        self.model = whisper.load_model(model_name)
        self.audio_queue = queue.Queue()
        self.sample_rate = 16000
        self.chunk_duration = 5  # seconds of audio per transcription chunk
        self.running = False
def record_audio(self):
p = pyaudio.PyAudio()
stream = p.open(
format=pyaudio.paFloat32,
channels=1,
rate=self.sample_rate,
input=True,
frames_per_buffer=1024
)
print("Recording...")
while self.running:
frames = []
for _ in range(int(self.sample_rate / 1024 * self.chunk_duration)):
if not self.running:
break
data = stream.read(1024, exception_on_overflow=False)
frames.append(np.frombuffer(data, dtype=np.float32))
if frames:
audio_data = np.concatenate(frames)
self.audio_queue.put(audio_data)
stream.stop_stream()
stream.close()
p.terminate()
def transcribe_stream(self):
while self.running or not self.audio_queue.empty():
try:
audio = self.audio_queue.get(timeout=1)
result = self.model.transcribe(audio, fp16=False)
if result["text"].strip():
print(f">> {result['text']}")
except queue.Empty:
continue
def start(self):
self.running = True
record_thread = threading.Thread(target=self.record_audio)
transcribe_thread = threading.Thread(target=self.transcribe_stream)
record_thread.start()
transcribe_thread.start()
return record_thread, transcribe_thread
def stop(self):
self.running = False
# Usage
streamer = WhisperStreamer("base")
threads = streamer.start()
input("Press Enter to stop...")
streamer.stop()
for t in threads:
    t.join()
Voice Commands
Command Recognition
import speech_recognition as sr
from typing import Callable, Dict
class VoiceCommandHandler:
def __init__(self):
self.recognizer = sr.Recognizer()
self.commands: Dict[str, Callable] = {}
def register_command(self, trigger: str, action: Callable):
self.commands[trigger.lower()] = action
def listen_and_execute(self):
with sr.Microphone() as source:
self.recognizer.adjust_for_ambient_noise(source)
print("Listening for commands...")
while True:
try:
audio = self.recognizer.listen(source, timeout=5)
text = self.recognizer.recognize_google(audio).lower()
print(f"Heard: {text}")
for trigger, action in self.commands.items():
if trigger in text:
action(text)
break
else:
print("Command not recognized")
except sr.WaitTimeoutError:
continue
except sr.UnknownValueError:
continue
# Define commands
def open_browser(text):
import webbrowser
webbrowser.open("https://google.com")
print("Opening browser...")
def tell_time(text):
from datetime import datetime
print(f"The time is {datetime.now().strftime('%H:%M')}")
def stop_listening(text):
    print("Goodbye!")
    raise SystemExit  # exits the listening loop cleanly
# Register and run
handler = VoiceCommandHandler()
handler.register_command("open browser", open_browser)
handler.register_command("what time", tell_time)
handler.register_command("stop listening", stop_listening)
handler.listen_and_execute()
Intent Recognition with NLP
import speech_recognition as sr
from transformers import pipeline
class SmartVoiceAssistant:
def __init__(self):
self.recognizer = sr.Recognizer()
self.classifier = pipeline(
"zero-shot-classification",
model="facebook/bart-large-mnli"
)
self.intents = [
"play music",
"set reminder",
"search web",
"send message",
"get weather",
"control lights"
]
def classify_intent(self, text: str) -> dict:
result = self.classifier(text, self.intents)
return {
"intent": result["labels"][0],
"confidence": result["scores"][0],
"text": text
}
def listen_and_classify(self):
with sr.Microphone() as source:
self.recognizer.adjust_for_ambient_noise(source)
print("Listening...")
audio = self.recognizer.listen(source)
text = self.recognizer.recognize_google(audio)
intent_result = self.classify_intent(text)
return intent_result
# Usage
assistant = SmartVoiceAssistant()
result = assistant.listen_and_classify()
print(f"Intent: {result['intent']} ({result['confidence']:.2%})")
print(f"Text: {result['text']}")
Audio Processing
Audio File Conversion
from pydub import AudioSegment
# Load audio
audio = AudioSegment.from_file("input.mp3")
# Convert format
audio.export("output.wav", format="wav")
# Change sample rate
audio = audio.set_frame_rate(16000)
# Convert to mono
audio = audio.set_channels(1)
# Normalize volume
from pydub.effects import normalize
audio = normalize(audio)
# Trim silence
from pydub.silence import split_on_silence
chunks = split_on_silence(
audio,
min_silence_len=500,
silence_thresh=-40
)
# Concatenate the non-silent chunks (AudioSegment overloads + for appending)
trimmed = sum(chunks)
trimmed.export("trimmed.wav", format="wav")
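pydub also slices by milliseconds and overloads + for gain, which keeps quick edits to one-liners:
from pydub import AudioSegment

audio = AudioSegment.from_file("input.mp3")

first_ten = audio[:10_000]  # slice indices are milliseconds
louder = first_ten + 6      # +6 dB gain
faded = louder.fade_in(500).fade_out(500)
faded.export("clip.wav", format="wav")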
Audio Enhancement
import numpy as np
from scipy import signal
from pydub import AudioSegment
def reduce_noise(audio_path: str, output_path: str):
# Load audio
audio = AudioSegment.from_file(audio_path)
samples = np.array(audio.get_array_of_samples())
    # Very crude noise reduction: estimate a noise level from the first
    # 0.5 seconds (assumed to be silence) and subtract its mean magnitude
    # from the spectrum; a toy demonstration, not true spectral subtraction
noise_sample = samples[:int(audio.frame_rate * 0.5)]
noise_profile = np.abs(np.fft.fft(noise_sample))
# Apply to full audio
audio_fft = np.fft.fft(samples)
audio_fft_clean = audio_fft - noise_profile.mean()
    # Inverse FFT back to the time domain (assumes 16-bit samples)
    cleaned = np.real(np.fft.ifft(audio_fft_clean)).astype(np.int16)
# Save
cleaned_audio = AudioSegment(
cleaned.tobytes(),
frame_rate=audio.frame_rate,
sample_width=audio.sample_width,
channels=audio.channels
)
cleaned_audio.export(output_path, format="wav")
def apply_bandpass_filter(audio_path: str, low: int = 300, high: int = 3400):
"""Apply bandpass filter for voice frequencies."""
audio = AudioSegment.from_file(audio_path)
samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
# Design bandpass filter
nyquist = audio.frame_rate / 2
low_norm = low / nyquist
high_norm = high / nyquist
b, a = signal.butter(4, [low_norm, high_norm], btype='band')
    filtered = signal.filtfilt(b, a, samples)
    return filtered.astype(np.int16)  # assumes 16-bit source audio
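For anything beyond a demonstration, a dedicated library handles this far better. A minimal sketch with the noisereduce package (assuming pip install noisereduce):
import noisereduce as nr
import numpy as np
from pydub import AudioSegment

audio = AudioSegment.from_file("noisy.wav").set_channels(1)
samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

# Stationary noise reduction over the whole clip
reduced = nr.reduce_noise(y=samples, sr=audio.frame_rate)

cleaned = AudioSegment(
    reduced.astype(np.int16).tobytes(),
    frame_rate=audio.frame_rate,
    sample_width=2,  # 16-bit output
    channels=1
)
cleaned.export("denoised.wav", format="wav")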
FastAPI Transcription Service
from fastapi import FastAPI, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import os
app = FastAPI()
model = whisper.load_model("base")
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile):
# Validate file type
allowed_types = ["audio/mpeg", "audio/wav", "audio/mp3", "audio/x-wav"]
if file.content_type not in allowed_types:
raise HTTPException(400, "Invalid file type")
# Save uploaded file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp:
content = await file.read()
temp.write(content)
temp_path = temp.name
try:
# Transcribe
result = model.transcribe(temp_path)
return JSONResponse({
"text": result["text"],
"language": result["language"],
"segments": [
{
"start": s["start"],
"end": s["end"],
"text": s["text"]
}
for s in result["segments"]
]
})
finally:
os.unlink(temp_path)
@app.post("/translate")
async def translate_audio(file: UploadFile):
    # Whisper's "translate" task always targets English
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp:
content = await file.read()
temp.write(content)
temp_path = temp.name
try:
result = model.transcribe(temp_path, task="translate")
return {"translated_text": result["text"]}
finally:
os.unlink(temp_path)
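A quick way to exercise the endpoint, assuming the service runs on localhost:8000 and requests is installed:
import requests

with open("audio.mp3", "rb") as f:
    response = requests.post(
        "http://localhost:8000/transcribe",
        files={"file": ("audio.mp3", f, "audio/mpeg")}
    )

print(response.json()["text"])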
Summary
| Library | Best For |
|---|---|
| SpeechRecognition | Quick prototypes, multiple APIs |
| Whisper | Accurate offline transcription |
| Whisper API | Hosted transcription, no model management |
| PyAudio | Real-time audio capture |
| Pydub | Audio file processing |
Speech recognition enables powerful voice interfaces and transcription services for modern applications.