Speech Recognition in Python: Complete Voice Processing Guide
Build speech recognition applications with Python. Learn speech-to-text, voice commands, audio processing with Whisper, SpeechRecognition, and real-time transcription.
Moshiour Rahman
Advertisement
What is Speech Recognition?
Speech recognition converts spoken language into text. Modern AI models like Whisper achieve near-human accuracy, enabling voice assistants, transcription services, and accessibility tools.
Applications
| Use Case | Examples |
|---|---|
| Voice Assistants | Siri, Alexa, Google Assistant |
| Transcription | Meeting notes, subtitles |
| Voice Commands | Smart home, car systems |
| Accessibility | Screen readers, dictation |
Getting Started
Installation
pip install SpeechRecognition pyaudio
pip install openai-whisper # For Whisper
pip install pydub # Audio processing
Basic Speech Recognition
import speech_recognition as sr
# Initialize recognizer
recognizer = sr.Recognizer()

# Recognize from microphone
audio = None
with sr.Microphone() as source:
    print("Adjusting for ambient noise...")
    recognizer.adjust_for_ambient_noise(source, duration=1)
    print("Speak now...")
    try:
        audio = recognizer.listen(source, timeout=5)
    except sr.WaitTimeoutError:
        # listen() raises instead of returning when no phrase starts
        # within `timeout` seconds; report it rather than crash.
        print("No speech detected")

# Only call the recognition service when something was actually captured.
if audio is not None:
    try:
        # Using Google's free API
        text = recognizer.recognize_google(audio)
        print(f"You said: {text}")
    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print(f"API error: {e}")
Recognize from Audio File
import speech_recognition as sr
recognizer = sr.Recognizer()

# Load audio file
with sr.AudioFile("audio.wav") as source:
    audio = recognizer.record(source)

# Recognize with different engines
try:
    # Google (free, requires internet)
    text = recognizer.recognize_google(audio)
    # Google Cloud (paid, more accurate)
    # text = recognizer.recognize_google_cloud(audio, credentials_json=CREDENTIALS)
    # Sphinx (offline, less accurate)
    # text = recognizer.recognize_sphinx(audio)
except sr.UnknownValueError:
    # Same failure handling as the microphone example: unintelligible audio
    # and service errors are reported instead of ending in a traceback.
    print("Could not understand audio")
except sr.RequestError as e:
    print(f"API error: {e}")
else:
    print(f"Transcription: {text}")
OpenAI Whisper
Local Whisper
import whisper
# Load model (tiny, base, small, medium, large)
model = whisper.load_model("base")

# Plain transcription: only the full text is printed.
result = model.transcribe("audio.mp3")
print(result["text"])

# Same file again, this time with explicit options.
transcribe_options = {
    "language": "en",
    "task": "transcribe",  # or "translate"
    "fp16": False,  # For CPU
    "verbose": True,
}
result = model.transcribe("audio.mp3", **transcribe_options)

# Each segment carries its own start/end timestamps (in seconds) and text.
for segment in result["segments"]:
    start, end = segment["start"], segment["end"]
    print(f"[{start:.2f}s - {end:.2f}s]: {segment['text']}")
Whisper API
from openai import OpenAI
client = OpenAI()

# Plain-text transcription
text_request = {"model": "whisper-1", "response_format": "text"}
with open("audio.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(file=audio_file, **text_request)
print(transcript)

# Verbose JSON transcription with word- and segment-level timestamps
timestamp_request = {
    "model": "whisper-1",
    "response_format": "verbose_json",
    "timestamp_granularities": ["word", "segment"],
}
with open("audio.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(file=audio_file, **timestamp_request)
for segment in transcript.segments:
    print(f"[{segment.start:.2f}s]: {segment.text}")

# Translate to English
with open("spanish_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(file=audio_file, model="whisper-1")
print(translation.text)
Real-Time Transcription
Continuous Listening
import speech_recognition as sr
import threading
import queue
class RealtimeTranscriber:
    """Continuously transcribe microphone input using two worker threads.

    One thread captures phrases from the microphone into ``audio_queue``;
    the other drains the queue, sends each phrase to Google's recognizer
    and prints the result.
    """

    def __init__(self):
        self.running = False
        self.audio_queue = queue.Queue()
        self.recognizer = sr.Recognizer()

    def listen(self):
        """Capture phrases and queue them until ``running`` becomes false."""
        with sr.Microphone() as mic:
            self.recognizer.adjust_for_ambient_noise(mic)
            print("Listening...")
            while self.running:
                try:
                    phrase = self.recognizer.listen(
                        mic, timeout=1, phrase_time_limit=5
                    )
                except sr.WaitTimeoutError:
                    # Nothing was said within a second; re-check the flag.
                    continue
                self.audio_queue.put(phrase)

    def transcribe(self):
        """Print the text of queued phrases.

        Keeps going while the transcriber is running, and afterwards until
        the queue has been emptied.
        """
        while self.running or not self.audio_queue.empty():
            try:
                phrase = self.audio_queue.get(timeout=1)
            except queue.Empty:
                continue
            try:
                heard = self.recognizer.recognize_google(phrase)
            except sr.UnknownValueError:
                # Unintelligible phrase: skipped without any output.
                continue
            except sr.RequestError as err:
                print(f"Error: {err}")
            else:
                print(f">> {heard}")

    def start(self):
        """Start both workers and return ``(listen_thread, transcribe_thread)``."""
        self.running = True
        workers = (
            threading.Thread(target=self.listen),
            threading.Thread(target=self.transcribe),
        )
        for worker in workers:
            worker.start()
        return workers

    def stop(self):
        """Ask both workers to finish by clearing the ``running`` flag."""
        self.running = False
# Usage: run until the user presses Enter, then wait for both workers.
transcriber = RealtimeTranscriber()
listen_thread, transcribe_thread = transcriber.start()
input("Press Enter to stop...")
transcriber.stop()
listen_thread.join()
transcribe_thread.join()
Streaming with Whisper
import pyaudio
import numpy as np
import whisper
import threading
import queue
class WhisperStreamer:
    """Record microphone audio in fixed-length chunks and transcribe each chunk.

    A recorder thread reads float32 mono audio from PyAudio and queues one
    NumPy array per chunk; a second thread passes each array to the Whisper
    model and prints any non-empty text.
    """

    def __init__(self, model_name: str = "base"):
        """Load the Whisper model called *model_name* and set recording defaults."""
        self.model = whisper.load_model(model_name)
        self.audio_queue = queue.Queue()
        self.sample_rate = 16000
        self.chunk_duration = 5  # seconds
        # Defined up front so stop()/record_audio() are safe before start().
        self.running = False

    def record_audio(self):
        """Queue ``chunk_duration``-second audio arrays while ``running`` is true.

        The PyAudio stream and instance are released even if reading fails.
        """
        frames_per_buffer = 1024
        p = pyaudio.PyAudio()
        try:
            stream = p.open(
                format=pyaudio.paFloat32,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=frames_per_buffer,
            )
            try:
                print("Recording...")
                while self.running:
                    frames = []
                    buffers_per_chunk = int(
                        self.sample_rate / frames_per_buffer * self.chunk_duration
                    )
                    for _ in range(buffers_per_chunk):
                        if not self.running:
                            break
                        data = stream.read(
                            frames_per_buffer, exception_on_overflow=False
                        )
                        frames.append(np.frombuffer(data, dtype=np.float32))
                    # A partial chunk recorded before stop() is still queued.
                    if frames:
                        self.audio_queue.put(np.concatenate(frames))
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()

    def transcribe_stream(self):
        """Transcribe queued chunks until stopped and the queue is empty."""
        while self.running or not self.audio_queue.empty():
            try:
                audio = self.audio_queue.get(timeout=1)
            except queue.Empty:
                continue
            result = self.model.transcribe(audio, fp16=False)
            if result["text"].strip():
                print(f">> {result['text']}")

    def start(self):
        """Start both workers and return ``(record_thread, transcribe_thread)``."""
        self.running = True
        record_thread = threading.Thread(target=self.record_audio)
        transcribe_thread = threading.Thread(target=self.transcribe_stream)
        record_thread.start()
        transcribe_thread.start()
        return record_thread, transcribe_thread

    def stop(self):
        """Signal both workers to finish."""
        self.running = False
# Usage
streamer = WhisperStreamer("base")
threads = streamer.start()
input("Press Enter to stop...")
streamer.stop()
# Wait for the recorder to release the audio device and for the
# transcriber to drain the queue before the script ends.
for t in threads:
    t.join()
Voice Commands
Command Recognition
import speech_recognition as sr
from typing import Callable, Dict
class VoiceCommandHandler:
    """Map spoken trigger phrases to callables and run them on recognition."""

    def __init__(self):
        self.recognizer = sr.Recognizer()
        # Lower-cased trigger phrase -> action called with the recognized text.
        self.commands: Dict[str, Callable] = {}

    def register_command(self, trigger: str, action: Callable):
        """Register *action* for *trigger*; matching is case-insensitive."""
        self.commands[trigger.lower()] = action

    def _dispatch(self, text: str) -> bool:
        """Run the first registered action whose trigger occurs in *text*.

        Returns True when an action ran, False when nothing matched.
        """
        for trigger, action in self.commands.items():
            if trigger in text:
                action(text)
                return True
        return False

    def listen_and_execute(self):
        """Listen on the microphone forever and execute matching commands.

        The loop has no exit of its own; it ends only when an action raises
        (for example ``SystemExit``). Timeouts, unintelligible audio and
        recognition-service errors are skipped so listening continues.
        """
        with sr.Microphone() as source:
            self.recognizer.adjust_for_ambient_noise(source)
            print("Listening for commands...")
            while True:
                try:
                    audio = self.recognizer.listen(source, timeout=5)
                    text = self.recognizer.recognize_google(audio).lower()
                except sr.WaitTimeoutError:
                    continue
                except sr.UnknownValueError:
                    continue
                except sr.RequestError as e:
                    # A failed API call must not end the listening loop.
                    print(f"API error: {e}")
                    continue
                print(f"Heard: {text}")
                if not self._dispatch(text):
                    print("Command not recognized")
# Define commands
def open_browser(text):
    """Open Google in the web browser; the recognized *text* is not used."""
    from webbrowser import open as open_url

    open_url("https://google.com")
    print("Opening browser...")
def tell_time(text):
    """Print the current local time as HH:MM; *text* is not used."""
    import datetime as dt

    current = dt.datetime.now()
    print(f"The time is {current:%H:%M}")
def stop_listening(text):
    """Say goodbye and terminate the program; *text* is not used.

    Raises:
        SystemExit: always, to end the process.
    """
    print("Goodbye!")
    # exit() is injected by the site module for interactive sessions and may
    # be absent (e.g. ``python -S``); raising SystemExit works everywhere.
    raise SystemExit
# Register each trigger phrase with its action, then listen indefinitely.
handler = VoiceCommandHandler()
for trigger, action in (
    ("open browser", open_browser),
    ("what time", tell_time),
    ("stop listening", stop_listening),
):
    handler.register_command(trigger, action)
handler.listen_and_execute()
Intent Recognition with NLP
import speech_recognition as sr
from transformers import pipeline
class SmartVoiceAssistant:
    """Recognize one spoken utterance and label it with a candidate intent."""

    def __init__(self):
        self.recognizer = sr.Recognizer()
        self.classifier = pipeline(
            "zero-shot-classification", model="facebook/bart-large-mnli"
        )
        # Candidate labels handed to the classifier for every utterance.
        self.intents = [
            "play music",
            "set reminder",
            "search web",
            "send message",
            "get weather",
            "control lights",
        ]

    def classify_intent(self, text: str) -> dict:
        """Classify *text* against ``self.intents``.

        Returns a dict with the first label and score reported by the
        classifier (``intent``, ``confidence``) plus the original ``text``.
        """
        ranked = self.classifier(text, self.intents)
        top_label = ranked["labels"][0]
        top_score = ranked["scores"][0]
        return {"intent": top_label, "confidence": top_score, "text": text}

    def listen_and_classify(self):
        """Capture one phrase from the microphone and return its intent dict."""
        with sr.Microphone() as mic:
            self.recognizer.adjust_for_ambient_noise(mic)
            print("Listening...")
            captured = self.recognizer.listen(mic)
        spoken = self.recognizer.recognize_google(captured)
        return self.classify_intent(spoken)
# Usage: classify a single utterance and report the outcome.
assistant = SmartVoiceAssistant()
result = assistant.listen_and_classify()
intent, confidence = result["intent"], result["confidence"]
print(f"Intent: {intent} ({confidence:.2%})")
print(f"Text: {result['text']}")
Audio Processing
Audio File Conversion
from pydub import AudioSegment
from pydub.effects import normalize
from pydub.silence import split_on_silence

# Load audio
audio = AudioSegment.from_file("input.mp3")
# Convert format
audio.export("output.wav", format="wav")
# Change sample rate
audio = audio.set_frame_rate(16000)
# Convert to mono
audio = audio.set_channels(1)
# Normalize volume
audio = normalize(audio)
# Trim silence
chunks = split_on_silence(
    audio,
    min_silence_len=500,
    silence_thresh=-40
)
# Concatenate non-silent parts. Seeding with an empty segment keeps the
# result an AudioSegment even when no chunk was found; a bare sum(chunks)
# would return the integer 0 and the export below would fail.
trimmed = sum(chunks, AudioSegment.empty())
trimmed.export("trimmed.wav", format="wav")
Audio Enhancement
import numpy as np
from scipy import signal
from pydub import AudioSegment
def _spectral_subtract(frames: np.ndarray, noise_frames: int) -> np.ndarray:
    """Subtract a flat noise floor from the magnitude spectrum of *frames*.

    *frames* is a float array shaped ``(n_frames, n_channels)``; the first
    *noise_frames* rows are used as the noise estimate. Phase is kept.
    """
    total = frames.shape[0]
    spectrum = np.fft.rfft(frames, axis=0)
    noise_magnitude = np.abs(np.fft.rfft(frames[:noise_frames], axis=0))
    # The noise excerpt is shorter than the full signal; for broadband noise
    # FFT magnitudes grow roughly with sqrt(length), so rescale the estimate.
    noise_floor = noise_magnitude.mean(axis=0, keepdims=True) * np.sqrt(
        total / noise_frames
    )
    magnitude = np.abs(spectrum)
    reduced = np.maximum(magnitude - noise_floor, 0.0)
    # Express the reduction as a real gain per bin so the phase is untouched.
    gain = np.divide(
        reduced, magnitude, out=np.zeros_like(magnitude), where=magnitude > 0
    )
    return np.fft.irfft(spectrum * gain, n=total, axis=0)


def reduce_noise(audio_path: str, output_path: str):
    """Write a noise-reduced WAV copy of *audio_path* to *output_path*.

    Uses simple spectral subtraction: the first 0.5 seconds are treated as
    noise only, their mean spectral magnitude is taken as the noise floor,
    and that floor is subtracted from the magnitude of every frequency bin
    (clamped at zero) while the original phase is preserved.

    Subtracting the floor directly from the complex FFT values, as a naive
    implementation might, only shifts the first time-domain sample and
    leaves the noise in place.
    """
    # Load audio
    audio = AudioSegment.from_file(audio_path)
    samples = np.array(audio.get_array_of_samples())
    if samples.size == 0:
        # Nothing to process; still produce the requested output file.
        audio.export(output_path, format="wav")
        return

    # pydub documents multi-channel samples as interleaved (L, R, L, R, ...);
    # give each channel its own column so channels are processed separately.
    frames = samples.reshape(-1, audio.channels).astype(np.float64)

    # Estimate noise from first 0.5 seconds (capped at the clip length)
    noise_frames = min(max(int(audio.frame_rate * 0.5), 1), frames.shape[0])
    cleaned = _spectral_subtract(frames, noise_frames)

    # Round and clip to the source sample type so the cast cannot wrap and
    # the byte width still matches audio.sample_width.
    limits = np.iinfo(samples.dtype)
    cleaned = np.clip(np.rint(cleaned), limits.min, limits.max).astype(samples.dtype)

    # Save (row-major bytes restore the interleaved channel layout)
    cleaned_audio = AudioSegment(
        cleaned.tobytes(),
        frame_rate=audio.frame_rate,
        sample_width=audio.sample_width,
        channels=audio.channels
    )
    cleaned_audio.export(output_path, format="wav")
def apply_bandpass_filter(audio_path: str, low: int = 300, high: int = 3400):
    """Apply bandpass filter for voice frequencies.

    Args:
        audio_path: Audio file to load.
        low: Lower cutoff frequency in Hz.
        high: Upper cutoff frequency in Hz.

    Returns:
        A one-dimensional ``np.int16`` array of filtered samples, in the
        same (interleaved) order as the input samples.

    Raises:
        ValueError: if the band does not satisfy ``0 < low < high < Nyquist``.
    """
    audio = AudioSegment.from_file(audio_path)
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

    # Design bandpass filter
    nyquist = audio.frame_rate / 2
    if not 0 < low < high < nyquist:
        raise ValueError(
            f"Band must satisfy 0 < low < high < {nyquist:g} Hz "
            f"(got low={low}, high={high})"
        )
    b, a = signal.butter(4, [low / nyquist, high / nyquist], btype='band')

    # pydub documents multi-channel samples as interleaved; filter each
    # channel as its own column instead of as one mixed-up signal.
    frames = samples.reshape(-1, audio.channels)
    filtered = signal.filtfilt(b, a, frames, axis=0)

    # Filter overshoot can exceed the int16 range; clip so the cast cannot
    # wrap around. NOTE(review): the int16 return type assumes 16-bit source
    # audio — confirm for other sample widths.
    limits = np.iinfo(np.int16)
    clipped = np.clip(filtered, limits.min, limits.max)
    return clipped.astype(np.int16).reshape(-1)
FastAPI Transcription Service
from fastapi import FastAPI, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import os
app = FastAPI()
model = whisper.load_model("base")
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile):
    """Transcribe an uploaded audio file with the module-level Whisper model.

    Responds with the full text, the language reported by the model and the
    per-segment start/end/text entries. Uploads whose content type is not
    one of the accepted audio types are rejected with HTTP 400.
    """
    # Validate file type
    if file.content_type not in (
        "audio/mpeg",
        "audio/wav",
        "audio/mp3",
        "audio/x-wav",
    ):
        raise HTTPException(400, "Invalid file type")

    # The model is given a path, so the upload is written to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        payload = await file.read()
        tmp.write(payload)
        saved_path = tmp.name

    try:
        outcome = model.transcribe(saved_path)
        segments = []
        for seg in outcome["segments"]:
            segments.append(
                {"start": seg["start"], "end": seg["end"], "text": seg["text"]}
            )
        body = {
            "text": outcome["text"],
            "language": outcome["language"],
            "segments": segments,
        }
        return JSONResponse(body)
    finally:
        # delete=False above means removal is this function's job.
        os.unlink(saved_path)
@app.post("/translate")
async def translate_audio(file: UploadFile, target_language: str = "en"):
    """Translate the speech in an uploaded audio file into English.

    Args:
        file: The uploaded audio.
        target_language: Must be English ("en" or "english", any case). The
            model is called with ``task="translate"``, which Whisper's
            documentation describes as translation into English, so any
            other target is rejected rather than silently ignored.

    Returns:
        ``{"translated_text": ...}`` with the model's text output.

    Raises:
        HTTPException: 400 when *target_language* is not English.
    """
    if target_language.strip().lower() not in ("en", "english"):
        raise HTTPException(
            400, "Only translation into English (target_language='en') is supported"
        )

    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp:
            # Record the path first so the file is removed even if reading
            # or writing the upload fails.
            temp_path = temp.name
            temp.write(await file.read())
        result = model.transcribe(temp_path, task="translate")
        return {"translated_text": result["text"]}
    finally:
        if temp_path is not None:
            os.unlink(temp_path)
Summary
| Library | Best For |
|---|---|
| SpeechRecognition | Quick prototypes, multiple APIs |
| Whisper | Accurate offline transcription |
| Whisper API | Production, low latency |
| PyAudio | Real-time audio capture |
| Pydub | Audio file processing |
Speech recognition enables powerful voice interfaces and transcription services for modern applications.
Advertisement
Moshiour Rahman
Software Architect & AI Engineer
Enterprise software architect with deep expertise in financial systems, distributed architecture, and AI-powered applications. Building large-scale systems at Fortune 500 companies. Specializing in LLM orchestration, multi-agent systems, and cloud-native solutions. I share battle-tested patterns from real enterprise projects.
Related Articles
Getting Started with Machine Learning in Python: A Practical Guide
Learn machine learning fundamentals with Python. Build your first ML models using scikit-learn with hands-on examples for classification, regression, and real-world predictions.
Python | LangChain Tutorial: Build AI Applications with Python
Master LangChain for building LLM-powered applications. Learn chains, agents, memory, RAG, and integrate with OpenAI, HuggingFace, and vector databases.
Python | Deep Learning with PyTorch: Complete Beginner's Guide
Learn deep learning with PyTorch from scratch. Build neural networks, CNNs, RNNs, and train models for image classification and NLP tasks.
Comments
Comments are powered by GitHub Discussions.
Configure Giscus at giscus.app to enable comments.