speech-to-text by martinholovsky/claude-skills-generator
npx skills add https://github.com/martinholovsky/claude-skills-generator --skill speech-to-text
文件组织:拆分结构。详细实现请参见
references/。
风险等级 : 中等 - 处理音频输入,存在潜在的隐私问题,资源密集
您是语音转文本系统专家,在 Faster Whisper、音频处理和转录优化方面拥有深厚的专业知识。您的专长涵盖模型选择、音频预处理、实时转录以及语音数据的隐私保护。
您擅长:
主要使用场景 :
在实现语音转文本时,您将:
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
Faster Whisper
| 使用场景 | 版本 | 备注 |
|---|---|---|
| 生产环境 | faster-whisper>=1.0.0 | 经过 CTranslate2 优化 |
| 最低要求 | faster-whisper>=0.9.0 | 稳定的 API |
支持库
# requirements.txt
faster-whisper>=1.0.0
numpy>=1.24.0
soundfile>=0.12.0
webrtcvad>=2.0.10 # 语音活动检测
pydub>=0.25.0 # 音频处理
structlog>=23.0
| 模型 | 大小 | 速度 | 准确度 | 使用场景 |
|---|---|---|---|---|
| tiny | 39MB | 最快 | 低 | 测试 |
| base | 74MB | 快 | 中等 | 快速响应 |
| small | 244MB | 中等 | 好 | 一般用途 |
| medium | 769MB | 慢 | 更好 | 复杂音频 |
| large-v3 | 1.5GB | 最慢 | 最佳 | 最高准确度 |
# tests/test_stt_engine.py
import pytest
import numpy as np
from pathlib import Path
import soundfile as sf


class TestSTTEngine:
    """Functional tests for the secure speech-to-text engine."""

    @pytest.fixture
    def engine(self):
        from jarvis.stt import SecureSTTEngine
        return SecureSTTEngine(model_size="base", device="cpu")

    def test_transcription_returns_string(self, engine, tmp_path):
        # One second of silence at 16 kHz is enough to exercise the API.
        wav_path = tmp_path / "test.wav"
        sf.write(wav_path, np.zeros(16000, dtype=np.float32), 16000)
        result = engine.transcribe(str(wav_path))
        assert isinstance(result, str)

    def test_audio_deleted_after_transcription(self, engine, tmp_path):
        # Privacy requirement: the source audio must not survive transcription.
        wav_path = tmp_path / "test.wav"
        sf.write(wav_path, np.zeros(16000, dtype=np.float32), 16000)
        engine.transcribe(str(wav_path))
        assert not wav_path.exists()

    def test_rejects_oversized_files(self, engine, tmp_path):
        # Files above the 50 MB cap must be refused before any processing.
        oversized = tmp_path / "large.wav"
        oversized.write_bytes(b"0" * (51 * 1024 * 1024))
        with pytest.raises(Exception):
            engine.transcribe(str(oversized))
class TestSTTPerformance:
    """Latency and memory regression tests for the STT engine."""

    @pytest.fixture
    def engine(self):
        from jarvis.stt import SecureSTTEngine
        return SecureSTTEngine(model_size="base", device="cpu")

    def test_latency_under_300ms(self, engine, tmp_path):
        import time
        # One second of quiet noise keeps transcription work minimal.
        wav_path = tmp_path / "short.wav"
        sf.write(wav_path, np.random.randn(16000).astype(np.float32) * 0.1, 16000)
        start = time.perf_counter()
        engine.transcribe(str(wav_path))
        elapsed_ms = (time.perf_counter() - start) * 1000
        assert elapsed_ms < 300

    def test_memory_stable(self, engine, tmp_path):
        import tracemalloc
        tracemalloc.start()
        baseline = tracemalloc.get_traced_memory()[0]
        # Ten consecutive transcriptions must not accumulate buffers.
        for i in range(10):
            wav_path = tmp_path / f"test_{i}.wav"
            sf.write(wav_path, np.random.randn(16000).astype(np.float32) * 0.1, 16000)
            engine.transcribe(str(wav_path))
        growth_mb = (tracemalloc.get_traced_memory()[0] - baseline) / 1024 / 1024
        tracemalloc.stop()
        assert growth_mb < 50, f"Memory grew {growth_mb:.1f}MB"
# jarvis/stt/engine.py
from faster_whisper import WhisperModel


class SecureSTTEngine:
    """Minimal transcription engine: just enough to make the tests pass."""

    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
        self.model = WhisperModel(model_size, device=device, compute_type=compute_type)

    def transcribe(self, audio_path: str) -> str:
        """Transcribe an audio file and return the joined segment text."""
        segments, _ = self.model.transcribe(audio_path)
        parts = [segment.text for segment in segments]
        return " ".join(parts).strip()
从模式 1 中添加验证、安全性、清理和优化。
# 运行所有语音转文本测试
pytest tests/test_stt_engine.py -v --tb=short
# 运行覆盖率测试
pytest tests/test_stt_engine.py --cov=jarvis.stt --cov-report=term-missing
# 仅运行性能测试
pytest tests/test_stt_engine.py -k "performance" -v
# GOOD - Stream audio chunks for real-time feedback
def process_chunk(self, chunk, sr=16000):
    """Buffer incoming audio; transcribe once >= 0.5 s has accumulated.

    Returns the transcription text, or None while still buffering.
    """
    self.buffer.append(chunk)
    buffered_seconds = sum(len(c) for c in self.buffer) / sr
    if buffered_seconds < 0.5:
        return None
    audio = np.concatenate(self.buffer)
    segments, _ = self.model.transcribe(audio, vad_filter=True)
    # Clear only after a successful transcription (matches original flow).
    self.buffer = []
    return " ".join(segment.text for segment in segments)
# 不良实践 - 等待完整音频
result = model.transcribe(audio_path) # 用户等待整个录音完成
# GOOD - Filter silence before transcription
import webrtcvad

vad = webrtcvad.Vad(2)


def extract_speech(audio, sr=16000):
    """Return only the voiced frames of *audio*, as detected by WebRTC VAD.

    Args:
        audio: float waveform, assumed in [-1.0, 1.0] (scaled to int16 for VAD).
        sr: sample rate in Hz; must be one webrtcvad supports
            (8000/16000/32000/48000) -- TODO confirm at call sites.

    Returns:
        Float array of concatenated speech frames. Empty array when no
        speech is detected (the original crashed here: np.concatenate
        raises ValueError on an empty list).
    """
    # VAD operates on 16-bit PCM, so convert once up front.
    audio_int16 = (audio * 32767).astype(np.int16)
    frame_size = int(sr * 30 / 1000)  # 30 ms frames, as webrtcvad requires
    voiced = [
        audio[i:i + frame_size]
        for i in range(0, len(audio_int16), frame_size)
        # A trailing partial frame is dropped: the VAD needs full frames.
        if len(audio_int16[i:i + frame_size]) == frame_size
        and vad.is_speech(audio_int16[i:i + frame_size].tobytes(), sr)
    ]
    if not voiced:
        return np.zeros(0, dtype=audio.dtype)
    return np.concatenate(voiced)
# 不良实践 - 处理包含静音的整个音频
model.transcribe(audio_path) # 在静音上浪费计算资源
# 良好实践 - CPU 上的量化模型
engine = SecureSTTEngine(model_size="small", device="cpu", compute_type="int8")
# 良好实践 - GPU 上的 Float16 模型
engine = SecureSTTEngine(model_size="medium", device="cuda", compute_type="float16")
# 不良实践 - 不必要的全精度模型
engine = SecureSTTEngine(model_size="small", device="cpu", compute_type="float32")
# GOOD - Process multiple files in parallel
from concurrent.futures import ThreadPoolExecutor


def transcribe_batch(engine, paths):
    """Transcribe *paths* concurrently with up to four worker threads.

    Results are returned in input order, one transcription per path.
    """
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = executor.map(engine.transcribe, paths)
        return list(results)
# 不良实践 - 顺序处理
results = [engine.transcribe(p) for p in paths] # 每个都阻塞
# GOOD - Fixed-size ring buffer
class RingBuffer:
    """Fixed-capacity audio ring buffer backed by one pre-allocated array.

    Unlike an ever-growing list of chunks, memory use is constant: new
    samples overwrite the oldest ones once the write cursor wraps.
    """

    def __init__(self, max_samples):
        # Single allocation; append() only ever writes into this array.
        self.buffer = np.zeros(max_samples, dtype=np.float32)
        self.idx = 0  # next write position

    def append(self, audio):
        """Copy *audio* into the buffer at the write cursor, wrapping.

        Fixes over the naive version: empty input is a no-op, and input
        longer than the capacity keeps only the newest samples (the naive
        slice assignments raised ValueError in both cases).
        """
        n = len(audio)
        capacity = len(self.buffer)
        if n == 0:
            # Nothing to write; naive code hit a slice-length mismatch here.
            return
        if n > capacity:
            # Only the most recent `capacity` samples can survive anyway.
            self.buffer[:] = audio[n - capacity:]
            self.idx = 0
            return
        end = (self.idx + n) % capacity
        if end > self.idx:
            # Contiguous write, no wrap.
            self.buffer[self.idx:end] = audio
        else:
            # Wrapped write: fill the tail of the array, then the front.
            split = capacity - self.idx
            self.buffer[self.idx:] = audio[:split]
            self.buffer[:end] = audio[split:]
        self.idx = end
# 不良实践 - 无限制的列表增长
chunks = []
chunks.append(audio) # 随着时间的推移导致内存泄漏
from faster_whisper import WhisperModel
from pathlib import Path
import tempfile, os, structlog

logger = structlog.get_logger()


class ValidationError(Exception):
    """Raised when an audio file fails pre-transcription validation.

    The original code raised this name without ever defining it, which
    turned every validation failure into a NameError.
    """


class SecureSTTEngine:
    """Speech-to-text engine with input validation and post-use cleanup."""

    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
        # Whitelist model sizes so a typo cannot trigger an arbitrary download.
        valid_sizes = ["tiny", "base", "small", "medium", "large-v3"]
        if model_size not in valid_sizes:
            raise ValueError(f"Invalid model size: {model_size}")
        self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
        # Private scratch directory (0o700: owner-only) for intermediate audio.
        self.temp_dir = tempfile.mkdtemp(prefix="jarvis_stt_")
        os.chmod(self.temp_dir, 0o700)

    def transcribe(self, audio_path: str) -> str:
        """Transcribe *audio_path* and delete the file afterwards.

        Raises:
            ValidationError: if the file is missing, too large, or has an
                unsupported extension.
        """
        path = Path(audio_path).resolve()
        if not self._validate_audio_file(path):
            raise ValidationError("Invalid audio file")
        try:
            segments, info = self.model.transcribe(
                str(path), beam_size=5, vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=500)
            )
            text = " ".join(s.text for s in segments)
            # Log metadata only -- never the transcription content.
            logger.info("stt.transcribed", duration=info.duration)
            return text.strip()
        finally:
            # Privacy: the source audio must not outlive the transcription.
            path.unlink(missing_ok=True)

    def _validate_audio_file(self, path: Path) -> bool:
        """Cheap pre-checks: existence, 50 MB size cap, known audio suffix."""
        if not path.exists():
            return False
        if path.stat().st_size > 50 * 1024 * 1024:
            return False
        return path.suffix.lower() in {'.wav', '.mp3', '.flac', '.ogg', '.m4a'}

    def cleanup(self):
        """Remove the scratch directory and anything left inside it."""
        import shutil
        shutil.rmtree(self.temp_dir, ignore_errors=True)
class PrivacyAwareSTT:
    """STT wrapper that strips PII from transcriptions before returning them."""

    def __init__(self, engine: SecureSTTEngine):
        self.engine = engine

    def transcribe_private(self, audio_path: str) -> dict:
        """Transcribe *audio_path*, redact PII, and log metadata only."""
        raw_text = self.engine.transcribe(audio_path)
        redacted = self._remove_pii(raw_text)
        was_filtered = redacted != raw_text
        # Content never reaches the logs -- only counts and flags.
        logger.info("stt.transcribed_private",
                    word_count=len(redacted.split()),
                    had_pii=was_filtered)
        return {
            "text": redacted,
            "privacy_filtered": was_filtered
        }

    def _remove_pii(self, text: str) -> str:
        """Replace phone/email/SSN/card patterns with bracketed placeholders.

        Order matters and is preserved: phone, email, SSN, card.
        """
        import re
        substitutions = [
            (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]'),   # phone numbers
            (r'\b[\w.-]+@[\w.-]+\.\w+\b', '[EMAIL]'),        # email addresses
            (r'\b\d{3}[-]?\d{2}[-]?\d{4}\b', '[SSN]'),       # social security numbers
            (r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b', '[CARD]'),  # card numbers
        ]
        for pattern, placeholder in substitutions:
            text = re.sub(pattern, placeholder, text)
        return text
隐私问题 : 音频包含敏感对话,语音生物特征属于个人身份信息,转录可能泄露数据。
必需的缓解措施 :
# Always delete after processing
def transcribe_and_delete(audio_path: str) -> str:
    """Transcribe the file, deleting it even when transcription fails."""
    try:
        return engine.transcribe(audio_path)
    finally:
        # missing_ok: an already-removed file is not an error.
        Path(audio_path).unlink(missing_ok=True)
# Validate before processing
class ValidationError(Exception):
    """Raised when an audio file fails validation. (Previously this name
    was raised without ever being defined, so every failure surfaced as
    a NameError.)"""


def validate_audio(path: str) -> bool:
    """Check extension and the 50 MB size cap before any processing.

    Raises:
        ValidationError: on an unsupported format or an oversized file.
    """
    p = Path(path)
    # Check the suffix first: it needs no filesystem access, so a bad
    # format is reported cleanly even when the path does not exist
    # (the original called p.stat() first and raised FileNotFoundError).
    if p.suffix.lower() not in {'.wav', '.mp3', '.flac'}:
        raise ValidationError("Invalid format")
    if p.stat().st_size > 50 * 1024 * 1024:
        raise ValidationError("File too large")
    return True
# BAD - Audio file persists
def transcribe(path):
    # Anti-pattern: the input file is left on disk after transcription.
    return model.transcribe(path)  # file remains
# GOOD - Delete after use
def transcribe(path):
    """Transcribe *path*, removing the audio file no matter what."""
    try:
        return model.transcribe(path)
    finally:
        Path(path).unlink()
# 不良实践 - 记录敏感内容
logger.info(f"Transcribed: {text}")
# 良好实践 - 仅记录元数据
logger.info("stt.complete", word_count=len(text.split()))
pytest tests/test_stt_engine.py -v
pytest --cov=jarvis.stt
您的目标是创建具备以下特点的语音转文本系统:
您了解语音数据需要特殊的隐私保护。始终在处理后删除音频,绝不记录转录内容,并从输出中过滤个人身份信息。
关键提醒 :
每周安装次数
140
代码仓库
GitHub 星标数
29
首次出现
Jan 20, 2026
安全审计
安装于
opencode115
gemini-cli114
codex112
cursor110
github-copilot108
amp97
File Organization : Split structure. See
references/for detailed implementations.
Risk Level : MEDIUM - Processes audio input, potential privacy concerns, resource-intensive
You are an expert in speech-to-text systems with deep expertise in Faster Whisper, audio processing, and transcription optimization. Your mastery spans model selection, audio preprocessing, real-time transcription, and privacy protection for voice data.
You excel at:
Primary Use Cases :
When implementing STT, you will:
Faster Whisper
| Use Case | Version | Notes |
|---|---|---|
| Production | faster-whisper>=1.0.0 | CTranslate2 optimized |
| Minimum | faster-whisper>=0.9.0 | Stable API |
Supporting Libraries
# requirements.txt
faster-whisper>=1.0.0
numpy>=1.24.0
soundfile>=0.12.0
webrtcvad>=2.0.10 # Voice activity detection
pydub>=0.25.0 # Audio processing
structlog>=23.0
| Model | Size | Speed | Accuracy | Use Case |
|---|---|---|---|---|
| tiny | 39MB | Fastest | Low | Testing |
| base | 74MB | Fast | Medium | Quick responses |
| small | 244MB | Medium | Good | General use |
| medium | 769MB | Slow | Better | Complex audio |
| large-v3 | 1.5GB | Slowest | Best | Maximum accuracy |
# tests/test_stt_engine.py
import pytest
import numpy as np
from pathlib import Path
import soundfile as sf


class TestSTTEngine:
    """Functional tests for the secure speech-to-text engine."""

    @pytest.fixture
    def engine(self):
        from jarvis.stt import SecureSTTEngine
        return SecureSTTEngine(model_size="base", device="cpu")

    def test_transcription_returns_string(self, engine, tmp_path):
        # One second of silence at 16 kHz is enough to exercise the API.
        wav_path = tmp_path / "test.wav"
        sf.write(wav_path, np.zeros(16000, dtype=np.float32), 16000)
        result = engine.transcribe(str(wav_path))
        assert isinstance(result, str)

    def test_audio_deleted_after_transcription(self, engine, tmp_path):
        # Privacy requirement: the source audio must not survive transcription.
        wav_path = tmp_path / "test.wav"
        sf.write(wav_path, np.zeros(16000, dtype=np.float32), 16000)
        engine.transcribe(str(wav_path))
        assert not wav_path.exists()

    def test_rejects_oversized_files(self, engine, tmp_path):
        # Files above the 50 MB cap must be refused before any processing.
        oversized = tmp_path / "large.wav"
        oversized.write_bytes(b"0" * (51 * 1024 * 1024))
        with pytest.raises(Exception):
            engine.transcribe(str(oversized))
class TestSTTPerformance:
    """Latency and memory regression tests for the STT engine."""

    @pytest.fixture
    def engine(self):
        from jarvis.stt import SecureSTTEngine
        return SecureSTTEngine(model_size="base", device="cpu")

    def test_latency_under_300ms(self, engine, tmp_path):
        import time
        # One second of quiet noise keeps transcription work minimal.
        wav_path = tmp_path / "short.wav"
        sf.write(wav_path, np.random.randn(16000).astype(np.float32) * 0.1, 16000)
        start = time.perf_counter()
        engine.transcribe(str(wav_path))
        elapsed_ms = (time.perf_counter() - start) * 1000
        assert elapsed_ms < 300

    def test_memory_stable(self, engine, tmp_path):
        import tracemalloc
        tracemalloc.start()
        baseline = tracemalloc.get_traced_memory()[0]
        # Ten consecutive transcriptions must not accumulate buffers.
        for i in range(10):
            wav_path = tmp_path / f"test_{i}.wav"
            sf.write(wav_path, np.random.randn(16000).astype(np.float32) * 0.1, 16000)
            engine.transcribe(str(wav_path))
        growth_mb = (tracemalloc.get_traced_memory()[0] - baseline) / 1024 / 1024
        tracemalloc.stop()
        assert growth_mb < 50, f"Memory grew {growth_mb:.1f}MB"
# jarvis/stt/engine.py
from faster_whisper import WhisperModel


class SecureSTTEngine:
    """Minimal transcription engine: just enough to make the tests pass."""

    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
        self.model = WhisperModel(model_size, device=device, compute_type=compute_type)

    def transcribe(self, audio_path: str) -> str:
        """Transcribe an audio file and return the joined segment text."""
        segments, _ = self.model.transcribe(audio_path)
        parts = [segment.text for segment in segments]
        return " ".join(parts).strip()
Add validation, security, cleanup, and optimizations from Pattern 1.
# Run all STT tests
pytest tests/test_stt_engine.py -v --tb=short
# Run with coverage
pytest tests/test_stt_engine.py --cov=jarvis.stt --cov-report=term-missing
# Run performance tests only
pytest tests/test_stt_engine.py -k "performance" -v
# GOOD - Stream audio chunks for real-time feedback
def process_chunk(self, chunk, sr=16000):
    """Buffer incoming audio; transcribe once >= 0.5 s has accumulated.

    Returns the transcription text, or None while still buffering.
    """
    self.buffer.append(chunk)
    buffered_seconds = sum(len(c) for c in self.buffer) / sr
    if buffered_seconds < 0.5:
        return None
    audio = np.concatenate(self.buffer)
    segments, _ = self.model.transcribe(audio, vad_filter=True)
    # Clear only after a successful transcription (matches original flow).
    self.buffer = []
    return " ".join(segment.text for segment in segments)
# BAD - Wait for complete audio
result = model.transcribe(audio_path) # User waits for entire recording
# GOOD - Filter silence before transcription
import webrtcvad

vad = webrtcvad.Vad(2)


def extract_speech(audio, sr=16000):
    """Return only the voiced frames of *audio*, as detected by WebRTC VAD.

    Args:
        audio: float waveform, assumed in [-1.0, 1.0] (scaled to int16 for VAD).
        sr: sample rate in Hz; must be one webrtcvad supports
            (8000/16000/32000/48000) -- TODO confirm at call sites.

    Returns:
        Float array of concatenated speech frames. Empty array when no
        speech is detected (the original crashed here: np.concatenate
        raises ValueError on an empty list).
    """
    # VAD operates on 16-bit PCM, so convert once up front.
    audio_int16 = (audio * 32767).astype(np.int16)
    frame_size = int(sr * 30 / 1000)  # 30 ms frames, as webrtcvad requires
    voiced = [
        audio[i:i + frame_size]
        for i in range(0, len(audio_int16), frame_size)
        # A trailing partial frame is dropped: the VAD needs full frames.
        if len(audio_int16[i:i + frame_size]) == frame_size
        and vad.is_speech(audio_int16[i:i + frame_size].tobytes(), sr)
    ]
    if not voiced:
        return np.zeros(0, dtype=audio.dtype)
    return np.concatenate(voiced)
# BAD - Process entire audio including silence
model.transcribe(audio_path) # Wastes compute on silence
# GOOD - Quantized for CPU
engine = SecureSTTEngine(model_size="small", device="cpu", compute_type="int8")
# GOOD - Float16 for GPU
engine = SecureSTTEngine(model_size="medium", device="cuda", compute_type="float16")
# BAD - Full precision unnecessarily
engine = SecureSTTEngine(model_size="small", device="cpu", compute_type="float32")
# GOOD - Process multiple files in parallel
from concurrent.futures import ThreadPoolExecutor


def transcribe_batch(engine, paths):
    """Transcribe *paths* concurrently with up to four worker threads.

    Results are returned in input order, one transcription per path.
    """
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = executor.map(engine.transcribe, paths)
        return list(results)
# BAD - Sequential processing
results = [engine.transcribe(p) for p in paths] # Blocks on each
# GOOD - Fixed-size ring buffer
class RingBuffer:
    """Fixed-capacity audio ring buffer backed by one pre-allocated array.

    Unlike an ever-growing list of chunks, memory use is constant: new
    samples overwrite the oldest ones once the write cursor wraps.
    """

    def __init__(self, max_samples):
        # Single allocation; append() only ever writes into this array.
        self.buffer = np.zeros(max_samples, dtype=np.float32)
        self.idx = 0  # next write position

    def append(self, audio):
        """Copy *audio* into the buffer at the write cursor, wrapping.

        Fixes over the naive version: empty input is a no-op, and input
        longer than the capacity keeps only the newest samples (the naive
        slice assignments raised ValueError in both cases).
        """
        n = len(audio)
        capacity = len(self.buffer)
        if n == 0:
            # Nothing to write; naive code hit a slice-length mismatch here.
            return
        if n > capacity:
            # Only the most recent `capacity` samples can survive anyway.
            self.buffer[:] = audio[n - capacity:]
            self.idx = 0
            return
        end = (self.idx + n) % capacity
        if end > self.idx:
            # Contiguous write, no wrap.
            self.buffer[self.idx:end] = audio
        else:
            # Wrapped write: fill the tail of the array, then the front.
            split = capacity - self.idx
            self.buffer[self.idx:] = audio[:split]
            self.buffer[:end] = audio[split:]
        self.idx = end
# BAD - Unbounded list growth
chunks = []
chunks.append(audio) # Memory leak over time
from faster_whisper import WhisperModel
from pathlib import Path
import tempfile, os, structlog

logger = structlog.get_logger()


class ValidationError(Exception):
    """Raised when an audio file fails pre-transcription validation.

    The original code raised this name without ever defining it, which
    turned every validation failure into a NameError.
    """


class SecureSTTEngine:
    """Speech-to-text engine with input validation and post-use cleanup."""

    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
        # Whitelist model sizes so a typo cannot trigger an arbitrary download.
        valid_sizes = ["tiny", "base", "small", "medium", "large-v3"]
        if model_size not in valid_sizes:
            raise ValueError(f"Invalid model size: {model_size}")
        self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
        # Private scratch directory (0o700: owner-only) for intermediate audio.
        self.temp_dir = tempfile.mkdtemp(prefix="jarvis_stt_")
        os.chmod(self.temp_dir, 0o700)

    def transcribe(self, audio_path: str) -> str:
        """Transcribe *audio_path* and delete the file afterwards.

        Raises:
            ValidationError: if the file is missing, too large, or has an
                unsupported extension.
        """
        path = Path(audio_path).resolve()
        if not self._validate_audio_file(path):
            raise ValidationError("Invalid audio file")
        try:
            segments, info = self.model.transcribe(
                str(path), beam_size=5, vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=500)
            )
            text = " ".join(s.text for s in segments)
            # Log metadata only -- never the transcription content.
            logger.info("stt.transcribed", duration=info.duration)
            return text.strip()
        finally:
            # Privacy: the source audio must not outlive the transcription.
            path.unlink(missing_ok=True)

    def _validate_audio_file(self, path: Path) -> bool:
        """Cheap pre-checks: existence, 50 MB size cap, known audio suffix."""
        if not path.exists():
            return False
        if path.stat().st_size > 50 * 1024 * 1024:
            return False
        return path.suffix.lower() in {'.wav', '.mp3', '.flac', '.ogg', '.m4a'}

    def cleanup(self):
        """Remove the scratch directory and anything left inside it."""
        import shutil
        shutil.rmtree(self.temp_dir, ignore_errors=True)
class PrivacyAwareSTT:
    """STT wrapper that strips PII from transcriptions before returning them."""

    def __init__(self, engine: SecureSTTEngine):
        self.engine = engine

    def transcribe_private(self, audio_path: str) -> dict:
        """Transcribe *audio_path*, redact PII, and log metadata only."""
        raw_text = self.engine.transcribe(audio_path)
        redacted = self._remove_pii(raw_text)
        was_filtered = redacted != raw_text
        # Content never reaches the logs -- only counts and flags.
        logger.info("stt.transcribed_private",
                    word_count=len(redacted.split()),
                    had_pii=was_filtered)
        return {
            "text": redacted,
            "privacy_filtered": was_filtered
        }

    def _remove_pii(self, text: str) -> str:
        """Replace phone/email/SSN/card patterns with bracketed placeholders.

        Order matters and is preserved: phone, email, SSN, card.
        """
        import re
        substitutions = [
            (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]'),   # phone numbers
            (r'\b[\w.-]+@[\w.-]+\.\w+\b', '[EMAIL]'),        # email addresses
            (r'\b\d{3}[-]?\d{2}[-]?\d{4}\b', '[SSN]'),       # social security numbers
            (r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b', '[CARD]'),  # card numbers
        ]
        for pattern, placeholder in substitutions:
            text = re.sub(pattern, placeholder, text)
        return text
Privacy Concerns : Audio contains sensitive conversations, voice biometrics are PII, transcriptions may leak data.
Required Mitigations :
# Always delete after processing
def transcribe_and_delete(audio_path: str) -> str:
    """Transcribe the file, deleting it even when transcription fails."""
    try:
        return engine.transcribe(audio_path)
    finally:
        # missing_ok: an already-removed file is not an error.
        Path(audio_path).unlink(missing_ok=True)
# Validate before processing
class ValidationError(Exception):
    """Raised when an audio file fails validation. (Previously this name
    was raised without ever being defined, so every failure surfaced as
    a NameError.)"""


def validate_audio(path: str) -> bool:
    """Check extension and the 50 MB size cap before any processing.

    Raises:
        ValidationError: on an unsupported format or an oversized file.
    """
    p = Path(path)
    # Check the suffix first: it needs no filesystem access, so a bad
    # format is reported cleanly even when the path does not exist
    # (the original called p.stat() first and raised FileNotFoundError).
    if p.suffix.lower() not in {'.wav', '.mp3', '.flac'}:
        raise ValidationError("Invalid format")
    if p.stat().st_size > 50 * 1024 * 1024:
        raise ValidationError("File too large")
    return True
# BAD - Audio persists
def transcribe(path):
    # Anti-pattern: the input file is left on disk after transcription.
    return model.transcribe(path)  # file remains
# GOOD - Delete after use
def transcribe(path):
    """Transcribe *path*, removing the audio file no matter what."""
    try:
        return model.transcribe(path)
    finally:
        Path(path).unlink()
# BAD - Logs sensitive content
logger.info(f"Transcribed: {text}")
# GOOD - Log metadata only
logger.info("stt.complete", word_count=len(text.split()))
pytest tests/test_stt_engine.py -v
pytest --cov=jarvis.stt
Your goal is to create STT systems that are:
You understand that voice data requires special privacy protection. Always delete audio after processing, never log transcription content, and filter PII from outputs.
Critical Reminders :
Weekly Installs
140
Repository
GitHub Stars
29
First Seen
Jan 20, 2026
Security Audits
Gen Agent Trust Hub: Fail · Socket: Pass · Snyk: Pass
Installed on
opencode115
gemini-cli114
codex112
cursor110
github-copilot108
amp97
AI 代码实施计划编写技能 | 自动化开发任务分解与 TDD 流程规划工具
48,300 周安装