text-to-speech by martinholovsky/claude-skills-generator
npx skills add https://github.com/martinholovsky/claude-skills-generator --skill text-to-speech
文件组织 : 拆分结构。详细实现请参阅 references/。
风险等级 : 中等 - 生成音频输出,可能存在不当内容合成的风险,资源密集型
您是一位文本转语音系统专家,在 Kokoro TTS、语音合成和音频生成优化方面拥有深厚的专业知识。您的专长涵盖模型配置、语音定制、流式音频输出以及合成语音的安全处理。
您擅长:
主要用例 :
# tests/test_tts_engine.py
import pytest
from pathlib import Path
class TestSecureTTSEngine:
    """Behavioural tests for ``SecureTTSEngine``, driven through the ``tts_engine`` fixture."""

    # NOTE(review): ``ValidationError`` is referenced below but is not imported
    # in the visible part of this module -- import it from wherever the engine
    # package defines it before running these tests.

    def test_synthesize_returns_valid_audio(self, tts_engine):
        """Synthesis returns the path of an existing ``.wav`` file."""
        audio_path = tts_engine.synthesize("Hello test")
        assert Path(audio_path).exists()
        assert audio_path.endswith('.wav')

    def test_audio_has_correct_sample_rate(self, tts_engine):
        """The written file reports a 24000 Hz sample rate."""
        import soundfile as sf

        audio_path = tts_engine.synthesize("Test")
        _, sample_rate = sf.read(audio_path)
        assert sample_rate == 24000

    def test_rejects_empty_text(self, tts_engine):
        """An empty string is refused with ``ValidationError``."""
        with pytest.raises(ValidationError):
            tts_engine.synthesize("")

    def test_rejects_text_exceeding_limit(self, tts_engine):
        """Input of 6000 characters is refused with ``ValidationError``."""
        with pytest.raises(ValidationError):
            tts_engine.synthesize("x" * 6000)

    def test_filters_sensitive_content(self, tts_engine):
        """Text containing a secret still produces an audio file."""
        audio_path = tts_engine.synthesize("password: secret123")
        assert Path(audio_path).exists()

    def test_cleanup_removes_temp_files(self, tts_engine):
        """``cleanup()`` deletes the engine's temporary directory."""
        tts_engine.synthesize("Test")
        temp_dir = tts_engine.temp_dir
        tts_engine.cleanup()
        assert not Path(temp_dir).exists()
@pytest.fixture
def tts_engine():
    """Provide a ``SecureTTSEngine`` using the ``af_heart`` voice.

    The statement after ``yield`` is the fixture teardown: it calls
    ``engine.cleanup()`` once the test that requested the fixture is done.
    """
    # Imported lazily so collecting this module does not import the engine.
    from jarvis.tts import SecureTTSEngine

    engine = SecureTTSEngine(voice="af_heart")
    yield engine
    engine.cleanup()
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
实现具有所需方法的 SecureTTSEngine。仅专注于使测试通过。
测试通过后,为流式输出、缓存和异步兼容性进行重构。
pytest tests/test_tts_engine.py -v # 运行测试
pytest --cov=jarvis.tts --cov-report=term-missing # 覆盖率
mypy src/jarvis/tts/ # 类型检查
python -m jarvis.tts --test "Hello JARVIS" # 集成测试
# BAD - Wait for full audio
audio_chunks = []
for _, _, audio in pipeline(text):
    audio_chunks.append(audio)
play_audio(np.concatenate(audio_chunks))  # Long wait

# GOOD - Stream chunks immediately
with sd.OutputStream(samplerate=24000, channels=1) as stream:
    for _, _, audio in pipeline(text):
        stream.write(audio)  # Play as generated
# 不好: pipeline = KPipeline(lang_code="a") # 每次重新加载
# 好 - 单例模式
class TTSEngine:
    """Holds a single ``KPipeline`` that is created on first request and then reused."""

    # Shared pipeline instance; stays None until get_pipeline() is first called.
    _pipeline = None

    @classmethod
    def get_pipeline(cls):
        """Return the shared pipeline, constructing it with ``lang_code="a"`` on first use."""
        if cls._pipeline is None:
            cls._pipeline = KPipeline(lang_code="a")
        return cls._pipeline
# 不好: data, sr = sf.read(audio_path) # 整个文件加载到 RAM
# 好 - 分块处理
def iter_processed_chunks(audio_path, frames_per_chunk=24000):
    """Yield ``process(chunk)`` for successive chunks read from an audio file.

    The original fragment used ``yield`` at top level, which is only valid
    inside a function, so it is wrapped in a generator here.

    Args:
        audio_path: Path of the audio file to open with ``sf.SoundFile``.
        frames_per_chunk: Number of frames requested from each ``read`` call.

    NOTE(review): ``process`` is not defined in this snippet -- supply the
    per-chunk handler before using this.
    """
    with sf.SoundFile(audio_path) as f:
        # Keep reading until the read position reaches the file length.
        while f.tell() < len(f):
            yield process(f.read(frames_per_chunk))
# BAD: audio = engine.synthesize(text)  # Blocks event loop
# GOOD - Run in executor
# NOTE: `await` is only valid inside a coroutine, and `loop` is not defined in
# this snippet -- presumably the running event loop
# (asyncio.get_running_loop()); confirm at the call site.
# Passing None as the first argument selects the loop's default executor.
audio = await loop.run_in_executor(None, engine.synthesize, text)
# 不好: return SecureTTSEngine(voice=VOICES[voice_type]) # 冷启动
# 好 - 启动时预加载
def _preload_voices(self, types: list[str]):
    """Create one engine per requested voice type and store it in ``self.engines``.

    Args:
        types: Voice-type keys; each one is looked up in ``VOICES`` to get the
            voice name handed to ``SecureTTSEngine``.
    """
    for voice_type in types:
        self.engines[voice_type] = SecureTTSEngine(voice=VOICES[voice_type])
实现 TTS 时,您需要:
Kokoro TTS
| 用途 | 版本 | 备注 |
|---|---|---|
| 生产环境 | kokoro>=0.3.0 | 最新稳定版 |
支持库
# requirements.txt
kokoro>=0.3.0
numpy>=1.24.0
soundfile>=0.12.0
sounddevice>=0.4.6
scipy>=1.10.0
pydantic>=2.0
structlog>=23.0
| 语音 | 风格 | 用例 |
|---|---|---|
| af_heart | 温暖、友好 | 默认 JARVIS |
| af_bella | 专业 | 正式响应 |
| am_adam | 男性 | 替代语音 |
| bf_emma | 英式 | 口音变体 |
from kokoro import KPipeline
import soundfile as sf
import numpy as np
from pathlib import Path
import tempfile
import os
import structlog
logger = structlog.get_logger()
class SecureTTSEngine:
    """Secure text-to-speech with content filtering.

    Each instance owns a ``KPipeline`` and a private temporary directory into
    which :meth:`synthesize` writes one WAV file per call.  Call
    :meth:`cleanup` to delete that directory and everything in it.

    NOTE(review): ``ValidationError`` and ``TTSError`` are raised below but are
    not defined or imported in the visible part of this file -- confirm they
    are provided alongside this class.
    """

    SAMPLE_RATE = 24000     # Hz, used when writing the WAV file
    MAX_TEXT_LENGTH = 5000  # characters; longer input is rejected

    def __init__(self, voice: str = "af_heart", lang_code: str = "a"):
        """Create the pipeline and the private output directory.

        Args:
            voice: Voice identifier passed to the pipeline on every call.
            lang_code: Language code passed to ``KPipeline``.
        """
        # Initialize Kokoro pipeline
        self.pipeline = KPipeline(lang_code=lang_code)
        self.voice = voice

        # Labels that introduce a secret; each label is redacted together with
        # the value that follows it.
        self.blocked_patterns = [
            r"password\s*[:=]",
            r"api[_-]?key\s*[:=]",
            r"secret\s*[:=]",
        ]

        # Create secure temp directory (owner-only permissions)
        self.temp_dir = tempfile.mkdtemp(prefix="jarvis_tts_")
        os.chmod(self.temp_dir, 0o700)

        logger.info("tts.initialized", voice=voice)

    def synthesize(self, text: str) -> str:
        """Synthesize ``text`` to a WAV file and return the file's path.

        Args:
            text: Text to speak; validated and redacted before synthesis.

        Returns:
            Path (as ``str``) of a new ``.wav`` file inside ``self.temp_dir``.

        Raises:
            ValidationError: if ``text`` fails :meth:`_validate_text`.
            TTSError: if the pipeline yields no audio.
        """
        import uuid  # not among this module's top-level imports

        # Validate and filter input
        if not self._validate_text(text):
            raise ValidationError("Invalid text input")
        filtered_text = self._filter_sensitive(text)

        # Generate audio; a random file name avoids collisions between calls.
        audio_path = Path(self.temp_dir) / f"{uuid.uuid4()}.wav"
        generator = self.pipeline(filtered_text, voice=self.voice, speed=1.0)

        # Each yielded item is a 3-tuple; only its last element (audio) is kept.
        audio_chunks = [audio for _, _, audio in generator]
        if not audio_chunks:
            raise TTSError("No audio generated")

        # Concatenate and save
        full_audio = np.concatenate(audio_chunks)
        sf.write(str(audio_path), full_audio, self.SAMPLE_RATE)

        logger.info("tts.synthesized",
                    text_length=len(text),
                    audio_duration=len(full_audio) / self.SAMPLE_RATE)
        return str(audio_path)

    def _validate_text(self, text: str) -> bool:
        """Return True when ``text`` is a non-blank string within the length limit."""
        # Non-strings (including None), empty and whitespace-only input are invalid.
        if not isinstance(text, str) or not text.strip():
            return False

        # Length limit (prevent DoS)
        if len(text) > self.MAX_TEXT_LENGTH:
            logger.warning("tts.text_too_long", length=len(text))
            return False

        return True

    @staticmethod
    def _redact(text: str, patterns) -> tuple[str, int]:
        """Replace every ``<label><value>`` occurrence with ``[FILTERED]``.

        Args:
            text: Input text.
            patterns: Regex fragments, each matching a label up to and
                including its ``:`` or ``=`` separator.

        Returns:
            ``(redacted_text, number_of_replacements)``.
        """
        import re

        total = 0
        for pattern in patterns:
            # Optional whitespace may separate the label from its value
            # ("password: secret123"); without the \s* only values glued
            # directly to the separator were redacted.
            text, hits = re.subn(pattern + r'\s*\S+', '[FILTERED]', text,
                                 flags=re.IGNORECASE)
            total += hits
        return text, total

    def _filter_sensitive(self, text: str) -> str:
        """Return ``text`` with secrets matching ``self.blocked_patterns`` redacted."""
        filtered, hits = self._redact(text, self.blocked_patterns)
        if hits:
            logger.warning("tts.sensitive_content_filtered")
        return filtered

    def cleanup(self):
        """Delete the temporary directory; calling it again is a no-op."""
        import shutil

        try:
            shutil.rmtree(self.temp_dir)
        except FileNotFoundError:
            # Already removed (e.g. cleanup() was called before).
            pass
# Stream audio chunks as generated for low latency
with sd.OutputStream(samplerate=24000, channels=1) as stream:
    for _, _, audio in pipeline(text, voice=voice):
        stream.write(audio)  # Play immediately
# Cache common phrases with hash key
def cached_audio_path(text, voice, cache_dir):
    """Return the cached WAV path for ``(text, voice)``, or ``None`` on a miss.

    The original fragment used ``return`` at top level, which is only valid
    inside a function, so the lookup is wrapped here.

    Args:
        text: Phrase to look up.
        voice: Voice name; part of the key so each voice caches separately.
        cache_dir: ``pathlib.Path`` of the cache directory.

    Returns:
        ``str`` path of the cached file when it exists, otherwise ``None``.
    """
    import hashlib

    cache_key = hashlib.sha256(f"{text}:{voice}".encode()).hexdigest()
    cache_path = cache_dir / f"{cache_key}.wav"
    if cache_path.exists():
        return str(cache_path)  # Cache hit
    # Cache miss: the caller generates the audio, saves it under this key
    # and returns that path.
    return None
# Lazy-load engines per voice type
VOICES = {"default": "af_heart", "formal": "af_bella"}

# Registry of engines already created, keyed by voice type.  The original
# snippet used this name without defining it.
engines = {}


def get_engine(voice_type: str) -> "SecureTTSEngine":
    """Return the engine for ``voice_type``, creating and caching it on first use.

    Args:
        voice_type: A key of ``VOICES``.

    Raises:
        KeyError: if ``voice_type`` is not a key of ``VOICES`` and no engine
            is cached under it.
    """
    if voice_type not in engines:
        engines[voice_type] = SecureTTSEngine(voice=VOICES[voice_type])
    return engines[voice_type]
# Semaphore for concurrency + timeout for protection
# The semaphore must be ONE shared object: building a new
# asyncio.Semaphore(2) at every use (as the original fragment did) gives each
# caller its own counter and therefore limits nothing.
_synthesis_slots = asyncio.Semaphore(2)


async def synthesize_limited(engine, text, timeout=30.0):
    """Run ``engine.synthesize(text)`` in the default executor, two at a time at most.

    Args:
        engine: Object exposing a blocking ``synthesize(text)`` method.
        text: Text to synthesize.
        timeout: Seconds to wait for the result.

    Returns:
        Whatever ``engine.synthesize(text)`` returns.

    Raises:
        asyncio.TimeoutError: if the result is not ready within ``timeout``.
            NOTE: the timeout only ends the wait; a worker thread that has
            already started is not interrupted.
    """
    loop = asyncio.get_running_loop()
    async with _synthesis_slots:
        return await asyncio.wait_for(
            loop.run_in_executor(None, engine.synthesize, text),
            timeout=timeout,
        )
防止合成不当内容:
class ContentFilter:
    """Filter inappropriate content before synthesis."""

    BLOCKED_CATEGORIES = [
        "violence",
        "hate_speech",
        "explicit",
    ]

    # Translation table deleting the characters ';', '|' and '&'.
    _STRIPPED_CHARS = str.maketrans("", "", ";|&")

    def __init__(self, blocked_patterns=None):
        """Store the regex patterns whose presence blocks a text entirely.

        Args:
            blocked_patterns: Iterable of regex strings; defaults to none.
                The original class read ``self.blocked_patterns`` without ever
                setting it, so ``filter`` failed with ``AttributeError``.
        """
        self.blocked_patterns = list(blocked_patterns) if blocked_patterns is not None else []

    def filter(self, text: str) -> tuple[str, bool]:
        """Filter text and return ``(filtered_text, was_modified)``.

        ``';'``, ``'|'`` and ``'&'`` are removed first.  If any blocked
        pattern then matches (case-insensitively) the whole text is replaced
        by ``"[Content filtered]"``.  ``was_modified`` is True whenever the
        returned text differs from the input, including when only characters
        were stripped.
        """
        import re

        # Remove potential command injection
        cleaned = text.translate(self._STRIPPED_CHARS)

        # Check for blocked patterns
        if any(re.search(pattern, cleaned, re.IGNORECASE)
               for pattern in self.blocked_patterns):
            return "[Content filtered]", True

        return cleaned, cleaned != text
def validate_tts_input(text: str) -> bool:
    """Validate text for TTS synthesis.

    Args:
        text: Candidate input text.

    Returns:
        ``True`` when the text passes both checks (it never returns False).

    Raises:
        ValidationError: if ``text`` is longer than 5000 characters or
            contains a character that is neither printable nor ``\\n``/``\\t``.
            NOTE(review): ``ValidationError`` is not defined in the visible
            part of this file -- confirm where it comes from.
    """
    # Length limit
    if len(text) > 5000:
        raise ValidationError("Text too long (max 5000 chars)")

    # Character validation: newline and tab are allowed although they are
    # not "printable" according to str.isprintable().
    if any(not (c.isprintable() or c in '\n\t') for c in text):
        raise ValidationError("Invalid characters in text")

    return True
# BAD - No filtering
def speak(user_input: str):
    engine.synthesize(user_input)


# GOOD - Filter first
def speak(user_input: str):
    """Filter ``user_input`` and synthesize the filtered text."""
    # filter() returns (filtered_text, was_modified); only the text may be
    # passed on -- handing the whole tuple to synthesize() was a bug.
    filtered, _was_modified = content_filter.filter(user_input)
    engine.synthesize(filtered)
# BAD - Can generate very long audio
engine.synthesize(long_text)  # No limit

# GOOD - Enforce limits
if len(text) > 5000:
    raise ValidationError("Text too long")
engine.synthesize(text)
pytest tests/test_tts_engine.py -v
pytest --cov=jarvis.tts
mypy src/jarvis/tts/
python -m jarvis.tts --test
您的目标是创建具备以下特点的 TTS 系统:
您理解 TTS 需要输入验证和内容过滤,以防止合成不当内容。始终强制执行文本长度限制并清理生成的音频文件。
关键提醒 :
每周安装次数
83
代码仓库
GitHub 星标数
32
首次出现时间
2026年1月20日
安全审计
安装于
gemini-cli68
codex67
opencode66
cursor64
github-copilot63
cline54
File Organization : Split structure. See references/ for detailed implementations.
Risk Level : MEDIUM - Generates audio output, potential for inappropriate content synthesis, resource-intensive
You are an expert in text-to-speech systems with deep expertise in Kokoro TTS, voice synthesis, and audio generation optimization. Your mastery spans model configuration, voice customization, streaming audio output, and secure handling of synthesized speech.
You excel at:
Primary Use Cases :
# tests/test_tts_engine.py
import pytest
from pathlib import Path
class TestSecureTTSEngine:
    """Behavioural tests for ``SecureTTSEngine``, driven through the ``tts_engine`` fixture."""

    # NOTE(review): ``ValidationError`` is referenced below but is not imported
    # in the visible part of this module -- import it from wherever the engine
    # package defines it before running these tests.

    def test_synthesize_returns_valid_audio(self, tts_engine):
        """Synthesis returns the path of an existing ``.wav`` file."""
        audio_path = tts_engine.synthesize("Hello test")
        assert Path(audio_path).exists()
        assert audio_path.endswith('.wav')

    def test_audio_has_correct_sample_rate(self, tts_engine):
        """The written file reports a 24000 Hz sample rate."""
        import soundfile as sf

        audio_path = tts_engine.synthesize("Test")
        _, sample_rate = sf.read(audio_path)
        assert sample_rate == 24000

    def test_rejects_empty_text(self, tts_engine):
        """An empty string is refused with ``ValidationError``."""
        with pytest.raises(ValidationError):
            tts_engine.synthesize("")

    def test_rejects_text_exceeding_limit(self, tts_engine):
        """Input of 6000 characters is refused with ``ValidationError``."""
        with pytest.raises(ValidationError):
            tts_engine.synthesize("x" * 6000)

    def test_filters_sensitive_content(self, tts_engine):
        """Text containing a secret still produces an audio file."""
        audio_path = tts_engine.synthesize("password: secret123")
        assert Path(audio_path).exists()

    def test_cleanup_removes_temp_files(self, tts_engine):
        """``cleanup()`` deletes the engine's temporary directory."""
        tts_engine.synthesize("Test")
        temp_dir = tts_engine.temp_dir
        tts_engine.cleanup()
        assert not Path(temp_dir).exists()
@pytest.fixture
def tts_engine():
    """Provide a ``SecureTTSEngine`` using the ``af_heart`` voice.

    The statement after ``yield`` is the fixture teardown: it calls
    ``engine.cleanup()`` once the test that requested the fixture is done.
    """
    # Imported lazily so collecting this module does not import the engine.
    from jarvis.tts import SecureTTSEngine

    engine = SecureTTSEngine(voice="af_heart")
    yield engine
    engine.cleanup()
Implement SecureTTSEngine with required methods. Focus only on making tests pass.
After tests pass, refactor for streaming output, caching, and async compatibility.
pytest tests/test_tts_engine.py -v # Run tests
pytest --cov=jarvis.tts --cov-report=term-missing # Coverage
mypy src/jarvis/tts/ # Type check
python -m jarvis.tts --test "Hello JARVIS" # Integration
# BAD - Wait for full audio
audio_chunks = []
for _, _, audio in pipeline(text):
    audio_chunks.append(audio)
play_audio(np.concatenate(audio_chunks))  # Long wait

# GOOD - Stream chunks immediately
with sd.OutputStream(samplerate=24000, channels=1) as stream:
    for _, _, audio in pipeline(text):
        stream.write(audio)  # Play as generated
# BAD: pipeline = KPipeline(lang_code="a") # Reload each time
# GOOD - Singleton pattern
class TTSEngine:
    """Holds a single ``KPipeline`` that is created on first request and then reused."""

    # Shared pipeline instance; stays None until get_pipeline() is first called.
    _pipeline = None

    @classmethod
    def get_pipeline(cls):
        """Return the shared pipeline, constructing it with ``lang_code="a"`` on first use."""
        if cls._pipeline is None:
            cls._pipeline = KPipeline(lang_code="a")
        return cls._pipeline
# BAD: data, sr = sf.read(audio_path) # Full file in RAM
# GOOD - Process in chunks
def iter_processed_chunks(audio_path, frames_per_chunk=24000):
    """Yield ``process(chunk)`` for successive chunks read from an audio file.

    The original fragment used ``yield`` at top level, which is only valid
    inside a function, so it is wrapped in a generator here.

    Args:
        audio_path: Path of the audio file to open with ``sf.SoundFile``.
        frames_per_chunk: Number of frames requested from each ``read`` call.

    NOTE(review): ``process`` is not defined in this snippet -- supply the
    per-chunk handler before using this.
    """
    with sf.SoundFile(audio_path) as f:
        # Keep reading until the read position reaches the file length.
        while f.tell() < len(f):
            yield process(f.read(frames_per_chunk))
# BAD: audio = engine.synthesize(text)  # Blocks event loop
# GOOD - Run in executor
# NOTE: `await` is only valid inside a coroutine, and `loop` is not defined in
# this snippet -- presumably the running event loop
# (asyncio.get_running_loop()); confirm at the call site.
# Passing None as the first argument selects the loop's default executor.
audio = await loop.run_in_executor(None, engine.synthesize, text)
# BAD: return SecureTTSEngine(voice=VOICES[voice_type]) # Cold start
# GOOD - Preload at startup
def _preload_voices(self, types: list[str]):
    """Create one engine per requested voice type and store it in ``self.engines``.

    Args:
        types: Voice-type keys; each one is looked up in ``VOICES`` to get the
            voice name handed to ``SecureTTSEngine``.
    """
    for voice_type in types:
        self.engines[voice_type] = SecureTTSEngine(voice=VOICES[voice_type])
When implementing TTS, you will:
Kokoro TTS
| Use Case | Version | Notes |
|---|---|---|
| Production | kokoro>=0.3.0 | Latest stable |
Supporting Libraries
# requirements.txt
kokoro>=0.3.0
numpy>=1.24.0
soundfile>=0.12.0
sounddevice>=0.4.6
scipy>=1.10.0
pydantic>=2.0
structlog>=23.0
| Voice | Style | Use Case |
|---|---|---|
| af_heart | Warm, friendly | Default JARVIS |
| af_bella | Professional | Formal responses |
| am_adam | Male | Alternative voice |
| bf_emma | British | Accent variation |
from kokoro import KPipeline
import soundfile as sf
import numpy as np
from pathlib import Path
import tempfile
import os
import structlog
logger = structlog.get_logger()
class SecureTTSEngine:
    """Secure text-to-speech with content filtering.

    Each instance owns a ``KPipeline`` and a private temporary directory into
    which :meth:`synthesize` writes one WAV file per call.  Call
    :meth:`cleanup` to delete that directory and everything in it.

    NOTE(review): ``ValidationError`` and ``TTSError`` are raised below but are
    not defined or imported in the visible part of this file -- confirm they
    are provided alongside this class.
    """

    SAMPLE_RATE = 24000     # Hz, used when writing the WAV file
    MAX_TEXT_LENGTH = 5000  # characters; longer input is rejected

    def __init__(self, voice: str = "af_heart", lang_code: str = "a"):
        """Create the pipeline and the private output directory.

        Args:
            voice: Voice identifier passed to the pipeline on every call.
            lang_code: Language code passed to ``KPipeline``.
        """
        # Initialize Kokoro pipeline
        self.pipeline = KPipeline(lang_code=lang_code)
        self.voice = voice

        # Labels that introduce a secret; each label is redacted together with
        # the value that follows it.
        self.blocked_patterns = [
            r"password\s*[:=]",
            r"api[_-]?key\s*[:=]",
            r"secret\s*[:=]",
        ]

        # Create secure temp directory (owner-only permissions)
        self.temp_dir = tempfile.mkdtemp(prefix="jarvis_tts_")
        os.chmod(self.temp_dir, 0o700)

        logger.info("tts.initialized", voice=voice)

    def synthesize(self, text: str) -> str:
        """Synthesize ``text`` to a WAV file and return the file's path.

        Args:
            text: Text to speak; validated and redacted before synthesis.

        Returns:
            Path (as ``str``) of a new ``.wav`` file inside ``self.temp_dir``.

        Raises:
            ValidationError: if ``text`` fails :meth:`_validate_text`.
            TTSError: if the pipeline yields no audio.
        """
        import uuid  # not among this module's top-level imports

        # Validate and filter input
        if not self._validate_text(text):
            raise ValidationError("Invalid text input")
        filtered_text = self._filter_sensitive(text)

        # Generate audio; a random file name avoids collisions between calls.
        audio_path = Path(self.temp_dir) / f"{uuid.uuid4()}.wav"
        generator = self.pipeline(filtered_text, voice=self.voice, speed=1.0)

        # Each yielded item is a 3-tuple; only its last element (audio) is kept.
        audio_chunks = [audio for _, _, audio in generator]
        if not audio_chunks:
            raise TTSError("No audio generated")

        # Concatenate and save
        full_audio = np.concatenate(audio_chunks)
        sf.write(str(audio_path), full_audio, self.SAMPLE_RATE)

        logger.info("tts.synthesized",
                    text_length=len(text),
                    audio_duration=len(full_audio) / self.SAMPLE_RATE)
        return str(audio_path)

    def _validate_text(self, text: str) -> bool:
        """Return True when ``text`` is a non-blank string within the length limit."""
        # Non-strings (including None), empty and whitespace-only input are invalid.
        if not isinstance(text, str) or not text.strip():
            return False

        # Length limit (prevent DoS)
        if len(text) > self.MAX_TEXT_LENGTH:
            logger.warning("tts.text_too_long", length=len(text))
            return False

        return True

    @staticmethod
    def _redact(text: str, patterns) -> tuple[str, int]:
        """Replace every ``<label><value>`` occurrence with ``[FILTERED]``.

        Args:
            text: Input text.
            patterns: Regex fragments, each matching a label up to and
                including its ``:`` or ``=`` separator.

        Returns:
            ``(redacted_text, number_of_replacements)``.
        """
        import re

        total = 0
        for pattern in patterns:
            # Optional whitespace may separate the label from its value
            # ("password: secret123"); without the \s* only values glued
            # directly to the separator were redacted.
            text, hits = re.subn(pattern + r'\s*\S+', '[FILTERED]', text,
                                 flags=re.IGNORECASE)
            total += hits
        return text, total

    def _filter_sensitive(self, text: str) -> str:
        """Return ``text`` with secrets matching ``self.blocked_patterns`` redacted."""
        filtered, hits = self._redact(text, self.blocked_patterns)
        if hits:
            logger.warning("tts.sensitive_content_filtered")
        return filtered

    def cleanup(self):
        """Delete the temporary directory; calling it again is a no-op."""
        import shutil

        try:
            shutil.rmtree(self.temp_dir)
        except FileNotFoundError:
            # Already removed (e.g. cleanup() was called before).
            pass
# Stream audio chunks as generated for low latency
with sd.OutputStream(samplerate=24000, channels=1) as stream:
    for _, _, audio in pipeline(text, voice=voice):
        stream.write(audio)  # Play immediately
# Cache common phrases with hash key
def cached_audio_path(text, voice, cache_dir):
    """Return the cached WAV path for ``(text, voice)``, or ``None`` on a miss.

    The original fragment used ``return`` at top level, which is only valid
    inside a function, so the lookup is wrapped here.

    Args:
        text: Phrase to look up.
        voice: Voice name; part of the key so each voice caches separately.
        cache_dir: ``pathlib.Path`` of the cache directory.

    Returns:
        ``str`` path of the cached file when it exists, otherwise ``None``.
    """
    import hashlib

    cache_key = hashlib.sha256(f"{text}:{voice}".encode()).hexdigest()
    cache_path = cache_dir / f"{cache_key}.wav"
    if cache_path.exists():
        return str(cache_path)  # Cache hit
    # Cache miss: the caller generates the audio, saves it under this key
    # and returns that path.
    return None
# Lazy-load engines per voice type
VOICES = {"default": "af_heart", "formal": "af_bella"}

# Registry of engines already created, keyed by voice type.  The original
# snippet used this name without defining it.
engines = {}


def get_engine(voice_type: str) -> "SecureTTSEngine":
    """Return the engine for ``voice_type``, creating and caching it on first use.

    Args:
        voice_type: A key of ``VOICES``.

    Raises:
        KeyError: if ``voice_type`` is not a key of ``VOICES`` and no engine
            is cached under it.
    """
    if voice_type not in engines:
        engines[voice_type] = SecureTTSEngine(voice=VOICES[voice_type])
    return engines[voice_type]
# Semaphore for concurrency + timeout for protection
# The semaphore must be ONE shared object: building a new
# asyncio.Semaphore(2) at every use (as the original fragment did) gives each
# caller its own counter and therefore limits nothing.
_synthesis_slots = asyncio.Semaphore(2)


async def synthesize_limited(engine, text, timeout=30.0):
    """Run ``engine.synthesize(text)`` in the default executor, two at a time at most.

    Args:
        engine: Object exposing a blocking ``synthesize(text)`` method.
        text: Text to synthesize.
        timeout: Seconds to wait for the result.

    Returns:
        Whatever ``engine.synthesize(text)`` returns.

    Raises:
        asyncio.TimeoutError: if the result is not ready within ``timeout``.
            NOTE: the timeout only ends the wait; a worker thread that has
            already started is not interrupted.
    """
    loop = asyncio.get_running_loop()
    async with _synthesis_slots:
        return await asyncio.wait_for(
            loop.run_in_executor(None, engine.synthesize, text),
            timeout=timeout,
        )
Prevent synthesis of inappropriate content:
class ContentFilter:
    """Filter inappropriate content before synthesis."""

    BLOCKED_CATEGORIES = [
        "violence",
        "hate_speech",
        "explicit",
    ]

    # Translation table deleting the characters ';', '|' and '&'.
    _STRIPPED_CHARS = str.maketrans("", "", ";|&")

    def __init__(self, blocked_patterns=None):
        """Store the regex patterns whose presence blocks a text entirely.

        Args:
            blocked_patterns: Iterable of regex strings; defaults to none.
                The original class read ``self.blocked_patterns`` without ever
                setting it, so ``filter`` failed with ``AttributeError``.
        """
        self.blocked_patterns = list(blocked_patterns) if blocked_patterns is not None else []

    def filter(self, text: str) -> tuple[str, bool]:
        """Filter text and return ``(filtered_text, was_modified)``.

        ``';'``, ``'|'`` and ``'&'`` are removed first.  If any blocked
        pattern then matches (case-insensitively) the whole text is replaced
        by ``"[Content filtered]"``.  ``was_modified`` is True whenever the
        returned text differs from the input, including when only characters
        were stripped.
        """
        import re

        # Remove potential command injection
        cleaned = text.translate(self._STRIPPED_CHARS)

        # Check for blocked patterns
        if any(re.search(pattern, cleaned, re.IGNORECASE)
               for pattern in self.blocked_patterns):
            return "[Content filtered]", True

        return cleaned, cleaned != text
def validate_tts_input(text: str) -> bool:
    """Validate text for TTS synthesis.

    Args:
        text: Candidate input text.

    Returns:
        ``True`` when the text passes both checks (it never returns False).

    Raises:
        ValidationError: if ``text`` is longer than 5000 characters or
            contains a character that is neither printable nor ``\\n``/``\\t``.
            NOTE(review): ``ValidationError`` is not defined in the visible
            part of this file -- confirm where it comes from.
    """
    # Length limit
    if len(text) > 5000:
        raise ValidationError("Text too long (max 5000 chars)")

    # Character validation: newline and tab are allowed although they are
    # not "printable" according to str.isprintable().
    if any(not (c.isprintable() or c in '\n\t') for c in text):
        raise ValidationError("Invalid characters in text")

    return True
# BAD - No filtering
def speak(user_input: str):
    engine.synthesize(user_input)


# GOOD - Filter first
def speak(user_input: str):
    """Filter ``user_input`` and synthesize the filtered text."""
    # filter() returns (filtered_text, was_modified); only the text may be
    # passed on -- handing the whole tuple to synthesize() was a bug.
    filtered, _was_modified = content_filter.filter(user_input)
    engine.synthesize(filtered)
# BAD - Can generate very long audio
engine.synthesize(long_text)  # No limit

# GOOD - Enforce limits
if len(text) > 5000:
    raise ValidationError("Text too long")
engine.synthesize(text)
pytest tests/test_tts_engine.py -v
pytest --cov=jarvis.tts
mypy src/jarvis/tts/
python -m jarvis.tts --test
Your goal is to create TTS systems that are:
You understand that TTS requires input validation and content filtering to prevent synthesis of inappropriate content. Always enforce text length limits and clean up generated audio files.
Critical Reminders :
Weekly Installs
83
Repository
GitHub Stars
32
First Seen
Jan 20, 2026
Security Audits
Gen Agent Trust Hub: Fail · Socket: Pass · Snyk: Pass
Installed on
gemini-cli68
codex67
opencode66
cursor64
github-copilot63
cline54
AI 代码实施计划编写技能 | 自动化开发任务分解与 TDD 流程规划工具
50,900 周安装
Skill Creator 指南:如何为 Claude AI 创建高效技能模块 | 技能开发与优化
139 周安装
CSS伪元素最佳实践与视图过渡API检查工具 - 提升前端代码质量
139 周安装
Vercel React 最佳实践指南:65条性能优化规则,提升Next.js应用性能
141 周安装
NativeWind v4 Expo 配置指南:React Native Tailwind CSS 样式库集成教程
140 周安装
AI技能创建指南 - 如何为智能体开发高效、模块化的专业技能包
139 周安装
MCP CLI 使用指南:动态调用 MCP 服务器工具,无需永久集成
142 周安装