voice-ai-development by sickn33/antigravity-awesome-skills
npx skills add https://github.com/sickn33/antigravity-awesome-skills --skill voice-ai-development
角色 : 语音 AI 架构师
您是构建实时语音应用程序的专家。您从延迟预算、音频质量和用户体验的角度思考问题。您深知,语音应用在快速响应时感觉神奇,在反应迟缓时则显得糟糕。您能为每个用例选择合适的供应商组合,并为了感知响应速度而进行不懈的优化。
使用 GPT-4o 的原生语音到语音功能
使用场景 : 当您需要集成的语音 AI,而不想使用独立的 STT/TTS 时
import asyncio
import websockets
import json
import base64
# NOTE(review): never hard-code API keys; load from the environment instead.
OPENAI_API_KEY = "sk-..."


async def voice_session():
    """Open a GPT-4o Realtime API session over a WebSocket.

    Configures the session (voice, PCM16 audio in/out, server-side VAD,
    one function tool), defines a helper to stream audio in, and then
    consumes server events.
    """
    url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "OpenAI-Beta": "realtime=v1",
    }
    async with websockets.connect(url, extra_headers=headers) as ws:
        # Configure the session.
        await ws.send(json.dumps({
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "voice": "alloy",  # alloy, echo, fable, onyx, nova, shimmer
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                "turn_detection": {
                    "type": "server_vad",  # voice activity detection
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 500
                },
                "tools": [
                    {
                        "type": "function",
                        "name": "get_weather",
                        "description": "Get weather for a location",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "location": {"type": "string"}
                            }
                        }
                    }
                ]
            }
        }))

        # Send audio (PCM16, 24 kHz, mono).
        async def send_audio(audio_bytes):
            await ws.send(json.dumps({
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(audio_bytes).decode()
            }))

        # Receive events.
        # NOTE(review): the original snippet was truncated here ("resp...");
        # the handler below follows the documented Realtime API event name
        # for streamed audio — confirm against the API reference.
        async for message in ws:
            event = json.loads(message)
            if event["type"] == "response.audio.delta":
                audio_chunk = base64.b64decode(event["delta"])
                # TODO: feed audio_chunk to the playback device.
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
使用 Vapi 平台构建语音助手
使用场景 : 基于电话的助手,快速部署
# Vapi 提供带有 Webhook 的托管语音助手
from flask import Flask, request, jsonify
import vapi
app = Flask(__name__)

# NOTE(review): load the API key from the environment instead of hard-coding.
client = vapi.Vapi(api_key="...")

# Create a hosted assistant: GPT-4o for reasoning, ElevenLabs for the voice,
# Deepgram nova-2 for transcription.
assistant = client.assistants.create(
    name="Support Agent",
    model={
        "provider": "openai",
        "model": "gpt-4o",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful support agent..."
            }
        ]
    },
    voice={
        "provider": "11labs",
        "voiceId": "21m00Tcm4TlvDq8ikWAM"  # Rachel
    },
    firstMessage="Hi! How can I help you today?",
    transcriber={
        "provider": "deepgram",
        "model": "nova-2"
    }
)
# 用于对话事件的 Webhook
@app.route("/vapi/webhook", methods=["POST"])
def vapi_webhook():
    """Handle Vapi conversation events.

    Dispatches assistant tool calls ("function-call") and persists the
    transcript when a call ends ("end-of-call-report").
    """
    event = request.json
    if event["type"] == "function-call":
        # Dispatch the tool call requested by the assistant.
        name = event["functionCall"]["name"]
        args = event["functionCall"]["parameters"]
        if name == "check_order":
            result = check_order(args["order_id"])
            return jsonify({"result": result})
    elif event["type"] == "end-of-call-report":
        # Call ended — save the transcript.
        transcript = event["transcript"]
        save_transcript(event["call"]["id"], transcript)
    # Default acknowledgement (also reached for unrecognized tool names).
    return jsonify({"ok": True})
# Start an outbound phone call.
call = client.calls.create(
    assistant_id=assistant.id,
    customer={
        "number": "+1234567890"
    },
    phoneNumber={
        "twilioPhoneNumber": "+0987654321"
    }
)

# Or create a web call — returns a URL for the WebRTC connection.
web_call = client.calls.create(
    assistant_id=assistant.id,
    type="web"
)
一流的转录和合成
使用场景 : 高质量语音,自定义处理流程
import asyncio
from deepgram import DeepgramClient, LiveTranscriptionEvents
from elevenlabs import ElevenLabs
# Deepgram real-time transcription client.
deepgram = DeepgramClient(api_key="...")


async def transcribe_stream(audio_stream):
    """Stream audio chunks to Deepgram and react to live transcripts.

    Interim results are printed; final transcripts are forwarded to
    ``handle_user_input`` (defined elsewhere).
    """
    connection = deepgram.listen.live.v("1")

    async def on_transcript(result):
        transcript = result.channel.alternatives[0].transcript
        if transcript:
            print(f"Heard: {transcript}")
            # NOTE(review): indentation was lost in the source; the
            # is_final check is assumed to be nested under the non-empty
            # transcript check — confirm against the original example.
            if result.is_final:
                # Process the final transcript.
                await handle_user_input(transcript)

    connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
    await connection.start({
        "model": "nova-2",        # best quality
        "language": "en",
        "smart_format": True,
        "interim_results": True,  # get partial results
        "utterance_end_ms": 1000,
        "vad_events": True,       # voice activity detection
        "encoding": "linear16",
        "sample_rate": 16000
    })

    # Stream the audio in, then close cleanly.
    async for chunk in audio_stream:
        await connection.send(chunk)
    await connection.finish()
# ElevenLabs streaming synthesis client.
eleven = ElevenLabs(api_key="...")


def text_to_speech_stream(text: str):
    """Yield TTS audio chunks for *text* as they are synthesized."""
    audio_stream = eleven.text_to_speech.convert_as_stream(
        voice_id="21m00Tcm4TlvDq8ikWAM",  # Rachel
        model_id="eleven_turbo_v2_5",     # fastest model
        text=text,
        output_format="pcm_24000"         # raw PCM for low latency
    )
    yield from audio_stream
# Or use a WebSocket for the lowest latency.
async def tts_websocket(text_stream):
    """Stream text chunks to ElevenLabs over WebSocket, yielding audio.

    Sends each incoming text chunk as soon as it arrives, then flushes
    any audio still buffered on the server.
    """
    async with eleven.text_to_speech.stream_async(
        voice_id="21m00Tcm4TlvDq8ikWAM",
        model_id="eleven_turbo_v2_5"
    ) as tts:
        async for text_chunk in text_stream:
            audio = await tts.send(text_chunk)
            yield audio
        # Flush the remaining buffered audio.
        final_audio = await tts.flush()
        yield final_audio
为何不好 : 增加数秒延迟。用户感知为缓慢。失去对话流畅性。
替代方案 : 所有环节都采用流式处理:
为何不好 : 用户体验令人沮丧。感觉像在和机器说话。浪费时间。
替代方案 : 实现插话检测。使用 VAD 检测用户语音。立即停止 TTS。清空音频队列。
为何不好 : 可能不是最佳质量。单点故障。难以优化。
替代方案 : 混合使用最佳供应商:
与以下技能配合良好:langgraph, structured-output, langfuse
此技能适用于执行概述中描述的工作流程或操作。
每周安装量
397
代码仓库
GitHub 星标数
27.1K
首次出现
Jan 19, 2026
安全审计
安装于
opencode: 325
gemini-cli: 322
claude-code: 304
codex: 278
cursor: 278
antigravity: 268
Role : Voice AI Architect
You are an expert in building real-time voice applications. You think in terms of latency budgets, audio quality, and user experience. You know that voice apps feel magical when fast and broken when slow. You choose the right combination of providers for each use case and optimize relentlessly for perceived responsiveness.
Native voice-to-voice with GPT-4o
When to use : When you want integrated voice AI without separate STT/TTS
import asyncio
import websockets
import json
import base64
# NOTE(review): never hard-code API keys; load from the environment instead.
OPENAI_API_KEY = "sk-..."


async def voice_session():
    """Open a GPT-4o Realtime API session over a WebSocket.

    Configures the session (voice, PCM16 audio in/out, server-side VAD,
    one function tool), defines a helper to stream audio in, and then
    consumes server events.
    """
    url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "OpenAI-Beta": "realtime=v1",
    }
    async with websockets.connect(url, extra_headers=headers) as ws:
        # Configure the session.
        await ws.send(json.dumps({
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "voice": "alloy",  # alloy, echo, fable, onyx, nova, shimmer
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                "turn_detection": {
                    "type": "server_vad",  # voice activity detection
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 500
                },
                "tools": [
                    {
                        "type": "function",
                        "name": "get_weather",
                        "description": "Get weather for a location",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "location": {"type": "string"}
                            }
                        }
                    }
                ]
            }
        }))

        # Send audio (PCM16, 24 kHz, mono).
        async def send_audio(audio_bytes):
            await ws.send(json.dumps({
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(audio_bytes).decode()
            }))

        # Receive events.
        # NOTE(review): the original snippet was truncated here ("resp...");
        # the handler below follows the documented Realtime API event name
        # for streamed audio — confirm against the API reference.
        async for message in ws:
            event = json.loads(message)
            if event["type"] == "response.audio.delta":
                audio_chunk = base64.b64decode(event["delta"])
                # TODO: feed audio_chunk to the playback device.
Build voice agents with Vapi platform
When to use : Phone-based agents, quick deployment
# Vapi provides hosted voice agents with webhooks
from flask import Flask, request, jsonify
import vapi
app = Flask(__name__)

# NOTE(review): load the API key from the environment instead of hard-coding.
client = vapi.Vapi(api_key="...")

# Create a hosted assistant: GPT-4o for reasoning, ElevenLabs for the voice,
# Deepgram nova-2 for transcription.
assistant = client.assistants.create(
    name="Support Agent",
    model={
        "provider": "openai",
        "model": "gpt-4o",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful support agent..."
            }
        ]
    },
    voice={
        "provider": "11labs",
        "voiceId": "21m00Tcm4TlvDq8ikWAM"  # Rachel
    },
    firstMessage="Hi! How can I help you today?",
    transcriber={
        "provider": "deepgram",
        "model": "nova-2"
    }
)
# Webhook for conversation events
@app.route("/vapi/webhook", methods=["POST"])
def vapi_webhook():
    """Handle Vapi conversation events.

    Dispatches assistant tool calls ("function-call") and persists the
    transcript when a call ends ("end-of-call-report").
    """
    event = request.json
    if event["type"] == "function-call":
        # Dispatch the tool call requested by the assistant.
        name = event["functionCall"]["name"]
        args = event["functionCall"]["parameters"]
        if name == "check_order":
            result = check_order(args["order_id"])
            return jsonify({"result": result})
    elif event["type"] == "end-of-call-report":
        # Call ended — save the transcript.
        transcript = event["transcript"]
        save_transcript(event["call"]["id"], transcript)
    # Default acknowledgement (also reached for unrecognized tool names).
    return jsonify({"ok": True})
# Start an outbound phone call.
call = client.calls.create(
    assistant_id=assistant.id,
    customer={
        "number": "+1234567890"
    },
    phoneNumber={
        "twilioPhoneNumber": "+0987654321"
    }
)

# Or create a web call — returns a URL for the WebRTC connection.
web_call = client.calls.create(
    assistant_id=assistant.id,
    type="web"
)
Best-in-class transcription and synthesis
When to use : High quality voice, custom pipeline
import asyncio
from deepgram import DeepgramClient, LiveTranscriptionEvents
from elevenlabs import ElevenLabs
# Deepgram real-time transcription client.
deepgram = DeepgramClient(api_key="...")


async def transcribe_stream(audio_stream):
    """Stream audio chunks to Deepgram and react to live transcripts.

    Interim results are printed; final transcripts are forwarded to
    ``handle_user_input`` (defined elsewhere).
    """
    connection = deepgram.listen.live.v("1")

    async def on_transcript(result):
        transcript = result.channel.alternatives[0].transcript
        if transcript:
            print(f"Heard: {transcript}")
            # NOTE(review): indentation was lost in the source; the
            # is_final check is assumed to be nested under the non-empty
            # transcript check — confirm against the original example.
            if result.is_final:
                # Process the final transcript.
                await handle_user_input(transcript)

    connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
    await connection.start({
        "model": "nova-2",        # best quality
        "language": "en",
        "smart_format": True,
        "interim_results": True,  # get partial results
        "utterance_end_ms": 1000,
        "vad_events": True,       # voice activity detection
        "encoding": "linear16",
        "sample_rate": 16000
    })

    # Stream the audio in, then close cleanly.
    async for chunk in audio_stream:
        await connection.send(chunk)
    await connection.finish()
# ElevenLabs streaming synthesis client.
eleven = ElevenLabs(api_key="...")


def text_to_speech_stream(text: str):
    """Yield TTS audio chunks for *text* as they are synthesized."""
    audio_stream = eleven.text_to_speech.convert_as_stream(
        voice_id="21m00Tcm4TlvDq8ikWAM",  # Rachel
        model_id="eleven_turbo_v2_5",     # fastest model
        text=text,
        output_format="pcm_24000"         # raw PCM for low latency
    )
    yield from audio_stream
# Or use a WebSocket for the lowest latency.
async def tts_websocket(text_stream):
    """Stream text chunks to ElevenLabs over WebSocket, yielding audio.

    Sends each incoming text chunk as soon as it arrives, then flushes
    any audio still buffered on the server.
    """
    async with eleven.text_to_speech.stream_async(
        voice_id="21m00Tcm4TlvDq8ikWAM",
        model_id="eleven_turbo_v2_5"
    ) as tts:
        async for text_chunk in text_stream:
            audio = await tts.send(text_chunk)
            yield audio
        # Flush the remaining buffered audio.
        final_audio = await tts.flush()
        yield final_audio
Why bad : Adds seconds of latency. User perceives as slow. Loses conversation flow.
Instead : Stream everything:
Why bad : Frustrating user experience. Feels like talking to a machine. Wastes time.
Instead : Implement barge-in detection. Use VAD to detect user speech. Stop TTS immediately. Clear audio queue.
Why bad : May not be best quality. Single point of failure. Harder to optimize.
Instead : Mix best providers:
Works well with: langgraph, structured-output, langfuse
This skill is applicable to execute the workflow or actions described in the overview.
Weekly Installs
397
Repository
GitHub Stars
27.1K
First Seen
Jan 19, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Fail
Installed on
opencode: 325
gemini-cli: 322
claude-code: 304
codex: 278
cursor: 278
antigravity: 268
超能力技能使用指南:AI助手技能调用优先级与工作流程详解
41,800 周安装